pax_global_header00006660000000000000000000000064137143243740014522gustar00rootroot0000000000000052 comment=dae467dc8ded7bcf46f4191671b6a653c37ee43f mumax3-3.10/000077500000000000000000000000001371432437400126575ustar00rootroot00000000000000mumax3-3.10/.github/000077500000000000000000000000001371432437400142175ustar00rootroot00000000000000mumax3-3.10/.github/ISSUE_TEMPLATE000066400000000000000000000004451371432437400163300ustar00rootroot00000000000000 mumax3-3.10/.gitignore000066400000000000000000000002061371432437400146450ustar00rootroot00000000000000*.swp *.swo *.5 *.6 *.8 *.o *.a *.log *.dump *.table *.gplot *.pprof mx3 *.tar.gz mumax3.*linux*cuda* *.*~ tmp/ *.out .idea/ .vscode/ mumax3-3.10/.travis.yml000066400000000000000000000006141371432437400147710ustar00rootroot00000000000000language: go dist: xenial sudo: required install: true env: global: - GOARCH=amd64 before_install: - wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/cuda-repo-ubuntu1604_9.2.88-1_amd64.deb - sudo dpkg -i cuda-repo-ubuntu1604_9.2.88-1_amd64.deb - sudo apt-get -qq update - sudo apt-get install cuda -y --allow-unauthenticated script: - go build ./...mumax3-3.10/LICENSE000066400000000000000000000021511371432437400136630ustar00rootroot00000000000000Mumax3 GPU-accelerated micromagnetic simulator Copyright (C) 2012-2014 Arne Vansteenkiste. Contributions by Ahmad Syukri, Colin Jermain, Jonathan Leliaert, Mykola Dvornik. Mumax3 is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
Mumax3 uses svgo (http://github.com/ajstarks/svgo), copyright Anthony Starks, licensed under the Creative Commons Attribution 3.0 license as described in http://creativecommons.org/licenses/by/3.0/us/ . Mumax3 uses freetype-go (http://code.google.com/p/freetype-go/), copyright Google Inc., Jeff R. Allen, Rémy Oudompheng, Roger Peppe, licensed under the FreeType License or the GNU General Public License (GPL), version 2 or later. Mumax3 uses CUDA libraries, copyright NVIDIA. mumax3-3.10/Makefile000066400000000000000000000017661371432437400143310ustar00rootroot00000000000000 # Use the default go compiler GO_BUILDFLAGS=-compiler gc # Or uncomment the line below to use the gccgo compiler, which may # or may not be faster than gc and which may or may not compile... # GO_BUILDFLAGS=-compiler gccgo -gccgoflags '-static-libgcc -O4 -Ofast -march=native' CGO_CFLAGS_ALLOW='(-fno-schedule-insns|-malign-double|-ffast-math)' .PHONY: all cudakernels clean realclean checktests runtests hooks all: cudakernels hooks go install -v $(GO_BUILDFLAGS) github.com/mumax/3/... cudakernels: cd cuda && $(MAKE) doc: cd doc && $(MAKE) test: all go test -vet=off -i github.com/mumax/3/... go test -vet=off $(PKGS) github.com/mumax/3/... cd test && ./run.bash hooks: .git/hooks/post-commit .git/hooks/pre-commit .git/hooks/post-commit: post-commit ln -sf $(CURDIR)/$< $@ .git/hooks/pre-commit: pre-commit ln -sf $(CURDIR)/$< $@ clean: rm -frv $(GOPATH)/pkg/*/github.com/mumax/3/* rm -frv $(GOPATH)/bin/mumax3* cd cuda && $(MAKE) clean realclean: clean cd cuda && ${MAKE} realcleanmumax3-3.10/README.md000066400000000000000000000032331371432437400141370ustar00rootroot00000000000000mumax3 ====== [![Build Status](https://travis-ci.org/mumax/3.svg?branch=master)](https://travis-ci.org/mumax/3) GPU accelerated micromagnetic simulator. 
Downloads and documentation --------------------------- http://mumax.github.io Paper ----- The Design and Verification of mumax3: http://scitation.aip.org/content/aip/journal/adva/4/10/10.1063/1.4899186 Tools ----- https://godoc.org/github.com/mumax/3/cmd Building from source (for linux) -------------------- Consider downloading a pre-compiled binary. If you want to compile nevertheless: * install the nvidia proprietary driver, if not yet present. - if unsure, it's probably already there - version 440.44 recommended * install Go - https://golang.org/dl/ - set $GOPATH * install CUDA - https://developer.nvidia.com/cuda-downloads (pick default installation path) - or `sudo apt-get install nvidia-cuda-toolkit` * install a C compiler - on Ubuntu: `sudo apt-get install gcc` * if you have git installed: - `go get github.com/mumax/3/cmd/mumax3` * if you don't have git: - seriously, no git? - get the source from https://github.com/mumax/3/releases - unzip the source into $GOPATH/src/github.com/mumax/3 - `cd $GOPATH/src/github.com/mumax/3/cmd/mumax3` - `go install` * optional: install gnuplot if you want pretty graphs - on ubuntu: `sudo apt-get install gnuplot` * use the Makefile if there is a need to recompile the cuda kernels - `make realclean && make` Your binary is now at `$GOPATH/bin/mumax3` Contributing ------------ Contributions are gratefully accepted. To contribute code, fork our repo on github and send a pull request. mumax3-3.10/bench/000077500000000000000000000000001371432437400137365ustar00rootroot00000000000000mumax3-3.10/bench/bench.mx3000066400000000000000000000007471371432437400154560ustar00rootroot00000000000000msat = 800e3 aex = 13e-12 alpha = 0.01 c := 4e-9 setcellsize(c, c, c) setsolver(2) for e:=5; e<14; e++{ n := pow(2, e) setgridsize(n, n, 1) print(n, "x", n) steps(1) // warm-up kernel b_ext = vector(0, 0.01, 0) m=uniform(1, 0, 0) // warm-up dt steps(3) m=uniform(1, 0, 0) // start! 
t = 0 start := now() neval0 := Neval.get() steps(100) wall := since(start).Seconds() nevl := Neval.get() - neval0 N2 := n*n fprintln("benchmark.txt", N2, N2*nevl/wall, t/nevl) } mumax3-3.10/bench/gpus.gplot000077500000000000000000000005531371432437400157710ustar00rootroot00000000000000#! /usr/bin/gnuplot set term pdf size 4in, 3in; set output "gpus.pdf" set boxwidth 0.5 set style fill solid set key off set ylabel "throughput (M cells/s)" set xtics rotate by -90 #set xtics out offset 0,-1.2 set yrange[0:650] plot "gpus.txt" u ($0+1):($2/1e6):xtic(4) w boxes, "oommf4M.txt" u (0):(4*$1**2 * $2 /$3/1e6):xtic("OOMMF(CPU)") w boxes set output mumax3-3.10/bench/gpus.txt000066400000000000000000000044621371432437400154630ustar00rootroot000000000000004.194304e+06 2.0895753045290217e+07 1.6945210251645377e-14 "GT 650M" 1.048576e+06 4.033703669606458e+07 1.6946205941339026e-14 "GT 755M (iMac 2013)" 4.194304e+06 5.527840160261479e+07 1.6945213680693907e-14 "GTX 860M" 4.194304e+06 6.696363509497398e+07 1.6945210251645377e-14 "Tesla M2070" 4.194304e+06 6.945448262641394e+07 1.6945209171548022e-14 "Tesla 2050" 4.194304e+06 8.771628448955536e+07 1.5722384085696076e-14 "GTX 660" 4.194304e+06 9.70511185846317e+07 1.6945209171548022e-14 "Quadro K4200" 4.194304e+06 1.1248743156367055e+08 1.5722384085696076e-14 "GTX 480" 4.194304e+06 1.1436826272529118e+08 1.5722384085696076e-14 "GTX 680" 4.194304e+06 1.205878382259874e+08 1.6945213680693907e-14 "GTX 970" 4.194304e+06 1.2765247865136869e+08 1.5722384085696076e-14 "GTX 580" 4.194304e+06 1.282698958240901e+08 1.6945224358062508e-14 "GTX 1060 (mobile)" 4.194304e+06 1.3064462210481596e+08 1.6945210251645377e-14 "Tesla K20XM" 4.194304e+06 1.3222269984079185e+08 1.6945213680693907e-14 "GTX 980" 4.194304e+06 1.702974441584964e+08 1.6945224358062508e-14 "GTX 1070" 4.194304e+06 1.752769108651334e+08 1.6945209171548022e-14 "GTX TITAN BLACK FE" 4.194304e+06 1.7967269114941204e+08 1.6945210251645377e-14 "GTX TITAN" 4.194304e+06 
1.8968187897582138e+08 1.6945207552204512e-14 "GTX 1080" 4.194304e+06 1.974787604705023e+08 1.6945207552204512e-14 "Tesla M40" 4.194304e+06 1.9961744689897743e+08 1.6945207552204512e-14 "GTX 980 Ti" 4.194304e+06 2.0445838079431638e+08 1.6945210988316847e-14 "Quadro P5000" 4.194304e+06 2.3944869412188095e+08 1.6945207552204512e-14 "Tesla P40" 4.194304e+06 2.747775864824991e+08 1.694521108780564e-14 "GTX TITAN X (Pascal)" 4.194304e+06 2.846817297049518e+08 1.6945216799964015e-14 "RTX 2060" 4.194304e+06 2.7516254149838316e+08 1.6945224358062508e-14 "GTX 1080 Ti" 4.194304e+06 3.413751301174529e+08 1.6945224358062508e-14 "GTX TITAN Xp" 4.194304e+06 3.6200963642011607e+08 1.6945207552204512e-14 "Tesla P100" 4.194304e+06 3.987991717041427e+08 1.6945207552204512e-14 "Tesla P100 SXM2" 4.194304e+06 4.283830343840416e+08 1.6945210988316847e-14 "RTX 2080 Ti" 4.194304e+06 4.687789231921795e+08 1.6945210988316847e-14 "TITAN V" 4.194304e+06 5.0719408253287315e+08 1.6945216799964015e-14 "RTX 2080 Ti OC" 4.194304e+06 6.118821270070031e+08 1.6945207552204512e-14 "Tesla V100" mumax3-3.10/bench/oommf4M.txt000066400000000000000000000000161371432437400160120ustar00rootroot000000000000002048 3 35 mumax3-3.10/cmd/000077500000000000000000000000001371432437400134225ustar00rootroot00000000000000mumax3-3.10/cmd/gccgorun000077500000000000000000000003531371432437400151600ustar00rootroot00000000000000#! /bin/bash # wrapper for "go run" using gccgo with flags for speed. 
echo go run -compiler=gccgo -gccgoflags '-static-libgcc -O4 -Ofast -march=native' $@ go run -compiler=gccgo -gccgoflags '-static-libgcc -O4 -Ofast -march=native' $@ mumax3-3.10/cmd/mumax3-convert/000077500000000000000000000000001371432437400163125ustar00rootroot00000000000000mumax3-3.10/cmd/mumax3-convert/.gitignore000066400000000000000000000000241371432437400202760ustar00rootroot00000000000000main mumax3-convert mumax3-3.10/cmd/mumax3-convert/Makefile000066400000000000000000000000241371432437400177460ustar00rootroot00000000000000all: go install -v mumax3-3.10/cmd/mumax3-convert/csv.go000066400000000000000000000006711371432437400174400ustar00rootroot00000000000000package main import ( "fmt" "io" "github.com/mumax/3/data" ) // comma-separated values func dumpCSV(f *data.Slice, info data.Meta, out io.Writer) { f2 := ", " + *flag_format a := f.Tensors() for _, a := range a { for _, a := range a { for _, a := range a { fmt.Fprintf(out, *flag_format, a[0]) for i := 1; i < len(a); i++ { fmt.Fprintf(out, f2, a[i]) } fmt.Fprintln(out) } fmt.Fprintln(out) } } } mumax3-3.10/cmd/mumax3-convert/gnuplot.go000066400000000000000000000015621371432437400203350ustar00rootroot00000000000000package main // Output for gnuplot's "splot" import ( "bufio" "fmt" "io" "github.com/mumax/3/data" ) const DELIM = "\t" func dumpGnuplot(f *data.Slice, m data.Meta, out io.Writer) { buf := bufio.NewWriter(out) defer buf.Flush() data := f.Tensors() cellsize := m.CellSize // If no cell size is set, use generic cell index. 
if cellsize == [3]float64{0, 0, 0} { cellsize = [3]float64{1, 1, 1} } ncomp := f.NComp() for iz := range data[0] { z := float64(iz) * cellsize[Z] for iy := range data[0][iz] { y := float64(iy) * cellsize[Y] for ix := range data[0][iz][iy] { x := float64(ix) * cellsize[X] fmt.Fprint(buf, x, DELIM, y, DELIM, z, DELIM) for c := 0; c < ncomp-1; c++ { fmt.Fprint(buf, data[c][iz][iy][ix], DELIM) } fmt.Fprint(buf, data[ncomp-1][iz][iy][ix]) fmt.Fprint(buf, "\n") } fmt.Fprint(buf, "\n") } } } mumax3-3.10/cmd/mumax3-convert/json.go000066400000000000000000000002771371432437400176200ustar00rootroot00000000000000package main import ( "encoding/json" "io" "github.com/mumax/3/data" ) func dumpJSON(f *data.Slice, info data.Meta, out io.Writer) { w := json.NewEncoder(out) w.Encode(f.Tensors()) } mumax3-3.10/cmd/mumax3-convert/main.go000066400000000000000000000301161371432437400175660ustar00rootroot00000000000000/* mumax3-convert converts mumax3 output files to various formats and images. It also provides basic manipulations like data rescale etc. Usage Command-line flags must always preceed the input files: mumax3-convert [flags] files For a overview of flags, run: mumax3-convert -help Example: convert all .ovf files to PNG: mumax3-convert -png *.ovf For scalar data, the color scale is automatically stretched to cover the all values. The values corresponding to minimum and maximum color can be overridden by the -min and -max flags. Values falling outside of this range will be clipped. E.g.: mumax3-convert -png -min=0 -max=1 file.ovf. The default scalar color map is black,gray,white (minimum value maps to black, maximum to white). This can be overridden by -color. 
E.g., a rather colorful map: mumax3-convert -png -color black,blue,cyan,green,yellow,red,white file.ovf Example: resize data to a 32 x 32 x 1 mesh, normalize vectors to unit length and convert the result to OOMMF binary output: mumax3-convert -resize 32x32x1 -normalize -ovf binary file.ovf Example: convert all .ovf files to VTK binary saving only the X component. Also output to JPEG in the meanwhile: mumax3-convert -comp 0 -vtk binary -jpg *.ovf Example: convert legacy .dump files to .ovf: mumax3-convert -ovf2 *.dump Example: cut out a piece of the data between min:max. max is exclusive bound. bounds can be omitted, default to 0 lower bound or maximum upper bound mumax3-convert -xrange 50:100 -yrange :100 file.ovf Example: select the bottom layer mumax3-convert -zrange :1 file.ovf Output file names are automatically assigned. */ package main import ( "compress/gzip" "flag" "fmt" "image/color" "io" "log" "os" "path" "path/filepath" "strconv" "strings" "github.com/mumax/3/data" "github.com/mumax/3/draw" "github.com/mumax/3/dump" "github.com/mumax/3/httpfs" "github.com/mumax/3/oommf" "github.com/mumax/3/util" ) var ( flag_comp = flag.String("comp", "", "Select a component of vector data. 
(0,1,2 or x,y,z)") flag_show = flag.Bool("show", false, "Human-readible output to stdout") flag_format = flag.String("f", "%v", "Printf format string") flag_png = flag.Bool("png", false, "PNG output") flag_jpeg = flag.Bool("jpg", false, "JPEG output") flag_gif = flag.Bool("gif", false, "GIF output") flag_svg = flag.Bool("svg", false, "SVG output") flag_svgz = flag.Bool("svgz", false, "SVGZ output (compressed)") flag_gnuplot = flag.Bool("gplot", false, "Gnuplot-compatible output") flag_ovf1 = flag.String("ovf", "", `"text" or "binary" OVF1 output`) flag_omf = flag.String("omf", "", `"text" or "binary" OVF1 output`) flag_ovf2 = flag.String("ovf2", "", `"text" or "binary" OVF2 output`) flag_vtk = flag.String("vtk", "", `"ascii" or "binary" VTK output`) flag_dump = flag.Bool("dump", false, `output in dump format`) flag_csv = flag.Bool("csv", false, `output in CSV format`) flag_numpy = flag.Bool("numpy", false, "Numpy output") flag_json = flag.Bool("json", false, `output in JSON format`) flag_min = flag.String("min", "auto", `Minimum of color scale: "auto" or value.`) flag_max = flag.String("max", "auto", `Maximum of color scale: "auto" or value.`) flag_normalize = flag.Bool("normalize", false, `Normalize vector data to unit length`) flag_normpeak = flag.Bool("normpeak", false, `Scale vector data, maximum to unit length`) flag_resize = flag.String("resize", "", "Resize. 
E.g.: 128x128x4") flag_cropx = flag.String("xrange", "", "Crop x range min:max (both optional, max=exclusive)") flag_cropy = flag.String("yrange", "", "Crop y range min:max (both optional, max=exclusive)") flag_cropz = flag.String("zrange", "", "Crop z range min:max (both optional, max=exclusive)") flag_dir = flag.String("o", "", "Save all output in this directory") flag_arrows = flag.Int("arrows", 0, "Arrow size for vector bitmap image output") flag_color = flag.String("color", "black,gray,white", "Colormap for scalar image output.") ) var ( colormap []draw.ColorMapSpec ) type task struct { *data.Slice info data.Meta fname string } func main() { log.SetFlags(0) flag.Parse() if flag.NArg() == 0 { log.Fatal("no input files") } colormap = make([]draw.ColorMapSpec, 1, 1) colormap[0].Cmap = parseColors(*flag_color) // politely try to make the output directory if *flag_dir != "" { _ = os.Mkdir(*flag_dir, 0777) } // determine which outputs we want var wantOut []output for flag, out := range outputs { if *flag { wantOut = append(wantOut, out) } } switch { case *flag_ovf1 != "": wantOut = append(wantOut, output{".ovf", outputOVF1}) case *flag_omf != "": wantOut = append(wantOut, output{".omf", outputOMF}) case *flag_ovf2 != "": wantOut = append(wantOut, output{".ovf", outputOVF2}) case *flag_vtk != "": wantOut = append(wantOut, output{".vts", outputVTK}) } if len(wantOut) == 0 && *flag_show == false { log.Fatal("no output format specified (e.g.: -png)") } // expand wildcards which are not expanded by the shell // (pointing a finger at cmd.exe) var fnames []string for _, input := range flag.Args() { fmt.Println(input) expanded, _ := filepath.Glob(input) fnames = append(fnames, expanded...) 
} // read all input files and put them in the task que for _, fname := range fnames { for _, outp := range wantOut { fname := fname // closure caveats outp := outp Queue(func() { doFile(fname, outp) }) } } // wait for work to finish Wait() fmt.Println(succeeded, "files converted, ", skipped, "skipped, ", failed, "failed") if failed > 0 { os.Exit(1) } } var ( failed, skipped, succeeded util.Atom ) func doFile(infname string, outp output) { // determine output file outfname := util.NoExt(infname) + outp.Ext if *flag_dir != "" { outfname = filepath.Join(*flag_dir, filepath.Base(outfname)) } msg := infname + "\t-> " + outfname defer func() { log.Println(msg) }() if infname == outfname { msg = fail(msg, "input and output file are the same") return } defer func() { if err := recover(); err != nil { msg = fail(msg, err) os.Remove(outfname) } }() if !(strings.HasPrefix(infname, "http://") || strings.HasPrefix(outfname, "http://")) { inStat, errS := os.Stat(infname) if errS != nil { panic(errS) } outStat, errO := os.Stat(outfname) if errO == nil && outStat.ModTime().Sub(inStat.ModTime()) > 0 { msg = "[skip] " + msg + ": skipped based on time stamps" skipped.Add(1) return } } var slice *data.Slice var info data.Meta var err error in, errI := httpfs.Open(infname) if errI != nil { msg = fail(msg, errI) return } defer in.Close() switch path.Ext(infname) { default: msg = fail(msg, ": skipping unsupported type: "+path.Ext(infname)) return case ".ovf", ".omf", ".ovf2": slice, info, err = oommf.Read(in) case ".dump": slice, info, err = dump.Read(in) } if err != nil { msg = fail(msg, err) return } out, err := httpfs.Create(outfname) if err != nil { msg = fail(msg, err) return } defer out.Close() preprocess(slice) outp.Convert(slice, info, panicWriter{out}) succeeded.Add(1) msg = "[ ok ] " + msg } func fail(msg string, x ...interface{}) string { failed.Add(1) return "[fail] " + msg + ": " + fmt.Sprint(x...) 
} // writer that panics on error, so we don't have to check it type panicWriter struct { io.Writer } func (w panicWriter) Write(p []byte) (int, error) { n, err := w.Writer.Write(p) if err != nil { panic(err) } return n, nil } type output struct { Ext string Convert func(*data.Slice, data.Meta, io.Writer) } var outputs = map[*bool]output{ flag_png: {".png", renderPNG}, flag_jpeg: {".jpg", renderJPG}, flag_gif: {".gif", renderGIF}, flag_svg: {".svg", renderSVG}, flag_svgz: {".svgz", renderSVGZ}, flag_gnuplot: {".gplot", dumpGnuplot}, flag_dump: {".dump", outputDUMP}, flag_csv: {".csv", dumpCSV}, flag_numpy: {".npy", dumpNUMPY}, flag_json: {".json", dumpJSON}, flag_show: {"", show}, } func renderPNG(f *data.Slice, info data.Meta, out io.Writer) { draw.RenderFormat(out, f, *flag_min, *flag_max, *flag_arrows, ".png", colormap...) } func renderJPG(f *data.Slice, info data.Meta, out io.Writer) { draw.RenderFormat(out, f, *flag_min, *flag_max, *flag_arrows, ".jpg", colormap...) } func renderGIF(f *data.Slice, info data.Meta, out io.Writer) { draw.RenderFormat(out, f, *flag_min, *flag_max, *flag_arrows, ".gif", colormap...) 
} func renderSVG(f *data.Slice, info data.Meta, out io.Writer) { draw.SVG(out, f.Vectors()) } func renderSVGZ(f *data.Slice, info data.Meta, out io.Writer) { out2 := gzip.NewWriter(out) defer out2.Close() draw.SVG(out2, f.Vectors()) } func outputOVF1(f *data.Slice, info data.Meta, out io.Writer) { oommf.WriteOVF1(out, f, info, *flag_ovf1) } func outputOMF(f *data.Slice, info data.Meta, out io.Writer) { oommf.WriteOVF1(out, f, info, *flag_omf) } func outputOVF2(f *data.Slice, info data.Meta, out io.Writer) { oommf.WriteOVF2(out, f, info, *flag_ovf2) } func outputVTK(f *data.Slice, info data.Meta, out io.Writer) { dumpVTK(out, f, info, *flag_vtk) } func outputDUMP(f *data.Slice, info data.Meta, out io.Writer) { dump.Write(out, f, info) } // does not output to out, just prints to stdout func show(f *data.Slice, info data.Meta, out io.Writer) { fmt.Println(info) util.Fprintf(os.Stdout, *flag_format, f.Tensors()) } func preprocess(f *data.Slice) { if *flag_normalize { normalize(f, 1) } if *flag_normpeak { normpeak(f) } colormap[0].Ccomp = -1 if *flag_comp != "" { c := parseComp(*flag_comp) colormap[0].Ccomp = c if *flag_arrows == 0 { *f = *f.Comp(c) } } crop(f) if *flag_resize != "" { resize(f, *flag_resize) } } func parseComp(c string) int { if i, err := strconv.Atoi(c); err == nil { return i } switch c { default: log.Fatal("illegal component:", c, "(need x, y or z)") panic(0) case "x", "X": return 0 case "y", "Y": return 1 case "z", "Z": return 2 } } func crop(f *data.Slice) { N := f.Size() // default ranges x1, x2 := 0, N[X] y1, y2 := 0, N[Y] z1, z2 := 0, N[Z] havework := false if *flag_cropz != "" { z1, z2 = parseRange(*flag_cropz, N[Z]) havework = true } if *flag_cropy != "" { y1, y2 = parseRange(*flag_cropy, N[Y]) havework = true } if *flag_cropx != "" { x1, x2 = parseRange(*flag_cropx, N[X]) havework = true } if havework { *f = *data.Crop(f, x1, x2, y1, y2, z1, z2) } } func parseRange(r string, max int) (int, int) { a, b := 0, max spl := strings.Split(r, ":") if 
len(spl) != 2 { log.Fatal("range needs min:max syntax, have:", r) } if spl[0] != "" { a = atoi(spl[0]) } if spl[1] != "" { b = atoi(spl[1]) } return a, b } func atoi(a string) int { i, err := strconv.Atoi(a) if err != nil { panic(err) } return i } const ( X = data.X Y = data.Y Z = data.Z ) func parseColors(s string) (m []color.RGBA) { words := strings.Split(s, ",") for _, w := range words { m = append(m, parseColor(w)) } return } func parseColor(s string) color.RGBA { if c, ok := colors[s]; ok { return c } fmt.Println("refusing to use ugly color '" + s + "', options are:") for k, _ := range colors { fmt.Println(k) } log.Fatal("illegal color") return color.RGBA{} } var colors = map[string]color.RGBA{ "white": color.RGBA{R: 255, G: 255, B: 255, A: 255}, "black": color.RGBA{R: 0, G: 0, B: 0, A: 255}, "transparent": color.RGBA{R: 0, G: 0, B: 0, A: 0}, "red": color.RGBA{R: 255, G: 0, B: 0, A: 255}, "green": color.RGBA{R: 0, G: 255, B: 0, A: 255}, "blue": color.RGBA{R: 0, G: 0, B: 255, A: 255}, "lightred": color.RGBA{R: 255, G: 127, B: 127, A: 255}, "lightgreen": color.RGBA{R: 127, G: 255, B: 127, A: 255}, "lightblue": color.RGBA{R: 127, G: 127, B: 255, A: 255}, "yellow": color.RGBA{R: 255, G: 255, B: 0, A: 255}, "darkyellow": color.RGBA{R: 127, G: 127, B: 0, A: 255}, "cyan": color.RGBA{R: 0, G: 255, B: 255, A: 255}, "darkcyan": color.RGBA{R: 0, G: 127, B: 127, A: 255}, "magenta": color.RGBA{R: 255, G: 0, B: 255, A: 255}, "darkmagenta": color.RGBA{R: 127, G: 0, B: 127, A: 255}, "gray": color.RGBA{R: 127, G: 127, B: 127, A: 255}, } mumax3-3.10/cmd/mumax3-convert/normalize.go000066400000000000000000000021641371432437400206440ustar00rootroot00000000000000package main import ( "math" "github.com/mumax/3/data" ) // normalize vector data to given length func normalize(f *data.Slice, length float64) { a := f.Vectors() for i := range a[0] { for j := range a[0][i] { for k := range a[0][i][j] { x, y, z := a[0][i][j][k], a[1][i][j][k], a[2][i][j][k] norm := math.Sqrt(float64(x*x + 
y*y + z*z)) invnorm := float32(1) if norm != 0 { invnorm = float32(length / norm) } a[0][i][j][k] *= invnorm a[1][i][j][k] *= invnorm a[2][i][j][k] *= invnorm } } } } func normpeak(f *data.Slice) { a := f.Vectors() maxnorm := 0. for i := range a[0] { for j := range a[0][i] { for k := range a[0][i][j] { x, y, z := a[0][i][j][k], a[1][i][j][k], a[2][i][j][k] norm := math.Sqrt(float64(x*x + y*y + z*z)) if norm > maxnorm { maxnorm = norm } } } } scale(f, float32(1/maxnorm)) } func scale(f *data.Slice, factor float32) { a := f.Vectors() for i := range a[0] { for j := range a[0][i] { for k := range a[0][i][j] { a[0][i][j][k] *= factor a[1][i][j][k] *= factor a[2][i][j][k] *= factor } } } } mumax3-3.10/cmd/mumax3-convert/numpy.go000066400000000000000000000020511371432437400200070ustar00rootroot00000000000000package main import ( "encoding/binary" "fmt" "github.com/mumax/3/data" "io" ) func dumpNUMPY(f *data.Slice, info data.Meta, out io.Writer) { // see npy format: https://www.numpy.org/devdocs/reference/generated/numpy.lib.format.html // write the first 10 bytes of the 128 byte header fmt.Fprintf(out, "\x93NUMPY") // magic string fmt.Fprintf(out, "\x01\x00") // npy format version binary.Write(out, binary.LittleEndian, uint16(118)) // length of the actual header data (128-10) // write the actual header data (118 bytes) shapestr := fmt.Sprintf("(%d,%d,%d,%d)", f.NComp(), f.Size()[2], f.Size()[1], f.Size()[0]) headerData := fmt.Sprintf("{'descr': '") _, err = fmt.Fprintln(out, "") _, err = fmt.Fprintf(out, "\t\n", gridsize[0]-1, gridsize[1]-1, gridsize[2]-1) _, err = fmt.Fprintf(out, "\t\t\n", gridsize[0]-1, gridsize[1]-1, gridsize[2]-1) return } func writeVTKPoints(out io.Writer, q *data.Slice, dataformat string, info data.Meta) (err error) { _, err = fmt.Fprintln(out, "\t\t\t") fmt.Fprintf(out, "\t\t\t\t\n\t\t\t\t\t", dataformat) gridsize := q.Size() cellsize := info.CellSize switch dataformat { case "ascii": for k := 0; k < gridsize[2]; k++ { for j := 0; j < gridsize[1]; 
j++ { for i := 0; i < gridsize[0]; i++ { x := (float32)(i) * (float32)(cellsize[0]) y := (float32)(j) * (float32)(cellsize[1]) z := (float32)(k) * (float32)(cellsize[2]) _, err = fmt.Fprint(out, x, " ", y, " ", z, " ") } } } case "binary": buffer := new(bytes.Buffer) for k := 0; k < gridsize[2]; k++ { for j := 0; j < gridsize[1]; j++ { for i := 0; i < gridsize[0]; i++ { x := (float32)(i) * (float32)(cellsize[0]) y := (float32)(j) * (float32)(cellsize[1]) z := (float32)(k) * (float32)(cellsize[2]) binary.Write(buffer, binary.LittleEndian, x) binary.Write(buffer, binary.LittleEndian, y) binary.Write(buffer, binary.LittleEndian, z) } } } b64len := uint32(len(buffer.Bytes())) bufLen := new(bytes.Buffer) binary.Write(bufLen, binary.LittleEndian, b64len) base64out := base64.NewEncoder(base64.StdEncoding, out) base64out.Write(bufLen.Bytes()) base64out.Write(buffer.Bytes()) base64out.Close() default: log.Fatalf("Illegal VTK data format: %v. Options are: ascii, binary", dataformat) } _, err = fmt.Fprintln(out, "\n\t\t\t\t") _, err = fmt.Fprintln(out, "\t\t\t") return } func writeVTKCellData(out io.Writer, q *data.Slice, meta data.Meta, dataformat string) (err error) { N := q.NComp() data := q.Tensors() switch N { case 1: fmt.Fprintf(out, "\t\t\t\n", meta.Name) fmt.Fprintf(out, "\t\t\t\t\n\t\t\t\t\t", meta.Name, N, dataformat) case 3: fmt.Fprintf(out, "\t\t\t\n", meta.Name) fmt.Fprintf(out, "\t\t\t\t\n\t\t\t\t\t", meta.Name, N, dataformat) case 6, 9: fmt.Fprintf(out, "\t\t\t\n", meta.Name) fmt.Fprintf(out, "\t\t\t\t\n\t\t\t\t\t", meta.Name, 9, dataformat) // must be 9! 
default: log.Fatalf("vtk: cannot handle %v components", N) } gridsize := q.Size() switch dataformat { case "ascii": for k := 0; k < gridsize[2]; k++ { for j := 0; j < gridsize[1]; j++ { for i := 0; i < gridsize[0]; i++ { // if symmetric tensor manage it appart to write the full 9 components if N == 6 { fmt.Fprint(out, data[0][k][j][i], " ") fmt.Fprint(out, data[1][k][j][i], " ") fmt.Fprint(out, data[2][k][j][i], " ") fmt.Fprint(out, data[1][k][j][i], " ") fmt.Fprint(out, data[3][k][j][i], " ") fmt.Fprint(out, data[4][k][j][i], " ") fmt.Fprint(out, data[2][k][j][i], " ") fmt.Fprint(out, data[4][k][j][i], " ") fmt.Fprint(out, data[5][k][j][i], " ") } else { for c := 0; c < N; c++ { fmt.Fprint(out, data[c][k][j][i], " ") } } } } } case "binary": // Inlined for performance, terabytes of data will pass here... buffer := new(bytes.Buffer) for k := 0; k < gridsize[2]; k++ { for j := 0; j < gridsize[1]; j++ { for i := 0; i < gridsize[0]; i++ { // if symmetric tensor manage it appart to write the full 9 components if N == 6 { binary.Write(buffer, binary.LittleEndian, data[0][k][j][i]) binary.Write(buffer, binary.LittleEndian, data[1][k][j][i]) binary.Write(buffer, binary.LittleEndian, data[2][k][j][i]) binary.Write(buffer, binary.LittleEndian, data[1][k][j][i]) binary.Write(buffer, binary.LittleEndian, data[3][k][j][i]) binary.Write(buffer, binary.LittleEndian, data[4][k][j][i]) binary.Write(buffer, binary.LittleEndian, data[2][k][j][i]) binary.Write(buffer, binary.LittleEndian, data[4][k][j][i]) binary.Write(buffer, binary.LittleEndian, data[5][k][j][i]) } else { for c := 0; c < N; c++ { binary.Write(buffer, binary.LittleEndian, data[c][k][j][i]) } } } } } b64len := uint32(len(buffer.Bytes())) bufLen := new(bytes.Buffer) binary.Write(bufLen, binary.LittleEndian, b64len) base64out := base64.NewEncoder(base64.StdEncoding, out) base64out.Write(bufLen.Bytes()) base64out.Write(buffer.Bytes()) base64out.Close() default: panic(fmt.Errorf("vtk: illegal data format " + dataformat + 
". Options are: ascii, binary")) } fmt.Fprintln(out, "\n\t\t\t\t") fmt.Fprintln(out, "\t\t\t") return } func writeVTKFooter(out io.Writer) (err error) { _, err = fmt.Fprintln(out, "\t\t") _, err = fmt.Fprintln(out, "\t") _, err = fmt.Fprintln(out, "") return } mumax3-3.10/cmd/mumax3-httpfsd/000077500000000000000000000000001371432437400163065ustar00rootroot00000000000000mumax3-3.10/cmd/mumax3-httpfsd/Makefile000066400000000000000000000000211371432437400177370ustar00rootroot00000000000000all: go install mumax3-3.10/cmd/mumax3-httpfsd/main.go000066400000000000000000000013421371432437400175610ustar00rootroot00000000000000/* httpfs server, useful for debugging mumax3-server. Usage Start mumax3-httpfsd in a certain working directory. $ ls file.mx3 $ mumax3-server -l :35362 Then you can remotely run mumax3 input files: $ cd elsewhere $ mumax3 http://localhost:35362/file.mx3 */ package main import ( "flag" "log" "net/http" _ "net/http/pprof" "github.com/mumax/3/httpfs" ) var ( flag_addr = flag.String("l", ":35360", "Listen and serve at this network address") flag_log = flag.Bool("log", false, "log debug output") ) func main() { flag.Parse() log.Println("serving at", *flag_addr) httpfs.Logging = *flag_log httpfs.RegisterHandlers() err := http.ListenAndServe(*flag_addr, nil) if err != nil { log.Fatal(err) } } mumax3-3.10/cmd/mumax3-plot/000077500000000000000000000000001371432437400156105ustar00rootroot00000000000000mumax3-3.10/cmd/mumax3-plot/.gitignore000066400000000000000000000000141371432437400175730ustar00rootroot00000000000000mumax3-plot mumax3-3.10/cmd/mumax3-plot/Makefile000066400000000000000000000000241371432437400172440ustar00rootroot00000000000000all: go install -v mumax3-3.10/cmd/mumax3-plot/main.go000066400000000000000000000044451371432437400170720ustar00rootroot00000000000000/* The mumax3-plot utility uses gnuplot to automatically plot mumax3 data tables. mumax3-plot table.txt Creates graphs of all columns as .svg files. 
*/ package main import ( "bufio" "flag" "fmt" "log" "os" "os/exec" "path" "strings" ) func main() { log.SetFlags(0) flag.Parse() for _, f := range flag.Args() { plotFile(f) } } func plotFile(fname string) { hdr := readHeader(fname) // quantities grouped by vector Qs := []*Q{&Q{[]string{"t"}, "s", []int{1}}} prev := Qs[0] quants := strings.Split(hdr, "\t") for i := 1; i < len(quants); i++ { spl := strings.Split(quants[i], " ") name := spl[0] unit := spl[1] if unit == "()" { unit = "" } if name[:len(name)-1] == prev.name[0][:len(prev.name[0])-1] { prev.cols = append(prev.cols, i+1) prev.name = append(prev.name, name) } else { n := &Q{[]string{name}, unit, []int{i + 1}} Qs = append(Qs, n) prev = n } } log.Println(Qs) for i := 1; i < len(Qs); i++ { makePlot(fname, Qs[i]) } } func makePlot(fname string, q *Q) { term := "svg" outf := path.Dir(fname) + "/" + q.vecname() cmd := fmt.Sprintf(`set term %v noenhanced size 400 300 font 'Arial,10'; set output "%v.%v";`, term, outf, term) cmd += fmt.Sprintf(`set xlabel "t(ns)";`) cmd += fmt.Sprintf(`set ylabel "%v %v";`, q.vecname(), q.unit) cmd += fmt.Sprint(`set format y "%g";`) cmd += fmt.Sprint(`plot "`, fname, `" u ($1*1e9):`, q.cols[0], ` w li title "`, q.name[0], `"`) for i := 1; i < len(q.cols); i++ { cmd += fmt.Sprint(`, "`, fname, `" u ($1*1e9):`, q.cols[i], ` w li title "`, q.name[i], `"`) } cmd += "; set output;" out, err := exec.Command("gnuplot", "-e", cmd).CombinedOutput() os.Stderr.Write(out) check(err) } type Q struct { name []string unit string cols []int } func (q *Q) String() string { return fmt.Sprint(q.name, "(", q.unit, ")", q.cols) } func (q *Q) vecname() string { if len(q.cols) > 1 { return q.name[0][:len(q.name[0])-1] } else { return q.name[0] } } func readHeader(fname string) string { f, err := os.Open(fname) check(err) defer f.Close() in := bufio.NewReader(f) hdrBytes, _, err2 := in.ReadLine() check(err2) hdr := string(hdrBytes) if hdr[0] != '#' { log.Fatal("invalid table header:", hdr) } hdr = hdr[2:] 
return hdr } func check(err error) { if err != nil { log.Fatal(err) } } mumax3-3.10/cmd/mumax3-script/000077500000000000000000000000001371432437400161365ustar00rootroot00000000000000mumax3-3.10/cmd/mumax3-script/.gitignore000066400000000000000000000000161371432437400201230ustar00rootroot00000000000000mumax3-script mumax3-3.10/cmd/mumax3-script/Makefile000066400000000000000000000000241371432437400175720ustar00rootroot00000000000000all: go install -v mumax3-3.10/cmd/mumax3-script/main.go000066400000000000000000000022321371432437400174100ustar00rootroot00000000000000/* Toy interpreter executes scripts or stdin. */ package main import ( "bufio" "flag" "fmt" "github.com/mumax/3/script" "io" "log" "os" ) var debug = flag.Bool("g", false, "print debug output") var ( world *script.World ps1 string ) func main() { log.SetFlags(0) flag.Parse() world = script.NewWorld() world.Func("exit", exit) script.Debug = *debug if flag.NArg() > 1 { check(fmt.Errorf("need 0 or 1 input files")) } if flag.NArg() == 1 { src, err := os.Open(flag.Arg(0)) check(err) ps1 = ">" interpret(src) } else { ps1 = "" interpret(os.Stdin) } } func interpret(in io.Reader) { scanner := bufio.NewScanner(in) for scanner.Scan() { safecall(scanner.Text()) } check(scanner.Err()) } func safecall(code string) { if code == "" { return } defer func() { err := recover() if err != nil { fmt.Fprintln(os.Stderr, "panic:", err) } }() tree, err := world.Compile(code) if err == nil { for _, stmt := range tree.Child() { fmt.Println(stmt.Eval()) } } else { fmt.Fprintln(os.Stderr, err) } } func check(e error) { if e != nil { fmt.Fprintln(os.Stderr, e) os.Exit(1) } } func exit() { os.Exit(0) } mumax3-3.10/cmd/mumax3-script/mumax3-int000077500000000000000000000000551371432437400200660ustar00rootroot00000000000000#! 
/bin/bash rlwrap -m -S '> ' mumax3-script mumax3-3.10/cmd/mumax3-server/000077500000000000000000000000001371432437400161405ustar00rootroot00000000000000mumax3-3.10/cmd/mumax3-server/Makefile000066400000000000000000000000211371432437400175710ustar00rootroot00000000000000all: go install mumax3-3.10/cmd/mumax3-server/compute.go000066400000000000000000000142241371432437400201460ustar00rootroot00000000000000package main /* Compute service runs jobs on this node's GPUs, if any. */ import ( "fmt" "io" "log" "os/exec" "strings" "time" "github.com/mumax/3/httpfs" "github.com/mumax/3/util" ) var ( MumaxVersion string GPUs []string Processes = make(map[string]*Process) // job id -> process ) // Process is a running simulation process type Process struct { *exec.Cmd Start time.Time Out io.WriteCloser ID string OutputURL string GUI string Killed bool } func (p *Process) Host() string { return JobHost(p.OutputURL) } // Runs a compute service on this node, if GPUs are available. // The compute service asks storage nodes for a job, runs it, // saves results over httpfs and notifies storage when ready. 
func RunComputeService() { if len(GPUs) == 0 { return } // queue of available GPU numbers idle := make(chan int, len(GPUs)) for i := range GPUs { idle <- i } for { gpu := <-idle // take an available GPU GUIAddr := fmt.Sprint(thisHost+":", GUI_PORT+gpu) ID := WaitForJob() // take an available job go func() { defer func() { // remove from "running" list WLock() delete(Processes, ID) WUnlock() // add GPU number back to idle stack idle <- gpu }() p := NewProcess(ID, gpu, GUIAddr) if p == nil { return } WLock() Processes[ID] = p WUnlock() p.Run() _, err := RPCCall(JobHost(ID), "UpdateJob", ID) if err != nil { log.Println(err) } }() } } func WaitForJob() string { ID := FindJob() for ID == "" { time.Sleep(2 * time.Second) // TODO: don't poll ID = FindJob() } return ID } func FindJob() string { // quickly list peers first RLock() p := make([]string, 0, len(peers)) for addr, _ := range peers { p = append(p, addr) } RUnlock() // TODO: pick peers fairly // then do slow RPC calls without blocking the rest of the program for _, addr := range p { ID, _ := RPCCall(addr, "GiveJob", thisAddr) if ID != "" { return ID } } return "" } // RPC-callable function kills job corresponding to given job id. // The job has to be running on this node. 
func Kill(id string) string { log.Println("KILL", id) WLock() // modifies Cmd state defer WUnlock() job := Processes[id] if job == nil { return fmt.Sprintf("kill %v: job not running.", id) } job.Killed = true err := job.Cmd.Process.Kill() if err != nil { return err.Error() } return "" // OK } // prepare exec.Cmd to run mumax3 compute process func NewProcess(ID string, gpu int, webAddr string) *Process { // prepare command inputURL := "http://" + ID command := *flag_mumax gpuFlag := fmt.Sprint(`-gpu=`, gpu) httpFlag := fmt.Sprint(`-http=`, webAddr) cacheFlag := fmt.Sprint(`-cache=`, *flag_cachedir) forceFlag := `-f=0` cmd := exec.Command(command, gpuFlag, httpFlag, cacheFlag, forceFlag, inputURL) // Pipe stdout, stderr to log file over httpfs outDir := util.NoExt(inputURL) + ".out" errMkdir := httpfs.Mkdir(outDir) if errMkdir != nil { SetJobError(ID, errMkdir) log.Println("makeProcess", errMkdir) j := JobByName(ID) if j != nil { j.Reque() } return nil } out, errD := httpfs.Create(outDir + "/stdout.txt") if errD != nil { SetJobError(ID, errD) log.Println("makeProcess", errD) j := JobByName(ID) if j != nil { j.Reque() } return nil } cmd.Stderr = out cmd.Stdout = out return &Process{ID: ID, Cmd: cmd, Start: time.Now(), Out: out, OutputURL: OutputDir(inputURL), GUI: webAddr} } func (p *Process) Run() { log.Println("=> exec ", p.Path, p.Args) defer p.Out.Close() httpfs.Put(p.OutputURL+"host", []byte(thisAddr)) startTime := AskTime(p.Host()) httpfs.Put(p.OutputURL+"start", []byte(startTime.Format(time.UnixDate))) WLock() // Cmd.Start() modifies state err1 := p.Cmd.Start() // err? 
WUnlock() if err1 != nil { SetJobError(p.ID, err1) } timeOffset := time.Now().Sub(startTime) // our clock is most likely out-of-sync with host tick := time.NewTicker(KeepaliveInterval) // need initial alive in case watchdog sniffs between start and first alive tick httpfs.Put(p.OutputURL+"alive", []byte(time.Now().Add(timeOffset).Format(time.UnixDate))) go func() { for t := range tick.C { httpfs.Put(p.OutputURL+"alive", []byte(t.Add(timeOffset).Format(time.UnixDate))) } }() err2 := p.Cmd.Wait() if err1 == nil && err2 != nil { SetJobError(p.ID, err2) } tick.Stop() status := -1 // TODO: determine proper status number if err1 != nil || err2 != nil { log.Println(p.Path, p.Args, err1, err2) status = 1 } else { status = 0 } if p.Killed { httpfs.Put(p.OutputURL+"killed", []byte(time.Now().Format(time.UnixDate))) } else { httpfs.Put(p.OutputURL+"exitstatus", []byte(fmt.Sprint(status))) } stopTime := AskTime(p.Host()) nanos := stopTime.Sub(startTime).Nanoseconds() httpfs.Put(p.OutputURL+"duration", []byte(fmt.Sprint(nanos))) if status == 0 { ret, err := RPCCall(p.Host(), "AddFairShare", JobUser(p.ID)+"/"+fmt.Sprint(nanos/1e9)) if err != nil || ret != "" { log.Println("***ERR: AddFairShare", JobUser(p.ID), ret, err) } } return } func (p *Process) Duration() time.Duration { return Since(time.Now(), p.Start) } func DetectGPUs() { if GPUs != nil { panic("multiple DetectGPUs() calls") } for i := 0; i < MAXGPU; i++ { gpuflag := fmt.Sprint("-gpu=", i) out, err := exec.Command(*flag_mumax, "-test", gpuflag).Output() if err == nil { info := string(out) if strings.HasSuffix(info, "\n") { info = info[:len(info)-1] } log.Println("gpu", i, ":", info) GPUs = append(GPUs, info) } } } func DetectMumax() { out, err := exec.Command(*flag_mumax, "-test", "-v").CombinedOutput() info := string(out) if err == nil { split := strings.SplitN(info, "\n", 2) version := split[0] log.Println("have", version) MumaxVersion = version } else { MumaxVersion = fmt.Sprint(*flag_mumax, "-test", ": ", err, 
info) } } // RPC-callable function, answers by this node's time func WhatsTheTime(string) string { return time.Now().Format(time.UnixDate) } func AskTime(host string) time.Time { str, _ := RPCCall(host, "WhatsTheTime", "") return parseTime(str) } func parseTime(str string) time.Time { t, _ := time.Parse(time.UnixDate, str) return t } mumax3-3.10/cmd/mumax3-server/doc.go000066400000000000000000000111741371432437400172400ustar00rootroot00000000000000/* Easy-to-use cluster management tool for mumax3, with auto-configuration and web interface. When nodes are connected behind a home router, mumax3-server can run without any configuration. Otherwise only the IP address range where the other nodes reside has to be specified. Input files Upon starting mumax3-server, it scans the current working directory for input files. These should be organised in directories corresponding to user names. E.g.: john/file1.mx3 john/file2.mx3 ... kate/file1.mx3 kate/file2.mx3 ... Other files will be ignored. These input files will run on all available nodes in the network. After adding/removing files, you should click "rescan" in the web interface, or wait for a few minutes. Web interface mumax3-server serves a web interface at http://localhost:35360 (you have overridden the port, see below). Depending on your OS you may need to use your exact IP address instead of localhost, e.g.: http://192.168.0.1:35360. The web interface shows you the queued jobs, running jobs, output files, etc., and allows to re-scan for new job files or kill running jobs Compute nodes Each node that runs mumax3-server and has a working mumax3 installation will automatically serve as a compute node (even if it stores input files as well). The web interface will show the mumax version and available GPUs. The -exec flag may be used to override which mumax3 binary to use. 
E.g.: mumax3-server -exec /usr/local/mumax3/mumax3-cuda6.5 #override mumax3 binary Scan for other nodes Upon starting mumax3-server, it will automatically scan for other nodes in the local network. These will automatically start running jobs (if they have a GPU and mumax3 installed), or may serve job files to be executed by other nodes. By default, we search for nodes with IP addresses in the range 192.168.0.1-128 (local network behind, e.g., a router). This can be changed by the -scan flag. E.g.: mumax3-server -scan 127.0.0.1,169.254.0-1.1-254 mumax3-server -ports 35360-35369 Even when a new node appears on the network after the port scan, it should still be automatically detected. If not, hit "rescan" in the web interface. The -ports flag may be used to change the port numbers being scanned, in case the server uses a non-standard port (-l flag). Override port number mumax3-server uses tcp port 35360, which needs to be accessible (e.g., through your firewall). This port and the service's IP address can be overridden with the -l flag: mumax3-server -l :35361 #serves at non-standard port mumax3-server -l 192.168.1.1:35360 #serves at specific IP address, e.g. for dual-link machines Fault tolerance mumax3-server makes a great effort to recover from failed nodes, network outages, reboots etc. If a simulation is interrupted for any such reason, it should be re-queued and automatically re-started later. In that case the web interface will show [1x re-queued] to indicate that the job has been interrupted, but it will run later nevertheless.
Command line flags Usage of mumax3-server: -cache="": mumax3 kernel cache path -exec="mumax3": mumax3 executable -halflife=24h0m0s: share decay half-life -l=":35360": Listen and serve at this network address -log=true: log debug output -ports="35360-35361": Scan these ports for other servers -scan="192.168.0.1-128": Scan these IP address for other servers -timeout=2s: Portscan timeout Web interface example http://localhost:35360 157.193.57.146:35360 Uptime: 27h45m38s Peer nodes scan 157.193.57.2-254: 35360-35361 ports 35360-35361 (Rescan) 157.193.57.146:35360 157.193.57.228:35360 Compute service mumax: mumax 3.6 beta2 linux_amd64 go1.3.3 (gc) GPU0: CUDA 6 GeForce GTX 680(2047MB) cc3.0 GPU1: CUDA 6 GeForce GTX 680(2047MB) cc3.0 GPU2: CUDA 6 GeForce GTX 680(2047MB) cc3.0 Running jobs [157.193.57.146:35360/john/b_ext_add.mx3] [3s] [GUI] [kill] [157.193.57.146:35360/john/demag2D.mx3] [2s] [GUI] [kill] [157.193.57.146:35360/john/demag2Dpbc.mx3] [1s] [GUI] [kill] Queue service Users john 589 GPU-seconds has queued jobs kate 0 GPU-seconds no queued jobs Next job for: john Jobs [Reload all] [Wake-up Watchdog] john [Reload] [john/anisenergy.mx3] [.out] [157.193.57.146:35360] [ OK ] [1s] [john/anisenergyconservation.mx3] [.out] [157.193.57.146:35360] [ OK ] [2s] [john/anisenergyconservation2.mx3] [.out] [157.193.57.146:35360] [ OK ] [2s] [john/anisenergyconservation3.mx3] [.out] [157.193.57.228:35360] [ OK ] [1s] [john/anisenergyconservation4.mx3] [.out] [157.193.57.146:35360] [ OK ] [2s] kate [Reload] */ package main mumax3-3.10/cmd/mumax3-server/job.go000066400000000000000000000125551371432437400172510ustar00rootroot00000000000000package main import ( "log" "os" "strconv" "time" "github.com/mumax/3/httpfs" "github.com/mumax/3/util" ) const MaxRequeue = 10 // maximum number of re-queues, don't run job if re-queued to many times // compute Job type Job struct { ID string // host/path of the input file, e.g., hostname:port/user/inputfile.mx3 // in-memory properties: 
RequeCount int // how many times requeued. Error interface{} // error that cannot be consolidated to disk // all of this is cache: Output string // if exists, points to output ID Host string // node address in host file (=last host who started this job) ExitStatus string // what's in the exitstatus file Start time.Time // When this job was started, if applicable Alive time.Time // Last time when this job was seen alive duration time.Duration } // Find job belonging to ID func JobByName(ID string) *Job { user := Users[BaseDir(LocalPath(ID))] if user == nil { log.Println("JobByName: no user for", ID) return nil } jobs := user.Jobs low := 0 high := len(jobs) - 1 mid := -1 for low <= high { mid = (low + high) / 2 switch { case jobs[mid].ID > ID: high = mid - 1 case jobs[mid].ID < ID: low = mid + 1 default: low = high + 1 // break for loop :-( } } if mid >= 0 && mid < len(jobs) && jobs[mid].ID == ID { return jobs[mid] } else { log.Println("JobByName: not found:", ID) return nil } } // read job files from storage and update status cache func (j *Job) Update() { out := j.LocalOutputDir() if exists(out) { j.Output = thisAddr + "/" + out } else { j.Output = "" j.ExitStatus = "" j.Start = time.Time{} j.Alive = time.Time{} j.duration = 0 } if j.Output != "" { j.Host = httpfsRead(out + "host") j.ExitStatus = httpfsRead(out + "exitstatus") j.Start = parseTime(httpfsRead(out + "start")) j.Alive = parseTime(httpfsRead(out + "alive")) j.duration = time.Duration(atoi(httpfsRead(out + "duration"))) } } // Put job back in queue for later, e.g., when killed. func (j *Job) Reque() { log.Println("requeue", j.ID) j.RequeCount++ httpfs.Remove(j.LocalOutputDir()) j.Update() } func SetJobError(ID string, err interface{}) { log.Println("SetJobErr", ID, err) WLock() defer WUnlock() j := JobByName(ID) if j == nil { return } j.Error = err } // How long job has been running, if running. 
func (j *Job) Duration() time.Duration { if j.Start.IsZero() { return 0 } if j.duration != 0 { return j.duration } if j.IsRunning() { return Since(time.Now(), j.Start) } return 0 // unknown duration } // user name for this job ID func (j *Job) User() string { return JobUser(j.ID) } // user name for this job ID func JobUser(ID string) string { return BaseDir(LocalPath(ID)) } // local path of input file func (j *Job) LocalPath() string { return LocalPath(j.ID) } // local path of input file, without host prefix. E.g.: // host:123/user/file.mx3 -> user/file.mx3 func LocalPath(ID string) string { host := JobHost(ID) if len(host)+1 >= len(ID) { log.Println("Invalid LocalPath call on", ID) return "" } return ID[len(host)+1:] } // local path of output dir func (j *Job) LocalOutputDir() string { return OutputDir(j.LocalPath()) } // output directory for input file func OutputDir(path string) string { return util.NoExt(path) + ".out/" } // insert "/fs" in front of url path func (*Job) FS(id string) string { return FS(id) } // insert "/fs" in front of url path func FS(id string) string { return BaseDir(id) + "/fs/" + LocalPath(id) } // is job queued? func (j *Job) IsQueued() bool { return j.Output == "" && j.RequeCount < MaxRequeue } // is job running? func (j *Job) IsRunning() bool { return j.Output != "" && j.ExitStatus == "" && j.Host != "" } // Host of job with this ID (=first path element). E.g.: // host:123/user/file.mx3 -> host:123 func JobHost(ID string) string { return BaseDir(ID) } // Job status number queued, running,... 
type Status int const ( QUEUED Status = iota RUNNING FINISHED FAILED ) var statusString = map[Status]string{ QUEUED: "QUEUED", RUNNING: "RUNNING", FINISHED: "FINISHED", FAILED: "FAILED", } func (s Status) String() string { return statusString[s] } // human-readable status string (for gui) func (j *Job) Status() string { if j.IsQueued() { return QUEUED.String() } if j.ExitStatus == "0" { return FINISHED.String() } if j.ExitStatus == "" && j.Host == "" { return FINISHED.String() } if j.Host != "" && j.ExitStatus == "" { return RUNNING.String() } if j.ExitStatus != "" && j.ExitStatus != "0" { return FAILED.String() } return "UNKNOWN" } // remove job output func Rm(URL string) string { err := httpfs.Remove("http://" + OutputDir(URL)) // update status after output removal UpdateJob(URL) if err != nil { return err.Error() } // report re-queue // handy if others remove your jobs job := JobByName(URL) if job != nil { job.RequeCount++ } // make sure job runs again quickly user := JobUser(URL) u := Users[user] if u != nil { u.nextPtr = 0 } return "" } // check if path exists func exists(path string) bool { _, err := os.Stat(path) return err == nil } // atoi, does not return error func atoi(a string) int64 { i, _ := strconv.ParseInt(a, 10, 64) return i } // return file content as string, no errors func httpfsRead(fname string) string { data, err := httpfs.Read(fname) if err != nil { return "" } return string(data) } mumax3-3.10/cmd/mumax3-server/main.go000066400000000000000000000123401371432437400174130ustar00rootroot00000000000000package main import ( "flag" "fmt" "log" "net" "net/http" _ "net/http/pprof" "os" "strconv" "strings" "sync" "time" "github.com/mumax/3/httpfs" "github.com/mumax/3/util" ) var ( flag_addr = flag.String("l", ":35360", "Listen and serve at this network address") flag_scan = flag.String("scan", "192.168.0.1-128", "Scan these IP address for other servers") flag_ports = flag.String("ports", "35360-35361", "Scan these ports for other servers") 
flag_timeout = flag.Duration("timeout", 2*time.Second, "Portscan timeout") flag_mumax = flag.String("exec", "mumax3", "mumax3 executable") flag_cachedir = flag.String("cache", "", "mumax3 kernel cache path") flag_log = flag.Bool("log", true, "log debug output") flag_halflife = flag.Duration("halflife", 24*time.Hour, "share decay half-life") ) const ( MaxIPs = 1024 // maximum number of IP address to portscan N_SCANNERS = 32 // number of parallel portscan goroutines MAXGPU = 16 // maximum number of GPU's to check for KeepaliveInterval = 10 * time.Second // signal process liveness every KeepaliveInterval ) var ( thisAddr string // unique address of this node, e.g., name:1234 thisHost string // unique hostname of this node, e.g., name IPs []string MinPort, MaxPort int global_lock sync.RWMutex ) func RLock() { global_lock.RLock() } func RUnlock() { global_lock.RUnlock() } func WLock() { global_lock.Lock() } func WUnlock() { global_lock.Unlock() } const GUI_PORT = 35367 // base port number for GUI (to be incremented by GPU number) func main() { flag.Parse() IPs = parseIPs() MinPort, MaxPort = parsePorts() thisAddr = canonicalAddr(*flag_addr, IPs) var err error thisHost, _, err = net.SplitHostPort(thisAddr) util.FatalErr(err) DetectMumax() DetectGPUs() LoadJobs() http.HandleFunc("/do/", HandleRPC) http.HandleFunc("/", HandleStatus) httpfs.RegisterHandlers() // Listen and serve on all interfaces go func() { log.Println("serving at", thisAddr) // Resolve the IPs for thisHost thisIP, err := net.LookupHost(thisHost) Fatal(err) // try to listen and serve on all interfaces other than thisAddr // this is for convenience, errors are not fatal. 
_, p, err := net.SplitHostPort(thisAddr) Fatal(err) ips := util.InterfaceAddrs() for _, ip := range ips { addr := net.JoinHostPort(ip, p) if !contains(thisIP, ip) { // skip thisIP, will start later and is fatal on error go func() { log.Println("serving at", addr) err := http.ListenAndServe(addr, nil) if err != nil { log.Println("info:", err, "(but still serving other interfaces)") } }() } } // only on thisAddr, this server's unique address, // we HAVE to be listening. Fatal(http.ListenAndServe(thisAddr, nil)) }() ProbePeer(thisAddr) // make sure we have ourself as peer go FindPeers(IPs, MinPort, MaxPort) go RunComputeService() go LoopWatchdog() go RunShareDecay() // re-load jobs every hour so we don't stall on very exceptional circumstances go func() { for { time.Sleep(1 * time.Hour) LoadJobs() } }() <-make(chan struct{}) // wait forever } // replace laddr by a canonical form, as it will serve as unique ID func canonicalAddr(laddr string, IPs []string) string { // safe initial guess: hostname:port h, p, err := net.SplitHostPort(laddr) Fatal(err) if h == "" { h, _ = os.Hostname() } name := net.JoinHostPort(h, p) ips := util.InterfaceAddrs() for _, ip := range ips { if contains(IPs, ip) { return net.JoinHostPort(ip, p) } } return name } func contains(arr []string, x string) bool { for _, s := range arr { if x == s { return true } } return false } // Parse port range flag. 
E.g.: // 1234-1237 -> 1234, 1237 func parsePorts() (minPort, maxPort int) { p := *flag_ports split := strings.Split(p, "-") if len(split) > 2 { log.Fatal("invalid port range:", p) } minPort, _ = strconv.Atoi(split[0]) if len(split) > 1 { maxPort, _ = strconv.Atoi(split[1]) } if maxPort == 0 { maxPort = minPort } if minPort == 0 || maxPort == 0 || maxPort < minPort { log.Fatal("invalid port range:", p) } return } // init IPs from flag func parseIPs() []string { var IPs []string defer func() { if err := recover(); err != nil { log.Fatal("invalid IP range:", *flag_scan) } }() p := *flag_scan split := strings.Split(p, ",") for _, s := range split { split := strings.Split(s, ".") if len(split) != 4 { log.Fatal("invalid IP address range:", s) } var start, stop [4]uint for i, s := range split { split := strings.Split(s, "-") first := atobyte(split[0]) start[i], stop[i] = first, first if len(split) > 1 { stop[i] = atobyte(split[1]) } } for A := start[0]; A <= stop[0]; A++ { for B := start[1]; B <= stop[1]; B++ { for C := start[2]; C <= stop[2]; C++ { for D := start[3]; D <= stop[3]; D++ { if len(IPs) > MaxIPs { log.Fatal("too many IP addresses to scan in", p) } IPs = append(IPs, fmt.Sprintf("%v.%v.%v.%v", A, B, C, D)) } } } } } return IPs } func atobyte(a string) uint { i, err := strconv.Atoi(a) if err != nil { panic(err) } if int(byte(i)) != i { panic("too large") } return uint(i) } mumax3-3.10/cmd/mumax3-server/peers.go000066400000000000000000000024411371432437400176060ustar00rootroot00000000000000package main // Peer management: // portscan for peers // ping peers import ( "fmt" "log" ) var ( peers = make(map[string]*Peer) ) type Peer struct { } func AddPeer(pAddr string) { WLock() defer WUnlock() if _, ok := peers[pAddr]; !ok { log.Println("add new peer:", pAddr) peers[pAddr] = NewPeer() } } func NewPeer() *Peer { return &Peer{} } // RPC-called func Ping(peerAddr string) string { WLock() defer WUnlock() // Somebody just called my status, // and him as a peer (if not 
yet so). if _, ok := peers[peerAddr]; !ok { peers[peerAddr] = NewPeer() } return thisAddr } // Ping peer at address, add to peers list if he responds and is not yet added func ProbePeer(addr string) { ret, _ := RPCCall(addr, "Ping", thisAddr) if ret != "" { AddPeer(ret) } } // Scan IPs and port range for peers that respond to Ping, // add them to peers list. func FindPeers(IPs []string, minPort, maxPort int) { //log.Println("Portscan start") scanners := make(chan func()) for i := 0; i < N_SCANNERS; i++ { go func() { for f := range scanners { f() } }() } for _, ip := range IPs { for port := minPort; port <= maxPort; port++ { addr := fmt.Sprint(ip, ":", port) scanners <- func() { ProbePeer(addr) } } } close(scanners) log.Println("-- portscan done") } mumax3-3.10/cmd/mumax3-server/que.go000066400000000000000000000064161371432437400172700ustar00rootroot00000000000000package main import ( "log" "math" "os" "path/filepath" "sort" "strings" "time" ) /* Queue service scans the working directory for job files. The working directory should contain per-user subdirectories. E.g.: arne/ bartel/ ... The in-memory representation is a cache and can be out-of-date at any point. The queue service decides which job to hand out to a node if asked so. */ var ( Users = make(map[string]*User) // maps user -> joblist ) // RPC-callable method: picks a job of the queue returns it // for the node to run it. 
func GiveJob(nodeAddr string) string { WLock() defer WUnlock() user := nextUser() if user == "" { return "" } Users[user].FairShare += 1 // 1 second penalty because a job has started return Users[user].giveJob(nodeAddr).ID } func AddFairShare(s string) string { username := BaseDir(s) share := atoi(s[len(username)+1:]) WLock() defer WUnlock() u := Users[username] if u == nil { return "no user " + username } log.Println("AddFairShare", username, share) u.FairShare += float64(share) return "" // ok } func nextUser() string { // search user with least share and jobs in queue leastShare := math.Inf(1) var bestUser string for n, u := range Users { if u.HasJob() && u.FairShare < leastShare { leastShare = u.FairShare bestUser = n } } return bestUser } // (Re-)load all jobs in the working directory. // Called upon program startup. func LoadJobs() { dir, err := os.Open(".") Fatal(err) subdirs, err2 := dir.Readdir(-1) Fatal(err2) for _, d := range subdirs { if d.IsDir() { LoadUserJobs(d.Name()) } } } // (Re-)load all jobs in the user's subdirectory. func LoadUserJobs(dir string) string { log.Println("LoadUserJobs", dir) var newJobs []*Job err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { if strings.HasSuffix(path, ".mx3") && !strings.HasPrefix(info.Name(), ".") { ID := thisAddr + "/" + path log.Println("addingJob", ID) job := &Job{ID: ID} job.Update() newJobs = append(newJobs, job) } return nil }) l := joblist(newJobs) sort.Sort(&l) Fatal(err) // TODO: recover? WLock() defer WUnlock() if _, ok := Users[dir]; !ok { Users[dir] = NewUser() } Users[dir].Jobs = newJobs Users[dir].nextPtr = 0 return "" } type joblist []*Job func (l *joblist) Len() int { return len(*l) } func (l *joblist) Less(i, j int) bool { return (*l)[i].ID < (*l)[j].ID } func (l *joblist) Swap(i, j int) { (*l)[i], (*l)[j] = (*l)[j], (*l)[i] } // RPC-callable function. Refreshes the in-memory cached info about this job. // Called, e.g., after a node has finished a job. 
func UpdateJob(jobURL string) string { WLock() defer WUnlock() j := JobByName(jobURL) if j == nil { log.Println("update", jobURL, ": no such job") return "" // empty conventionally means error } j.Update() return "updated " + jobURL // not used, but handy if called by Human. } // Periodically updates user's usedShare so they decay // exponentially according to flag_haflife func RunShareDecay() { halflife := *flag_halflife quantum := halflife / 100 // several updates per half-life gives smooth decay reduce := math.Pow(0.5, float64(quantum)/float64(halflife)) for { time.Sleep(quantum) WLock() for _, u := range Users { u.FairShare *= reduce } WUnlock() } } mumax3-3.10/cmd/mumax3-server/rpc.go000066400000000000000000000040731371432437400172570ustar00rootroot00000000000000package main import ( "fmt" "io/ioutil" "log" "net/http" "strings" "time" ) type RPCFunc func(string) string var methods = map[string]RPCFunc{ "AddFairShare": AddFairShare, "GiveJob": GiveJob, "Kill": Kill, "LoadJobs": wrap(LoadJobs), "LoadUserJobs": LoadUserJobs, "Ping": Ping, "UpdateJob": UpdateJob, "Rescan": func(string) string { go FindPeers(IPs, MinPort, MaxPort); return "" }, "WhatsTheTime": WhatsTheTime, "WakeupWatchdog": WakeupWatchdog, "rm": Rm, } func wrap(f func()) RPCFunc { return func(string) string { f(); return "" } } func HandleRPC(w http.ResponseWriter, r *http.Request) { var ret string defer func() { //log.Println(" < call ", r.Host, r.URL.Path, "->", ret) if err := recover(); err != nil { log.Println("*** RPC panic: ", r.URL.Path, ":", err) http.Error(w, "Does not compute: "+r.URL.Path, http.StatusBadRequest) } }() request := r.URL.Path[len("/do/"):] slashPos := strings.Index(request, "/") method := request[:slashPos] arg := request[slashPos+1:] m, ok := methods[method] if !ok { log.Println("*** RPC no such method", r.URL.Path) http.Error(w, "Does not compute: "+method, http.StatusBadRequest) return } ret = m(arg) fmt.Fprint(w, ret) } // re-usable http client for making RPC calls var 
httpClient = http.Client{Timeout: 2 * time.Second} // make RPC call to method on node with given address. func RPCCall(addr, method, arg string) (ret string, err error) { //defer func() { log.Println(" > call ", addr, method, arg, "->", ret, err) }() //TODO: escape args? resp, err := httpClient.Get("http://" + addr + "/do/" + method + "/" + arg) if err != nil { //log.Println("*** RPC error: ", err) return "", err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { log.Println("*** RPC error: ", resp.Status) return "", fmt.Errorf("http status %v", resp.Status) } if b, err := ioutil.ReadAll(resp.Body); err != nil { log.Println("*** RPC read error: ", err) return "", err } else { return string(b), nil } } mumax3-3.10/cmd/mumax3-server/status.go000066400000000000000000000126271371432437400200220ustar00rootroot00000000000000package main // Serves human-readable status information over http. import ( "html/template" "net/http" "time" ) var ( templ = template.Must(template.New("status").Parse(templText)) upSince = time.Now() ) func HandleStatus(w http.ResponseWriter, r *http.Request) { RLock() defer RUnlock() if r.URL.Path != "/" { http.Error(w, "Does not compute", http.StatusNotFound) return } err := templ.Execute(w, &status{}) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) } } type status struct{} // dummy type to define template methods on func (*status) IPRange() string { return *flag_scan + ": " + *flag_ports } func (*status) Ports() string { return *flag_ports } func (*status) ThisAddr() string { return thisAddr } func (*status) Uptime() time.Duration { return Since(time.Now(), upSince) } func (*status) MumaxVersion() string { return MumaxVersion } func (*status) GPUs() []string { return GPUs } func (*status) Processes() map[string]*Process { return Processes } func (*status) Users() map[string]*User { return Users } func (*status) NextUser() string { return nextUser() } func (*status) Peers() map[string]*Peer { return peers } 
func (*status) FS(a string) string { return FS(a) } const templText = ` {{define "Job"}} [{{.LocalPath}}] [{{with .Output}}.out{{end}}] [{{with .Output}}rm{{end}}] [{{with .Host}}{{.}}{{end}}] [{{with .ExitStatus}}{{if eq . "0"}} OK {{else}}FAIL{{end}}{{end}}] [{{with .Output}}{{$.Duration}}{{end}}{{with .RequeCount}} {{.}}x re-queued{{end}}{{with .Error}} {{.}}{{end}}] {{end}}

{{.ThisAddr}}

Uptime: {{.Uptime}}

Peer nodes

scan {{.IPRange}}
ports {{.Ports}}

{{range $k,$v := .Peers}} {{$k}}
{{end}}

Compute service

mumax: {{with .MumaxVersion}} {{.}} {{else}} not available
{{end}}
{{with .GPUs}} {{range $i, $v := .}} GPU{{$i}}: {{$v}}
{{end}} {{else}} No GPUs available
{{end}}

Running jobs

{{range $k,$v := .Processes}} {{end}}
[{{$k}}] [{{$v.Duration}}] [GUI]

Queue service

Users

{{range $k,$v := .Users}} {{end}}
{{$k}}{{$v.FairShare}} GPU-seconds{{with .HasJob}} has {{else}} no {{end}} queued jobs
Next job for: {{.NextUser}}

Jobs

(consider reloading just your own files).
(re-queue dead simulations right now). {{range $k,$v := .Users}}

▾ {{$k}}

Jobs (only needed when you changed your files on disk) {{range $v.Jobs}} {{template "Job" .}} {{end}}

{{end}}

` mumax3-3.10/cmd/mumax3-server/user.go000066400000000000000000000017151371432437400174510ustar00rootroot00000000000000package main import "time" type User struct { Jobs []*Job FairShare float64 // Used-up compute time in the past (decays) nextPtr int // pointer suggesting next job to start. Reset on re-scan. len(Jobs) means no queued job } func NewUser() *User { return &User{} } // nextJob looks for the next free job in the list. // it does a tiny bit of linear search, starting from nextPtr. func (u *User) giveJob(node string) *Job { index := u.nextJobPtr() if index >= len(u.Jobs) { return nil } u.nextPtr++ j := u.Jobs[index] // all below are preliminary, to get rapid gui response. // may be overwritten by update j.Host = node j.Output = OutputDir(j.ID) j.Start = time.Now() return j } func (u *User) HasJob() bool { i := u.nextJobPtr() return i < len(u.Jobs) } // returns func (u *User) nextJobPtr() int { for ; u.nextPtr < len(u.Jobs); u.nextPtr++ { j := u.Jobs[u.nextPtr] if j.IsQueued() { return u.nextPtr } } return u.nextPtr } mumax3-3.10/cmd/mumax3-server/utitl.go000066400000000000000000000015211371432437400176270ustar00rootroot00000000000000package main import ( "log" "net/url" "strings" "time" ) // BaseDir returns the first path element, without slashes and ignoring http:// . 
E.g.: // /home/user/file -> home // user/file -> user // http://home/user/file -> home func BaseDir(dir string) string { if strings.HasPrefix(dir, "http://") { return BaseDir(dir[len("http://"):]) } firstSlash := strings.Index(dir, "/") switch { case firstSlash < 0: return dir case firstSlash == 0: return BaseDir(dir[1:]) default: return dir[:firstSlash] } } func Fatal(err error) { if err != nil { log.Fatal(err) } } // rounded up to 1s precission func Since(a, b time.Time) time.Duration { d := a.Sub(b) return (d/1e9)*1e9 + 1e9 } // Parse URL, panic on error func MustParseURL(URL string) *url.URL { u, err := url.Parse(URL) if err != nil { panic(err) } return u } mumax3-3.10/cmd/mumax3-server/watchdog.go000066400000000000000000000021521371432437400202670ustar00rootroot00000000000000package main import ( "log" "time" ) var runWatchdog = make(chan struct{}) func init() { // run watchdog daemon in background go func() { for { <-runWatchdog // wait for start DoWatchdog() } }() } func LoopWatchdog() { for { WakeupWatchdog("") time.Sleep(3 * KeepaliveInterval) } } func WakeupWatchdog(string) string { select { default: return "already running" case runWatchdog <- struct{}{}: return "" // ok } } // single watchdog run: // re-queues all dead processes func DoWatchdog() { //log.Println("Watchdog wake-up") WLock() defer WUnlock() for _, u := range Users { for _, j := range u.Jobs { id := j.ID //log.Println(id, "running:", j.IsRunning(), "alive:", time.Since(j.Alive)) if j.IsRunning() && time.Since(j.Alive) > 3*KeepaliveInterval { j.Update() lastHeartbeat := time.Since(j.Alive) if lastHeartbeat > 3*KeepaliveInterval { log.Println("*** Re-queue", id, "after", lastHeartbeat, "inactivity") j.Reque() } } } // re-set nextPtr to beginning so we can start re-queued jobs if u.nextPtr >= len(u.Jobs) { u.nextPtr = 0 } } } 
mumax3-3.10/cmd/mumax3/000077500000000000000000000000001371432437400146345ustar00rootroot00000000000000mumax3-3.10/cmd/mumax3/.gitignore000066400000000000000000000000071371432437400166210ustar00rootroot00000000000000mumax3 mumax3-3.10/cmd/mumax3/Makefile000066400000000000000000000000241371432437400162700ustar00rootroot00000000000000all: go install -v mumax3-3.10/cmd/mumax3/browser.go000066400000000000000000000007411371432437400166500ustar00rootroot00000000000000package main import ( "fmt" "os/exec" ) // Try to open url in a browser. Instruct to do so if it fails. func openbrowser(url string) { for _, cmd := range browsers { err := exec.Command(cmd, url).Start() if err == nil { fmt.Println("//openend web interface in", cmd) return } } fmt.Println("//please open ", url, " in a browser") } // list of browsers to try. var browsers = []string{"x-www-browser", "google-chrome", "chromium-browser", "firefox", "explorer"} mumax3-3.10/cmd/mumax3/main.go000066400000000000000000000100601371432437400161040ustar00rootroot00000000000000// mumax3 main command package main import ( "flag" "fmt" "log" "os" "os/exec" "path" "time" "github.com/mumax/3/cuda" "github.com/mumax/3/engine" "github.com/mumax/3/script" "github.com/mumax/3/util" ) var ( flag_failfast = flag.Bool("failfast", false, "If one simulation fails, stop entire batch immediately") flag_test = flag.Bool("test", false, "Cuda test (internal)") flag_version = flag.Bool("v", true, "Print version") flag_vet = flag.Bool("vet", false, "Check input files for errors, but don't run them") // more flags in engine/gofiles.go ) func main() { flag.Parse() log.SetPrefix("") log.SetFlags(0) cuda.Init(*engine.Flag_gpu) cuda.Synchronous = *engine.Flag_sync if *flag_version { printVersion() } // used by bootstrap launcher to test cuda // successful exit means cuda was initialized fine if *flag_test { fmt.Println(cuda.GPUInfo) os.Exit(0) } defer engine.Close() // flushes pending output, if any if *flag_vet { vet() return } switch 
flag.NArg() { case 0: if *engine.Flag_interactive { runInteractive() } case 1: runFileAndServe(flag.Arg(0)) default: RunQueue(flag.Args()) } } func runInteractive() { fmt.Println("//no input files: starting interactive session") //initEngine() // setup outut dir now := time.Now() outdir := fmt.Sprintf("mumax-%v-%02d-%02d_%02dh%02d.out", now.Year(), int(now.Month()), now.Day(), now.Hour(), now.Minute()) engine.InitIO(outdir, outdir, *engine.Flag_forceclean) engine.Timeout = 365 * 24 * time.Hour // basically forever // set up some sensible start configuration engine.Eval(`SetGridSize(128, 64, 1) SetCellSize(4e-9, 4e-9, 4e-9) Msat = 1e6 Aex = 10e-12 alpha = 1 m = RandomMag()`) addr := goServeGUI() openbrowser("http://127.0.0.1" + addr) engine.RunInteractive() } func runFileAndServe(fname string) { if path.Ext(fname) == ".go" { runGoFile(fname) } else { runScript(fname) } } func runScript(fname string) { outDir := util.NoExt(fname) + ".out" if *engine.Flag_od != "" { outDir = *engine.Flag_od } engine.InitIO(fname, outDir, *engine.Flag_forceclean) fname = engine.InputFile var code *script.BlockStmt var err2 error if fname != "" { // first we compile the entire file into an executable tree code, err2 = engine.CompileFile(fname) util.FatalErr(err2) } // now the parser is not used anymore so it can handle web requests goServeGUI() if *engine.Flag_interactive { openbrowser("http://127.0.0.1" + *engine.Flag_port) } // start executing the tree, possibly injecting commands from web gui engine.EvalFile(code) if *engine.Flag_interactive { engine.RunInteractive() } } func runGoFile(fname string) { // pass through flags flags := []string{"run", fname} flag.Visit(func(f *flag.Flag) { if f.Name != "o" { flags = append(flags, fmt.Sprintf("-%v=%v", f.Name, f.Value)) } }) if *engine.Flag_od != "" { flags = append(flags, fmt.Sprintf("-o=%v", *engine.Flag_od)) } cmd := exec.Command("go", flags...) 
log.Println("go", flags) cmd.Stdout = os.Stdout cmd.Stdin = os.Stdin cmd.Stderr = os.Stderr err := cmd.Run() if err != nil { os.Exit(1) } } // start Gui server and return server address func goServeGUI() string { if *engine.Flag_port == "" { log.Println(`//not starting GUI (-http="")`) return "" } addr := engine.GoServe(*engine.Flag_port) fmt.Print("//starting GUI at http://127.0.0.1", addr, "\n") return addr } // print version to stdout func printVersion() { engine.LogOut(engine.UNAME) engine.LogOut(fmt.Sprintf("GPU info: %s, using cc=%d PTX", cuda.GPUInfo, cuda.UseCC)) engine.LogOut("(c) Arne Vansteenkiste, Dynamat LAB, Ghent University, Belgium") engine.LogOut("This is free software without any warranty. See license.txt") engine.LogOut("********************************************************************//") engine.LogOut(" If you use mumax in any work or publication, //") engine.LogOut(" we kindly ask you to cite the references in references.bib //") engine.LogOut("********************************************************************//") } mumax3-3.10/cmd/mumax3/mumax3.sh000077500000000000000000000004561371432437400164120ustar00rootroot00000000000000#! /bin/bash # # This script adds the current directory to your library path # and launches mumax3 using the shipped cuda libraries. # # When you have correctly set-up cuda, you can just run # mumax directly without this wrapper. # export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$(pwd) ./mumax3-cuda6.0 $@ mumax3-3.10/cmd/mumax3/queue.go000066400000000000000000000104071371432437400163110ustar00rootroot00000000000000package main // File que for distributing multiple input files over GPUs. 
import ( "flag" "fmt" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/engine" "io" "log" "net/http" "os" "os/exec" "sync" "sync/atomic" ) var ( exitStatus atom = 0 numOK, numFailed atom = 0, 0 ) func RunQueue(files []string) { s := NewStateTab(files) s.PrintTo(os.Stdout) go s.ListenAndServe(*engine.Flag_port) s.Run() fmt.Println(numOK.get(), "OK, ", numFailed.get(), "failed") os.Exit(int(exitStatus)) } // StateTab holds the queue state (list of jobs + statuses). // All operations are atomic. type stateTab struct { lock sync.Mutex jobs []job next int } // Job info. type job struct { inFile string // input file to run webAddr string // http address for gui of running process uid int } // NewStateTab constructs a queue for the given input files. // After construction, it is accessed atomically. func NewStateTab(inFiles []string) *stateTab { s := new(stateTab) s.jobs = make([]job, len(inFiles)) for i, f := range inFiles { s.jobs[i] = job{inFile: f, uid: i} } return s } // StartNext advances the next job and marks it running, setting its webAddr to indicate the GUI url. // A copy of the job info is returned, the original remains unmodified. // ok is false if there is no next job. func (s *stateTab) StartNext(webAddr string) (next job, ok bool) { s.lock.Lock() defer s.lock.Unlock() if s.next >= len(s.jobs) { return job{}, false } s.jobs[s.next].webAddr = webAddr jobCopy := s.jobs[s.next] s.next++ return jobCopy, true } // Finish marks the job with j's uid as finished. func (s *stateTab) Finish(j job) { s.lock.Lock() defer s.lock.Unlock() s.jobs[j.uid].webAddr = "" } // Runs all the jobs in stateTab. 
func (s *stateTab) Run() { nGPU := cu.DeviceGetCount() idle := initGPUs(nGPU) for { gpu := <-idle addr := fmt.Sprint(":", 35368+gpu) j, ok := s.StartNext(addr) if !ok { break } go func() { run(j.inFile, gpu, j.webAddr) s.Finish(j) idle <- gpu }() } // drain remaining tasks (one already done) for i := 1; i < nGPU; i++ { <-idle } } type atom int32 func (a *atom) set(v int) { atomic.StoreInt32((*int32)(a), int32(v)) } func (a *atom) get() int { return int(atomic.LoadInt32((*int32)(a))) } func (a *atom) inc() { atomic.AddInt32((*int32)(a), 1) } func run(inFile string, gpu int, webAddr string) { // overridden flags gpuFlag := fmt.Sprint(`-gpu=`, gpu) httpFlag := fmt.Sprint(`-http=`, webAddr) // pass through flags flags := []string{gpuFlag, httpFlag} flag.Visit(func(f *flag.Flag) { if f.Name != "gpu" && f.Name != "http" && f.Name != "failfast" { flags = append(flags, fmt.Sprintf("-%v=%v", f.Name, f.Value)) } }) flags = append(flags, inFile) cmd := exec.Command(os.Args[0], flags...) log.Println(os.Args[0], flags) output, err := cmd.CombinedOutput() if err != nil { log.Println(inFile, err) log.Printf("%s\n", output) exitStatus.set(1) numFailed.inc() if *flag_failfast { os.Exit(1) } } else { numOK.inc() } } func initGPUs(nGpu int) chan int { if nGpu == 0 { log.Fatal("no GPUs available") panic(0) } idle := make(chan int, nGpu) for i := 0; i < nGpu; i++ { idle <- i } return idle } func (s *stateTab) PrintTo(w io.Writer) { s.lock.Lock() defer s.lock.Unlock() for i, j := range s.jobs { fmt.Fprintf(w, "%3d %v %v\n", i, j.inFile, j.webAddr) } } func (s *stateTab) RenderHTML(w io.Writer) { s.lock.Lock() defer s.lock.Unlock() fmt.Fprintln(w, ` `+engine.CSS+` mumax3 queue status

`)

	// Keep "localhost" as the fallback: os.Hostname returns an empty name
	// together with its error, which must not replace the default.
	hostname := "localhost"
	if h, err := os.Hostname(); err == nil && h != "" {
		hostname = h
	}
	for _, j := range s.jobs {
		if j.webAddr != "" {
			fmt.Fprint(w, ``, j.uid, ` `, j.inFile, " ", j.webAddr, "\n")
		} else {
			fmt.Fprint(w, j.uid, " ", j.inFile, "\n")
		}
	}

	fmt.Fprintln(w, `

`) } func (s *stateTab) ListenAndServe(addr string) { http.Handle("/", s) go http.ListenAndServe(addr, nil) } func (s *stateTab) ServeHTTP(w http.ResponseWriter, r *http.Request) { s.RenderHTML(w) } mumax3-3.10/cmd/mumax3/vet.go000066400000000000000000000010551371432437400157620ustar00rootroot00000000000000package main import ( "flag" "fmt" "io/ioutil" "os" "github.com/mumax/3/engine" "github.com/mumax/3/util" ) // check all input files for errors, don't run. func vet() { status := 0 for _, f := range flag.Args() { src, ioerr := ioutil.ReadFile(f) util.FatalErr(ioerr) engine.World.EnterScope() // avoid name collisions between separate files _, err := engine.World.Compile(string(src)) engine.World.ExitScope() if err != nil { fmt.Println(f, ":", err) status = 1 } else { fmt.Println(f, ":", "OK") } } os.Exit(status) } mumax3-3.10/cuda/000077500000000000000000000000001371432437400135735ustar00rootroot00000000000000mumax3-3.10/cuda/.gitignore000066400000000000000000000000411371432437400155560ustar00rootroot00000000000000*.ptx cuda2go cuda2go.exe *.orig mumax3-3.10/cuda/Makefile000066400000000000000000000052441371432437400152400ustar00rootroot00000000000000# Builds mumax3 cuda kernels and create GO wrappers for the compute capabilities listed in $CUDA_CC. # If $CUDA_CC is not defined, then $CUDA_CC is set to "30". # # The ${CUDA_HOME}/bin/nvcc compiler is used to compile the cuda kernels. If CUDA_HOME is not defined # it will look for an nvidia compiler in $PATH instead. # # Examples: # # make # make CUDA_CC=70 # make CUDA_CC="30 32 35 37 50 52 53 60 61 62 70" # make CUDA_HOME="/usr/local/cuda-9.0" CUDA_CC="30 32 35 37 50 52 53 60 61 62 70" # # Different CUDA versions support different compute capabilities, as shown in the list below. 
# CUDA SDK 9.0 support for compute capability 30 32 35 37 50 52 53 60 61 62 70 # CUDA SDK 9.1 support for compute capability 30 32 35 37 50 52 53 60 61 62 70 # CUDA SDK 9.2 support for compute capability 30 32 35 37 50 52 53 60 61 62 70 72 # CUDA SDK 10.0 support for compute capability 30 32 35 37 50 52 53 60 61 62 70 72 75 # CUDA SDK 10.1 support for compute capability 30 32 35 37 50 52 53 60 61 62 70 72 75 # CUDA SDK 10.2 support for compute capability 30 32 35 37 50 52 53 60 61 62 70 72 75 # CUDA SDK 11.0 support for compute capability 30 32 35 37 50 52 53 60 61 62 70 72 75 80 SHELL = /bin/bash # When CUDA_HOME is not an environment variable and is not set on the command line, use the nvcc compiler # from the PATH ifeq ($(CUDA_HOME),) NVCC=nvcc else NVCC=${CUDA_HOME}/bin/nvcc endif # When CUDA_CC is not an environment variable and is not set on the command line, use compute capability 3.0 ifeq ($(CUDA_CC),) CUDA_CC = 30 endif # The gcc host compiler for nvcc ifeq ($(NVCC_CCBIN),) NVCC_CCBIN=/usr/bin/gcc endif CUDA_VERSION := $(shell $(NVCC) --version | grep "Cuda compilation" | grep -Eo '[+-]?[0-9]+([.][0-9]+)?' 
| head -n 1) NVCC_COMPATIBILITY_FLAGS := -std=c++03 ifneq (,$(filter 7.0 7.5 8.0,$(CUDA_VERSION))) NVCC_COMPATIBILITY_FLAGS := endif NVCCFLAGS = $(NVCC_COMPATIBILITY_FLAGS) -ccbin=$(NVCC_CCBIN) --compiler-options -Werror --compiler-options -Wall -Xptxas -O3 -ptx CUDAFILES := $(wildcard *.cu) WRAPPERS := $(CUDAFILES:.cu=_wrapper.go) .PHONY: all wrappers clean realclean all: wrappers @echo "Built with CUDA version ${CUDA_VERSION}" go install -v wrappers: $(WRAPPERS) %_wrapper.go: %.cu cuda2go @ rm -f $(basename $<)*.ptx @ for cc in $(CUDA_CC); do \ echo $(NVCC) $(NVCCFLAGS) -arch=compute_$$cc -code=sm_$$cc $< -o $(basename $<)_$$cc.ptx ;\ $(NVCC) $(NVCCFLAGS) -arch=compute_$$cc -code=sm_$$cc $< -o $(basename $<)_$$cc.ptx ;\ done @ ./cuda2go $< > /dev/null @ gofmt -w -s -l $@ > /dev/null cuda2go: cuda2go.go go build $< clean: rm -vf *.ptx realclean: rm -vf *_wrapper.go *.ptx cuda2gomumax3-3.10/cuda/alloc.go000066400000000000000000000010421371432437400152110ustar00rootroot00000000000000package cuda import ( "log" "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/data" ) // Wrapper for cu.MemAlloc, fatal exit on out of memory. func MemAlloc(bytes int64) unsafe.Pointer { defer func() { err := recover() if err == cu.ERROR_OUT_OF_MEMORY { log.Fatal(err) } if err != nil { panic(err) } }() return unsafe.Pointer(uintptr(cu.MemAlloc(bytes))) } // Returns a copy of in, allocated on GPU. func GPUCopy(in *data.Slice) *data.Slice { s := NewSlice(in.NComp(), in.Size()) data.Copy(s, in) return s } mumax3-3.10/cuda/amul.h000066400000000000000000000013701371432437400147030ustar00rootroot00000000000000#ifndef _AMUL_H_ #define _AMUL_H_ #include "float3.h" // Returns mul * arr[i], or mul when arr == NULL; inline __device__ float amul(float *arr, float mul, int i) { return (arr == NULL)? 
(mul): (mul * arr[i]); } // Returns m * a[i], or m when a == NULL; inline __device__ float3 vmul(float *ax, float *ay, float *az, float mx, float my, float mz, int i) { return make_float3(amul(ax, mx, i), amul(ay, my, i), amul(az, mz, i)); } // Returns 1/Msat, or 0 when Msat == 0. inline __device__ float inv_Msat(float *Ms_, float Ms_mul, int i) { float ms = amul(Ms_, Ms_mul, i); if (ms == 0.0f) { return 0.0f; } else { return 1.0f / ms; } } #endif mumax3-3.10/cuda/angles.go000066400000000000000000000007261371432437400154000ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/data" "github.com/mumax/3/util" ) func SetPhi(s *data.Slice, m *data.Slice) { N := s.Size() util.Argument(m.Size() == N) cfg := make3DConf(N) k_setPhi_async(s.DevPtr(X), m.DevPtr(X), m.DevPtr(Y), N[X], N[Y], N[Z], cfg) return } func SetTheta(s *data.Slice, m *data.Slice) { N := s.Size() util.Argument(m.Size() == N) cfg := make3DConf(N) k_setTheta_async(s.DevPtr(X), m.DevPtr(Z), N[X], N[Y], N[Z], cfg) return } mumax3-3.10/cuda/anisotropy.go000066400000000000000000000024471371432437400163400ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/data" "github.com/mumax/3/util" ) // Add uniaxial magnetocrystalline anisotropy field to Beff. // see uniaxialanisotropy.cu func AddCubicAnisotropy2(Beff, m *data.Slice, Msat, k1, k2, k3, c1, c2 MSlice) { util.Argument(Beff.Size() == m.Size()) N := Beff.Len() cfg := make1DConf(N) k_addcubicanisotropy2_async( Beff.DevPtr(X), Beff.DevPtr(Y), Beff.DevPtr(Z), m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z), Msat.DevPtr(0), Msat.Mul(0), k1.DevPtr(0), k1.Mul(0), k2.DevPtr(0), k2.Mul(0), k3.DevPtr(0), k3.Mul(0), c1.DevPtr(X), c1.Mul(X), c1.DevPtr(Y), c1.Mul(Y), c1.DevPtr(Z), c1.Mul(Z), c2.DevPtr(X), c2.Mul(X), c2.DevPtr(Y), c2.Mul(Y), c2.DevPtr(Z), c2.Mul(Z), N, cfg) } // Add uniaxial magnetocrystalline anisotropy field to Beff. 
// see uniaxialanisotropy.cu func AddUniaxialAnisotropy2(Beff, m *data.Slice, Msat, k1, k2, u MSlice) { util.Argument(Beff.Size() == m.Size()) checkSize(Beff, m, k1, k2, u, Msat) N := Beff.Len() cfg := make1DConf(N) k_adduniaxialanisotropy2_async( Beff.DevPtr(X), Beff.DevPtr(Y), Beff.DevPtr(Z), m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z), Msat.DevPtr(0), Msat.Mul(0), k1.DevPtr(0), k1.Mul(0), k2.DevPtr(0), k2.Mul(0), u.DevPtr(X), u.Mul(X), u.DevPtr(Y), u.Mul(Y), u.DevPtr(Z), u.Mul(Z), N, cfg) } mumax3-3.10/cuda/atomicf.h000066400000000000000000000002741371432437400153710ustar00rootroot00000000000000#ifndef _ATOMICF_H_ #define _ATOMICF_H_ // Atomic max of abs value. inline __device__ void atomicFmaxabs(float* a, float b){ b = fabs(b); atomicMax((int*)(a), *((int*)(&b))); } #endif mumax3-3.10/cuda/buffer.go000066400000000000000000000044161371432437400154000ustar00rootroot00000000000000package cuda // Pool of re-usable GPU buffers. // Synchronization subtlety: // async kernel launches mean a buffer may already be recycled when still in use. // That should be fine since the next launch run in the same stream (0), and will // effectively wait for the previous operation on the buffer. import ( "log" "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/data" ) var ( buf_pool = make(map[int][]unsafe.Pointer) // pool of GPU buffers indexed by size buf_check = make(map[unsafe.Pointer]struct{}) // checks if pointer originates here to avoid unintended recycle ) const buf_max = 100 // maximum number of buffers to allocate (detect memory leak early) // Returns a GPU slice for temporary use. 
To be returned to the pool with Recycle func Buffer(nComp int, size [3]int) *data.Slice { if Synchronous { Sync() } ptrs := make([]unsafe.Pointer, nComp) // re-use as many buffers as possible form our stack N := prod(size) pool := buf_pool[N] nFromPool := iMin(nComp, len(pool)) for i := 0; i < nFromPool; i++ { ptrs[i] = pool[len(pool)-i-1] } buf_pool[N] = pool[:len(pool)-nFromPool] // allocate as much new memory as needed for i := nFromPool; i < nComp; i++ { if len(buf_check) >= buf_max { log.Panic("too many buffers in use, possible memory leak") } ptrs[i] = MemAlloc(int64(cu.SIZEOF_FLOAT32 * N)) buf_check[ptrs[i]] = struct{}{} // mark this pointer as mine } return data.SliceFromPtrs(size, data.GPUMemory, ptrs) } // Returns a buffer obtained from GetBuffer to the pool. func Recycle(s *data.Slice) { if Synchronous { Sync() } N := s.Len() pool := buf_pool[N] // put each component buffer back on the stack for i := 0; i < s.NComp(); i++ { ptr := s.DevPtr(i) if ptr == unsafe.Pointer(uintptr(0)) { continue } if _, ok := buf_check[ptr]; !ok { log.Panic("recyle: was not obtained with getbuffer") } pool = append(pool, ptr) } s.Disable() // make it unusable, protect against accidental use after recycle buf_pool[N] = pool } // Frees all buffers. Called after mesh resize. 
func FreeBuffers() { Sync() for _, size := range buf_pool { for i := range size { cu.DevicePtr(uintptr(size[i])).Free() size[i] = nil } } buf_pool = make(map[int][]unsafe.Pointer) buf_check = make(map[unsafe.Pointer]struct{}) } mumax3-3.10/cuda/buffer_test.go000066400000000000000000000010561371432437400164340ustar00rootroot00000000000000package cuda import "testing" // In case of memory leak, this will crash func TestBuffer(t *testing.T) { m1 := [3]int{2, 1024, 2048} m2 := [3]int{4, 1024, 2048} a := Buffer(3, m1) b := Buffer(3, m2) c := Buffer(1, m1) d := Buffer(2, m2) Recycle(a) Recycle(b) Recycle(c) Recycle(d) for i := 0; i < 10000; i++ { b := Buffer(3, m2) Recycle(b) } } func BenchmarkBuffer(b *testing.B) { b.StopTimer() m := [3]int{2, 1024, 2048} a := Buffer(3, m) Recycle(a) b.StartTimer() for i := 0; i < b.N; i++ { a := Buffer(3, m) Recycle(a) } } mumax3-3.10/cuda/bytes.go000066400000000000000000000034061371432437400152530ustar00rootroot00000000000000package cuda // This file provides GPU byte slices, used to store regions. import ( "log" "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/util" ) // 3D byte slice, used for region lookup. type Bytes struct { Ptr unsafe.Pointer Len int } // Construct new byte slice with given length, // initialised to zeros. func NewBytes(Len int) *Bytes { ptr := cu.MemAlloc(int64(Len)) cu.MemsetD8(cu.DevicePtr(ptr), 0, int64(Len)) return &Bytes{unsafe.Pointer(uintptr(ptr)), Len} } // Upload src (host) to dst (gpu). func (dst *Bytes) Upload(src []byte) { util.Argument(dst.Len == len(src)) MemCpyHtoD(dst.Ptr, unsafe.Pointer(&src[0]), int64(dst.Len)) } // Copy on device: dst = src. func (dst *Bytes) Copy(src *Bytes) { util.Argument(dst.Len == src.Len) MemCpy(dst.Ptr, src.Ptr, int64(dst.Len)) } // Copy to host: dst = src. func (src *Bytes) Download(dst []byte) { util.Argument(src.Len == len(dst)) MemCpyDtoH(unsafe.Pointer(&dst[0]), src.Ptr, int64(src.Len)) } // Set one element to value. 
// data.Index can be used to find the index for x,y,z. func (dst *Bytes) Set(index int, value byte) { if index < 0 || index >= dst.Len { log.Panic("Bytes.Set: index out of range:", index) } src := value MemCpyHtoD(unsafe.Pointer(uintptr(dst.Ptr)+uintptr(index)), unsafe.Pointer(&src), 1) } // Get one element. // data.Index can be used to find the index for x,y,z. func (src *Bytes) Get(index int) byte { if index < 0 || index >= src.Len { log.Panic("Bytes.Set: index out of range:", index) } var dst byte MemCpyDtoH(unsafe.Pointer(&dst), unsafe.Pointer(uintptr(src.Ptr)+uintptr(index)), 1) return dst } // Frees the GPU memory and disables the slice. func (b *Bytes) Free() { if b.Ptr != nil { cu.MemFree(cu.DevicePtr(uintptr(b.Ptr))) } b.Ptr = nil b.Len = 0 } mumax3-3.10/cuda/constants.h000066400000000000000000000006061371432437400157620ustar00rootroot00000000000000#ifndef _CONSTANTS_H_ #define _CONSTANTS_H_ #define PI 3.1415926535897932384626433 #define MU0 (4*PI*1e-7) // Permeability of vacuum in Tm/A #define QE 1.60217646E-19 // Electron charge in C #define MUB 9.2740091523E-24 // Bohr magneton in J/T #define GAMMA0 1.7595e11 // Gyromagnetic ratio of electron, in rad/Ts #define HBAR 1.05457173E-34 #endif mumax3-3.10/cuda/conv_common.go000066400000000000000000000032311371432437400164360ustar00rootroot00000000000000package cuda // common code for all convolutions. import ( "log" "math" "github.com/mumax/3/data" "github.com/mumax/3/util" ) // Output size of R2C FFT with given logic size, expressed in floats. func fftR2COutputSizeFloats(logicSize [3]int) [3]int { return [3]int{2 * (logicSize[X]/2 + 1), logicSize[Y], logicSize[Z]} } // product of elements func prod(size [3]int) int { return size[X] * size[Y] * size[Z] } // Extract real parts, copy them from src to dst. // In the meanwhile, check if imaginary parts are nearly zero // and scale the kernel to compensate for unnormalized FFTs. // scale = 1/N, with N the FFT logical size. 
func scaleRealParts(dst, src *data.Slice, scale float32) { util.Argument(2*dst.Len() == src.Len()) util.Argument(dst.NComp() == 1 && src.NComp() == 1) srcList := src.Host()[0] dstList := dst.Host()[0] // Normally, the FFT'ed kernel is purely real because of symmetry, // so we only store the real parts... maximg := float32(0.) for i := 0; i < src.Len()/2; i++ { dstList[i] = srcList[2*i] * scale if fabs(srcList[2*i+1]) > maximg { maximg = fabs(srcList[2*i+1]) } } maximg *= float32(math.Sqrt(float64(scale))) // after 1 FFT, normalization is sqrt(N) // ...however, we check that the imaginary parts are nearly zero, // just to be sure we did not make a mistake during kernel creation. if maximg > FFT_IMAG_TOLERANCE { log.Fatalf("FFT kernel imaginary part: %v\n", maximg) } } // Maximum tolerable imaginary/real part for demag kernel in Fourier space. Assures kernel has correct symmetry. const FFT_IMAG_TOLERANCE = 1e-6 // float32 absolute value func fabs(x float32) float32 { if x < 0 { return -x } return x } mumax3-3.10/cuda/conv_copypad.go000066400000000000000000000022221371432437400166040ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/data" "github.com/mumax/3/util" ) // Copies src (larger) into dst (smaller). // Used to extract demag field after convolution on padded m. func copyUnPad(dst, src *data.Slice, dstsize, srcsize [3]int) { util.Argument(dst.NComp() == 1 && src.NComp() == 1) util.Argument(dst.Len() == prod(dstsize) && src.Len() == prod(srcsize)) cfg := make3DConf(dstsize) k_copyunpad_async(dst.DevPtr(0), dstsize[X], dstsize[Y], dstsize[Z], src.DevPtr(0), srcsize[X], srcsize[Y], srcsize[Z], cfg) } // Copies src into dst, which is larger, and multiplies by vol*Bsat. // The remainder of dst is not filled with zeros. // Used to zero-pad magnetization before convolution and in the meanwhile multiply m by its length. 
func copyPadMul(dst, src, vol *data.Slice, dstsize, srcsize [3]int, Msat MSlice) { util.Argument(dst.NComp() == 1 && src.NComp() == 1) util.Assert(dst.Len() == prod(dstsize) && src.Len() == prod(srcsize)) cfg := make3DConf(srcsize) k_copypadmul2_async(dst.DevPtr(0), dstsize[X], dstsize[Y], dstsize[Z], src.DevPtr(0), srcsize[X], srcsize[Y], srcsize[Z], Msat.DevPtr(0), Msat.Mul(0), vol.DevPtr(0), cfg) } mumax3-3.10/cuda/conv_demag.go000066400000000000000000000137441371432437400162350ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/data" "github.com/mumax/3/util" ) // Stores the necessary state to perform FFT-accelerated convolution // with magnetostatic kernel (or other kernel of same symmetry). type DemagConvolution struct { inputSize [3]int // 3D size of the input/output data realKernSize [3]int // Size of kernel and logical FFT size. fftKernLogicSize [3]int // logic size FFTed kernel, real parts only, we store less fftRBuf [3]*data.Slice // FFT input buf; 2D: Z shares storage with X. fftCBuf [3]*data.Slice // FFT output buf; 2D: Z shares storage with X. kern [3][3]*data.Slice // FFT kernel on device fwPlan fft3DR2CPlan // Forward FFT (1 component) bwPlan fft3DC2RPlan // Backward FFT (1 component) } // Initializes a convolution to evaluate the demag field for the given mesh geometry. // Sanity-checked if test == true (slow-ish for large meshes). func NewDemag(inputSize, PBC [3]int, kernel [3][3]*data.Slice, test bool) *DemagConvolution { c := new(DemagConvolution) c.inputSize = inputSize c.realKernSize = kernel[X][X].Size() c.init(kernel) if test { testConvolution(c, PBC, kernel) } return c } // Calculate the demag field of m * vol * Bsat, store result in B. 
// m: magnetization normalized to unit length // vol: unitless mask used to scale m's length, may be nil // Bsat: saturation magnetization in Tesla // B: resulting demag field, in Tesla func (c *DemagConvolution) Exec(B, m, vol *data.Slice, Msat MSlice) { util.Argument(B.Size() == c.inputSize && m.Size() == c.inputSize) if c.is2D() { c.exec2D(B, m, vol, Msat) } else { c.exec3D(B, m, vol, Msat) } } func (c *DemagConvolution) exec3D(outp, inp, vol *data.Slice, Msat MSlice) { for i := 0; i < 3; i++ { // FW FFT c.fwFFT(i, inp, vol, Msat) } // kern mul kernMulRSymm3D_async(c.fftCBuf, c.kern[X][X], c.kern[Y][Y], c.kern[Z][Z], c.kern[Y][Z], c.kern[X][Z], c.kern[X][Y], c.fftKernLogicSize[X], c.fftKernLogicSize[Y], c.fftKernLogicSize[Z]) for i := 0; i < 3; i++ { // BW FFT c.bwFFT(i, outp) } } func (c *DemagConvolution) exec2D(outp, inp, vol *data.Slice, Msat MSlice) { // Convolution is separated into // a 1D convolution for z and a 2D convolution for xy. // So only 2 FFT buffers are needed at the same time. 
Nx, Ny := c.fftKernLogicSize[X], c.fftKernLogicSize[Y] // Z c.fwFFT(Z, inp, vol, Msat) kernMulRSymm2Dz_async(c.fftCBuf[Z], c.kern[Z][Z], Nx, Ny) c.bwFFT(Z, outp) // XY c.fwFFT(X, inp, vol, Msat) c.fwFFT(Y, inp, vol, Msat) kernMulRSymm2Dxy_async(c.fftCBuf[X], c.fftCBuf[Y], c.kern[X][X], c.kern[Y][Y], c.kern[X][Y], Nx, Ny) c.bwFFT(X, outp) c.bwFFT(Y, outp) } func (c *DemagConvolution) is2D() bool { return c.inputSize[Z] == 1 } // zero 1-component slice func zero1_async(dst *data.Slice) { cu.MemsetD32Async(cu.DevicePtr(uintptr(dst.DevPtr(0))), 0, int64(dst.Len()), stream0) } // forward FFT component i func (c *DemagConvolution) fwFFT(i int, inp, vol *data.Slice, Msat MSlice) { zero1_async(c.fftRBuf[i]) in := inp.Comp(i) copyPadMul(c.fftRBuf[i], in, vol, c.realKernSize, c.inputSize, Msat) c.fwPlan.ExecAsync(c.fftRBuf[i], c.fftCBuf[i]) } // backward FFT component i func (c *DemagConvolution) bwFFT(i int, outp *data.Slice) { c.bwPlan.ExecAsync(c.fftCBuf[i], c.fftRBuf[i]) out := outp.Comp(i) copyUnPad(out, c.fftRBuf[i], c.inputSize, c.realKernSize) } func (c *DemagConvolution) init(realKern [3][3]*data.Slice) { // init device buffers // 2D re-uses fftBuf[X] as fftBuf[Z], 3D needs all 3 fftBufs. 
nc := fftR2COutputSizeFloats(c.realKernSize) c.fftCBuf[X] = NewSlice(1, nc) c.fftCBuf[Y] = NewSlice(1, nc) if c.is2D() { c.fftCBuf[Z] = c.fftCBuf[X] } else { c.fftCBuf[Z] = NewSlice(1, nc) } c.fftRBuf[X] = NewSlice(1, c.realKernSize) c.fftRBuf[Y] = NewSlice(1, c.realKernSize) if c.is2D() { c.fftRBuf[Z] = c.fftRBuf[X] } else { c.fftRBuf[Z] = NewSlice(1, c.realKernSize) } // init FFT plans c.fwPlan = newFFT3DR2C(c.realKernSize[X], c.realKernSize[Y], c.realKernSize[Z]) c.bwPlan = newFFT3DC2R(c.realKernSize[X], c.realKernSize[Y], c.realKernSize[Z]) // init FFT kernel // logic size of FFT(kernel): store real parts only c.fftKernLogicSize = fftR2COutputSizeFloats(c.realKernSize) util.Assert(c.fftKernLogicSize[X]%2 == 0) c.fftKernLogicSize[X] /= 2 // physical size of FFT(kernel): store only non-redundant part exploiting Y, Z mirror symmetry // X mirror symmetry already exploited: FFT(kernel) is purely real. physKSize := [3]int{c.fftKernLogicSize[X], c.fftKernLogicSize[Y]/2 + 1, c.fftKernLogicSize[Z]/2 + 1} output := c.fftCBuf[0] input := c.fftRBuf[0] fftKern := data.NewSlice(1, physKSize) kfull := data.NewSlice(1, output.Size()) // not yet exploiting symmetry kfulls := kfull.Scalars() kCSize := physKSize kCSize[X] *= 2 // size of kernel after removing Y,Z redundant parts, but still complex kCmplx := data.NewSlice(1, kCSize) // not yet exploiting X symmetry kc := kCmplx.Scalars() for i := 0; i < 3; i++ { for j := i; j < 3; j++ { // upper triangular part if realKern[i][j] != nil { // ignore 0's // FW FFT data.Copy(input, realKern[i][j]) c.fwPlan.ExecAsync(input, output) data.Copy(kfull, output) // extract non-redundant part (Y,Z symmetry) for iz := 0; iz < kCSize[Z]; iz++ { for iy := 0; iy < kCSize[Y]; iy++ { for ix := 0; ix < kCSize[X]; ix++ { kc[iz][iy][ix] = kfulls[iz][iy][ix] } } } // extract real parts (X symmetry) scaleRealParts(fftKern, kCmplx, 1/float32(c.fwPlan.InputLen())) c.kern[i][j] = GPUCopy(fftKern) } } } } func (c *DemagConvolution) Free() { if c == nil { 
return } c.inputSize = [3]int{} c.realKernSize = [3]int{} for i := 0; i < 3; i++ { c.fftCBuf[i].Free() c.fftRBuf[i].Free() c.fftCBuf[i] = nil c.fftRBuf[i] = nil for j := 0; j < 3; j++ { c.kern[i][j].Free() c.kern[i][j] = nil } c.fwPlan.Free() c.bwPlan.Free() cudaCtx.SetCurrent() } } mumax3-3.10/cuda/conv_kernmul.go000066400000000000000000000034161371432437400166300ustar00rootroot00000000000000package cuda // Kernel multiplication for purely real kernel, symmetric around Y axis (apart from first row). // Launch configs range over all complex elements of fft input. This could be optimized: range only over kernel. import ( "github.com/mumax/3/data" "github.com/mumax/3/util" ) // kernel multiplication for 3D demag convolution, exploiting full kernel symmetry. func kernMulRSymm3D_async(fftM [3]*data.Slice, Kxx, Kyy, Kzz, Kyz, Kxz, Kxy *data.Slice, Nx, Ny, Nz int) { util.Argument(fftM[X].NComp() == 1 && Kxx.NComp() == 1) cfg := make3DConf([3]int{Nx, Ny, Nz}) k_kernmulRSymm3D_async(fftM[X].DevPtr(0), fftM[Y].DevPtr(0), fftM[Z].DevPtr(0), Kxx.DevPtr(0), Kyy.DevPtr(0), Kzz.DevPtr(0), Kyz.DevPtr(0), Kxz.DevPtr(0), Kxy.DevPtr(0), Nx, Ny, Nz, cfg) } // kernel multiplication for 2D demag convolution on X and Y, exploiting full kernel symmetry. func kernMulRSymm2Dxy_async(fftMx, fftMy, Kxx, Kyy, Kxy *data.Slice, Nx, Ny int) { util.Argument(fftMy.NComp() == 1 && Kxx.NComp() == 1) cfg := make3DConf([3]int{Nx, Ny, 1}) k_kernmulRSymm2Dxy_async(fftMx.DevPtr(0), fftMy.DevPtr(0), Kxx.DevPtr(0), Kyy.DevPtr(0), Kxy.DevPtr(0), Nx, Ny, cfg) } // kernel multiplication for 2D demag convolution on Z, exploiting full kernel symmetry. func kernMulRSymm2Dz_async(fftMz, Kzz *data.Slice, Nx, Ny int) { util.Argument(fftMz.NComp() == 1 && Kzz.NComp() == 1) cfg := make3DConf([3]int{Nx, Ny, 1}) k_kernmulRSymm2Dz_async(fftMz.DevPtr(0), Kzz.DevPtr(0), Nx, Ny, cfg) } // kernel multiplication for general 1D convolution. Does not assume any symmetry. // Used for MFM images. 
func kernMulC_async(fftM, K *data.Slice, Nx, Ny int) { util.Argument(fftM.NComp() == 1 && K.NComp() == 1) cfg := make3DConf([3]int{Nx, Ny, 1}) k_kernmulC_async(fftM.DevPtr(0), K.DevPtr(0), Nx, Ny, cfg) } mumax3-3.10/cuda/conv_mfm.go000066400000000000000000000056541371432437400157400ustar00rootroot00000000000000package cuda // Generation of Magnetic Force Microscopy images. import ( "github.com/mumax/3/data" "github.com/mumax/3/mag" ) // Stores the necessary state to perform FFT-accelerated convolution type MFMConvolution struct { size [3]int // 3D size of the input/output data kernSize [3]int // Size of kernel and logical FFT size. fftKernSize [3]int // fftRBuf *data.Slice // FFT input buf for FFT, shares storage with fftCBuf. fftCBuf *data.Slice // FFT output buf, shares storage with fftRBuf gpuFFTKern [3]*data.Slice // FFT kernel on device fwPlan fft3DR2CPlan // Forward FFT (1 component) bwPlan fft3DC2RPlan // Backward FFT (1 component) kern [3]*data.Slice // Real-space kernel (host) mesh *data.Mesh } func (c *MFMConvolution) Free() { if c == nil { return } c.size = [3]int{} c.kernSize = [3]int{} c.fftCBuf.Free() // shared with fftRbuf c.fftCBuf = nil c.fftRBuf = nil for j := 0; j < 3; j++ { c.gpuFFTKern[j].Free() c.gpuFFTKern[j] = nil c.kern[j] = nil } c.fwPlan.Free() c.bwPlan.Free() cudaCtx.SetCurrent() } func (c *MFMConvolution) init() { // init FFT plans padded := c.kernSize c.fwPlan = newFFT3DR2C(padded[X], padded[Y], padded[Z]) c.bwPlan = newFFT3DC2R(padded[X], padded[Y], padded[Z]) // init device buffers nc := fftR2COutputSizeFloats(c.kernSize) c.fftCBuf = NewSlice(1, nc) c.fftRBuf = NewSlice(1, c.kernSize) c.gpuFFTKern[X] = NewSlice(1, nc) c.gpuFFTKern[Y] = NewSlice(1, nc) c.gpuFFTKern[Z] = NewSlice(1, nc) c.initFFTKern3D() } func (c *MFMConvolution) initFFTKern3D() { c.fftKernSize = fftR2COutputSizeFloats(c.kernSize) for i := 0; i < 3; i++ { zero1_async(c.fftRBuf) data.Copy(c.fftRBuf, c.kern[i]) c.fwPlan.ExecAsync(c.fftRBuf, c.fftCBuf) scale := 2 / 
float32(c.fwPlan.InputLen()) // ?? zero1_async(c.gpuFFTKern[i]) Madd2(c.gpuFFTKern[i], c.gpuFFTKern[i], c.fftCBuf, 0, scale) } } // store MFM image in output, based on magnetization in inp. func (c *MFMConvolution) Exec(outp, inp, vol *data.Slice, Msat MSlice) { for i := 0; i < 3; i++ { zero1_async(c.fftRBuf) copyPadMul(c.fftRBuf, inp.Comp(i), vol, c.kernSize, c.size, Msat) c.fwPlan.ExecAsync(c.fftRBuf, c.fftCBuf) Nx, Ny := c.fftKernSize[X]/2, c.fftKernSize[Y] // ?? kernMulC_async(c.fftCBuf, c.gpuFFTKern[i], Nx, Ny) c.bwPlan.ExecAsync(c.fftCBuf, c.fftRBuf) copyUnPad(outp.Comp(i), c.fftRBuf, c.size, c.kernSize) } } func (c *MFMConvolution) Reinit(lift, tipsize float64, cachedir string) { c.kern = mag.MFMKernel(c.mesh, lift, tipsize, cachedir) c.initFFTKern3D() } // Initializes a convolution to evaluate the demag field for the given mesh geometry. func NewMFM(mesh *data.Mesh, lift, tipsize float64, cachedir string) *MFMConvolution { k := mag.MFMKernel(mesh, lift, tipsize, cachedir) size := mesh.Size() c := new(MFMConvolution) c.size = size c.kern = k c.kernSize = k[X].Size() c.init() c.mesh = mesh return c } mumax3-3.10/cuda/conv_selftest.go000066400000000000000000000072611371432437400170060ustar00rootroot00000000000000package cuda // Convolution self-test, performed once at the start of each simulation import ( "math/rand" "github.com/mumax/3/data" "github.com/mumax/3/util" ) // Compares FFT-accelerated convolution against brute-force on sparse data. // This is not really needed but very quickly uncovers newly introduced bugs. func testConvolution(c *DemagConvolution, PBC [3]int, realKern [3][3]*data.Slice) { if PBC != [3]int{0, 0, 0} { // the brute-force method does not work for pbc. 
util.Log("skipping convolution self-test for PBC") return } util.Log("//convolution self-test...") inhost := data.NewSlice(3, c.inputSize) initConvTestInput(inhost.Vectors()) gpu := NewSlice(3, c.inputSize) defer gpu.Free() data.Copy(gpu, inhost) Msat := NewSlice(1, [3]int{1, 1, 256}) defer Msat.Free() Memset(Msat, 1) vol := data.NilSlice(1, c.inputSize) c.Exec(gpu, gpu, vol, ToMSlice(Msat)) output := gpu.HostCopy() brute := data.NewSlice(3, c.inputSize) bruteConv(inhost.Vectors(), brute.Vectors(), realKern) a, b := output.Host(), brute.Host() err := float32(0) for c := range a { for i := range a[c] { if fabs(a[c][i]-b[c][i]) > err { err = fabs(a[c][i] - b[c][i]) } } } if err > CONV_TOLERANCE { util.Fatal("convolution self-test tolerance: ", err, " FAIL") } } // Maximum tolerable error on demag convolution self-test. const CONV_TOLERANCE = 1e-6 // Brute-force O(N²) vector convolution on CPU. // Used to verify GPU FFT convolution. // Input better be sparse. // A nil kernel element is interpreted as all 0s. // Kernel indices are destination index, source index. 
// (O0) (K01 K02 K03) (I0) // (O1) = (K11 K12 K13) * (I1) // (O2) (K21 K22 K23) (I2) func bruteConv(in, out [3][][][]float32, kernel [3][3]*data.Slice) { var kern [3][3][][][]float32 for i := range kern { for j := range kern[i] { if kernel[i][j] != nil { kern[i][j] = kernel[i][j].Scalars() } } } size := sizeOf(in[0]) ksize := sizeOf(kern[0][0]) // Zero output first for c := 0; c < 3; c++ { for iz := 0; iz < size[Z]; iz++ { for iy := 0; iy < size[Y]; iy++ { for ix := 0; ix < size[X]; ix++ { out[c][iz][iy][ix] = 0 } } } } for sc := 0; sc < 3; sc++ { for sz := 0; sz < size[Z]; sz++ { for sy := 0; sy < size[Y]; sy++ { for sx := 0; sx < size[X]; sx++ { if in[sc][sz][sy][sx] == 0 { continue // skip zero source } for dc := 0; dc < 3; dc++ { // dest component if kern[dc][sc] == nil { continue // skip zero kernel } for dz := 0; dz < size[Z]; dz++ { k := wrap(dz-sz, ksize[Z]) for dy := 0; dy < size[Y]; dy++ { j := wrap(dy-sy, ksize[Y]) for dx := 0; dx < size[X]; dx++ { i := wrap(dx-sx, ksize[X]) out[dc][dz][dy][dx] += in[sc][sz][sy][sx] * kern[dc][sc][k][j][i] } } } } } } } } } // Wraps an index to [0, max] (python-like modulus) func wrap(number, max int) int { for number < 0 { number += max } for number >= max { number -= max } return number } // generate sparse input data for testing the convolution. 
func initConvTestInput(input [3][][][]float32) { rng := rand.New(rand.NewSource(0)) // reproducible tests size := sizeOf(input[0]) Nx, Ny, Nz := size[X], size[Y], size[Z] ixs := [...]int{0, Nx / 5, Nx / 2, Nx - 1} iys := [...]int{0, Ny / 7, Ny / 2, Ny - 1} izs := [...]int{0, Nz / 11, Nz / 2, Nz - 1} for c := range input { for _, i := range izs { for _, j := range iys { for _, k := range ixs { input[c][i][j][k] = 1 - 2*rng.Float32() } } } } } // Returns the x, y, z size of block func sizeOf(block [][][]float32) [3]int { return [3]int{len(block[0][0]), len(block[0]), len(block)} } mumax3-3.10/cuda/copypadmul2.cu000066400000000000000000000014761371432437400163730ustar00rootroot00000000000000#include "amul.h" #include "constants.h" #include "stencil.h" #include // Copy src (size S, smaller) into dst (size D, larger), // and multiply by Bsat * vol extern "C" __global__ void copypadmul2(float* __restrict__ dst, int Dx, int Dy, int Dz, float* __restrict__ src, int Sx, int Sy, int Sz, float* __restrict__ Ms_, float Ms_mul, float* __restrict__ vol) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int iz = blockIdx.z * blockDim.z + threadIdx.z; if (ix; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, 
%r17, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_6; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; BB0_3: setp.eq.s64 %p7, %rd4, 0; mov.f32 %f13, 0f3F800000; @%p7 bra BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.f32 %f13, [%rd10]; BB0_5: cvta.to.global.u64 %rd11, %rd1; cvta.to.global.u64 %rd12, %rd2; mul.wide.s32 %rd13, %r4, 4; add.s64 %rd14, %rd12, %rd13; ld.global.f32 %f8, [%rd14]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f9, %f13; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd11, %rd15; st.global.f32 [%rd16], %f11; BB0_6: ret; } ` copypadmul2_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl copypadmul2 .visible .entry copypadmul2( .param .u64 copypadmul2_param_0, .param .u32 copypadmul2_param_1, .param .u32 copypadmul2_param_2, .param .u32 copypadmul2_param_3, .param .u64 copypadmul2_param_4, .param .u32 copypadmul2_param_5, .param .u32 copypadmul2_param_6, .param .u32 copypadmul2_param_7, .param .u64 copypadmul2_param_8, .param .f32 copypadmul2_param_9, .param .u64 copypadmul2_param_10 ) { .reg .pred %p<8>; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; 
ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_6; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; BB0_3: setp.eq.s64 %p7, %rd4, 0; mov.f32 %f13, 0f3F800000; @%p7 bra BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; BB0_5: cvta.to.global.u64 %rd11, %rd1; cvta.to.global.u64 %rd12, %rd2; mul.wide.s32 %rd13, %r4, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f8, [%rd14]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f9, %f13; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd11, %rd15; st.global.f32 [%rd16], %f11; BB0_6: ret; } ` copypadmul2_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl copypadmul2 .visible .entry copypadmul2( .param .u64 copypadmul2_param_0, .param .u32 copypadmul2_param_1, .param .u32 copypadmul2_param_2, .param .u32 copypadmul2_param_3, .param .u64 copypadmul2_param_4, .param .u32 copypadmul2_param_5, .param .u32 copypadmul2_param_6, .param .u32 copypadmul2_param_7, .param .u64 copypadmul2_param_8, .param .f32 copypadmul2_param_9, .param .u64 copypadmul2_param_10 ) { .reg .pred %p<8>; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, 
[copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_6; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; BB0_3: setp.eq.s64 %p7, %rd4, 0; mov.f32 %f13, 0f3F800000; @%p7 bra BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; BB0_5: cvta.to.global.u64 %rd11, %rd1; cvta.to.global.u64 %rd12, %rd2; mul.wide.s32 %rd13, %r4, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f8, [%rd14]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f9, %f13; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd11, %rd15; st.global.f32 [%rd16], %f11; BB0_6: ret; } ` copypadmul2_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl copypadmul2 .visible .entry copypadmul2( .param .u64 copypadmul2_param_0, .param .u32 copypadmul2_param_1, .param .u32 copypadmul2_param_2, .param .u32 
copypadmul2_param_3, .param .u64 copypadmul2_param_4, .param .u32 copypadmul2_param_5, .param .u32 copypadmul2_param_6, .param .u32 copypadmul2_param_7, .param .u64 copypadmul2_param_8, .param .f32 copypadmul2_param_9, .param .u64 copypadmul2_param_10 ) { .reg .pred %p<8>; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_6; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; BB0_3: setp.eq.s64 %p7, %rd4, 0; mov.f32 %f13, 0f3F800000; @%p7 bra BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; BB0_5: cvta.to.global.u64 %rd11, %rd1; cvta.to.global.u64 %rd12, %rd2; mul.wide.s32 %rd13, %r4, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f8, [%rd14]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f9, %f13; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, 
%r20, %r5, %r1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd11, %rd15; st.global.f32 [%rd16], %f11; BB0_6: ret; } ` copypadmul2_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl copypadmul2 .visible .entry copypadmul2( .param .u64 copypadmul2_param_0, .param .u32 copypadmul2_param_1, .param .u32 copypadmul2_param_2, .param .u32 copypadmul2_param_3, .param .u64 copypadmul2_param_4, .param .u32 copypadmul2_param_5, .param .u32 copypadmul2_param_6, .param .u32 copypadmul2_param_7, .param .u64 copypadmul2_param_8, .param .f32 copypadmul2_param_9, .param .u64 copypadmul2_param_10 ) { .reg .pred %p<8>; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_6; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; BB0_3: setp.eq.s64 %p7, %rd4, 0; mov.f32 %f13, 0f3F800000; @%p7 bra BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; 
BB0_5: cvta.to.global.u64 %rd11, %rd1; cvta.to.global.u64 %rd12, %rd2; mul.wide.s32 %rd13, %r4, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f8, [%rd14]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f9, %f13; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd11, %rd15; st.global.f32 [%rd16], %f11; BB0_6: ret; } ` copypadmul2_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl copypadmul2 .visible .entry copypadmul2( .param .u64 copypadmul2_param_0, .param .u32 copypadmul2_param_1, .param .u32 copypadmul2_param_2, .param .u32 copypadmul2_param_3, .param .u64 copypadmul2_param_4, .param .u32 copypadmul2_param_5, .param .u32 copypadmul2_param_6, .param .u32 copypadmul2_param_7, .param .u64 copypadmul2_param_8, .param .f32 copypadmul2_param_9, .param .u64 copypadmul2_param_10 ) { .reg .pred %p<8>; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_6; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 
0; @%p6 bra BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; BB0_3: setp.eq.s64 %p7, %rd4, 0; mov.f32 %f13, 0f3F800000; @%p7 bra BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; BB0_5: cvta.to.global.u64 %rd11, %rd1; cvta.to.global.u64 %rd12, %rd2; mul.wide.s32 %rd13, %r4, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f8, [%rd14]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f9, %f13; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd11, %rd15; st.global.f32 [%rd16], %f11; BB0_6: ret; } ` copypadmul2_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl copypadmul2 .visible .entry copypadmul2( .param .u64 copypadmul2_param_0, .param .u32 copypadmul2_param_1, .param .u32 copypadmul2_param_2, .param .u32 copypadmul2_param_3, .param .u64 copypadmul2_param_4, .param .u32 copypadmul2_param_5, .param .u32 copypadmul2_param_6, .param .u32 copypadmul2_param_7, .param .u64 copypadmul2_param_8, .param .f32 copypadmul2_param_9, .param .u64 copypadmul2_param_10 ) { .reg .pred %p<8>; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, 
%r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_6; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; BB0_3: setp.eq.s64 %p7, %rd4, 0; mov.f32 %f13, 0f3F800000; @%p7 bra BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; BB0_5: cvta.to.global.u64 %rd11, %rd1; cvta.to.global.u64 %rd12, %rd2; mul.wide.s32 %rd13, %r4, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f8, [%rd14]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f9, %f13; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd11, %rd15; st.global.f32 [%rd16], %f11; BB0_6: ret; } ` copypadmul2_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl copypadmul2 .visible .entry copypadmul2( .param .u64 copypadmul2_param_0, .param .u32 copypadmul2_param_1, .param .u32 copypadmul2_param_2, .param .u32 copypadmul2_param_3, .param .u64 copypadmul2_param_4, .param .u32 copypadmul2_param_5, .param .u32 copypadmul2_param_6, .param .u32 copypadmul2_param_7, .param .u64 copypadmul2_param_8, .param .f32 copypadmul2_param_9, .param .u64 copypadmul2_param_10 ) { .reg .pred %p<8>; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; 
ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_6; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; BB0_3: setp.eq.s64 %p7, %rd4, 0; mov.f32 %f13, 0f3F800000; @%p7 bra BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; BB0_5: cvta.to.global.u64 %rd11, %rd1; cvta.to.global.u64 %rd12, %rd2; mul.wide.s32 %rd13, %r4, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f8, [%rd14]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f9, %f13; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd11, %rd15; st.global.f32 [%rd16], %f11; BB0_6: ret; } ` copypadmul2_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl copypadmul2 .visible .entry copypadmul2( .param .u64 copypadmul2_param_0, .param .u32 copypadmul2_param_1, .param .u32 copypadmul2_param_2, .param .u32 copypadmul2_param_3, .param .u64 copypadmul2_param_4, .param .u32 copypadmul2_param_5, .param .u32 copypadmul2_param_6, .param .u32 copypadmul2_param_7, .param .u64 copypadmul2_param_8, .param .f32 copypadmul2_param_9, .param .u64 
copypadmul2_param_10 ) { .reg .pred %p<8>; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_6; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; BB0_3: setp.eq.s64 %p7, %rd4, 0; mov.f32 %f13, 0f3F800000; @%p7 bra BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; BB0_5: cvta.to.global.u64 %rd11, %rd1; cvta.to.global.u64 %rd12, %rd2; mul.wide.s32 %rd13, %r4, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f8, [%rd14]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f9, %f13; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd11, %rd15; st.global.f32 [%rd16], %f11; BB0_6: ret; } ` copypadmul2_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl copypadmul2 .visible .entry 
copypadmul2( .param .u64 copypadmul2_param_0, .param .u32 copypadmul2_param_1, .param .u32 copypadmul2_param_2, .param .u32 copypadmul2_param_3, .param .u64 copypadmul2_param_4, .param .u32 copypadmul2_param_5, .param .u32 copypadmul2_param_6, .param .u32 copypadmul2_param_7, .param .u64 copypadmul2_param_8, .param .f32 copypadmul2_param_9, .param .u64 copypadmul2_param_10 ) { .reg .pred %p<8>; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_6; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; BB0_3: setp.eq.s64 %p7, %rd4, 0; mov.f32 %f13, 0f3F800000; @%p7 bra BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; BB0_5: cvta.to.global.u64 %rd11, %rd1; cvta.to.global.u64 %rd12, %rd2; mul.wide.s32 %rd13, %r4, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f8, [%rd14]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; 
cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f9, %f13; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd11, %rd15; st.global.f32 [%rd16], %f11; BB0_6: ret; } ` copypadmul2_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl copypadmul2 .visible .entry copypadmul2( .param .u64 copypadmul2_param_0, .param .u32 copypadmul2_param_1, .param .u32 copypadmul2_param_2, .param .u32 copypadmul2_param_3, .param .u64 copypadmul2_param_4, .param .u32 copypadmul2_param_5, .param .u32 copypadmul2_param_6, .param .u32 copypadmul2_param_7, .param .u64 copypadmul2_param_8, .param .f32 copypadmul2_param_9, .param .u64 copypadmul2_param_10 ) { .reg .pred %p<8>; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_6; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; BB0_3: setp.eq.s64 %p7, %rd4, 0; mov.f32 %f13, 0f3F800000; @%p7 
bra BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; BB0_5: cvta.to.global.u64 %rd11, %rd1; cvta.to.global.u64 %rd12, %rd2; mul.wide.s32 %rd13, %r4, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f8, [%rd14]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f9, %f13; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd11, %rd15; st.global.f32 [%rd16], %f11; BB0_6: ret; } ` copypadmul2_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl copypadmul2 .visible .entry copypadmul2( .param .u64 copypadmul2_param_0, .param .u32 copypadmul2_param_1, .param .u32 copypadmul2_param_2, .param .u32 copypadmul2_param_3, .param .u64 copypadmul2_param_4, .param .u32 copypadmul2_param_5, .param .u32 copypadmul2_param_6, .param .u32 copypadmul2_param_7, .param .u64 copypadmul2_param_8, .param .f32 copypadmul2_param_9, .param .u64 copypadmul2_param_10 ) { .reg .pred %p<8>; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; 
or.pred %p5, %p3, %p4; @%p5 bra BB0_6; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; BB0_3: setp.eq.s64 %p7, %rd4, 0; mov.f32 %f13, 0f3F800000; @%p7 bra BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; BB0_5: cvta.to.global.u64 %rd11, %rd1; cvta.to.global.u64 %rd12, %rd2; mul.wide.s32 %rd13, %r4, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f8, [%rd14]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f9, %f13; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd11, %rd15; st.global.f32 [%rd16], %f11; BB0_6: ret; } ` copypadmul2_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl copypadmul2 .visible .entry copypadmul2( .param .u64 copypadmul2_param_0, .param .u32 copypadmul2_param_1, .param .u32 copypadmul2_param_2, .param .u32 copypadmul2_param_3, .param .u64 copypadmul2_param_4, .param .u32 copypadmul2_param_5, .param .u32 copypadmul2_param_6, .param .u32 copypadmul2_param_7, .param .u64 copypadmul2_param_8, .param .f32 copypadmul2_param_9, .param .u64 copypadmul2_param_10 ) { .reg .pred %p<8>; .reg .f32 %f<14>; .reg .b32 %r<22>; .reg .f64 %fd<3>; .reg .b64 %rd<17>; ld.param.u64 %rd1, [copypadmul2_param_0]; ld.param.u32 %r5, [copypadmul2_param_1]; ld.param.u32 %r6, [copypadmul2_param_2]; ld.param.u64 %rd2, [copypadmul2_param_4]; ld.param.u32 %r7, [copypadmul2_param_5]; ld.param.u32 %r8, [copypadmul2_param_6]; ld.param.u32 %r9, [copypadmul2_param_7]; ld.param.u64 %rd3, [copypadmul2_param_8]; ld.param.f32 %f12, [copypadmul2_param_9]; ld.param.u64 %rd4, [copypadmul2_param_10]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, 
%tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r7; setp.ge.s32 %p2, %r2, %r8; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_6; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; setp.eq.s64 %p6, %rd3, 0; @%p6 bra BB0_3; cvta.to.global.u64 %rd5, %rd3; mul.wide.s32 %rd6, %r4, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f6, [%rd7]; mul.f32 %f12, %f6, %f12; BB0_3: setp.eq.s64 %p7, %rd4, 0; mov.f32 %f13, 0f3F800000; @%p7 bra BB0_5; cvta.to.global.u64 %rd8, %rd4; mul.wide.s32 %rd9, %r4, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f13, [%rd10]; BB0_5: cvta.to.global.u64 %rd11, %rd1; cvta.to.global.u64 %rd12, %rd2; mul.wide.s32 %rd13, %r4, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f8, [%rd14]; cvt.f64.f32 %fd1, %f12; mul.f64 %fd2, %fd1, 0d3EB515370F99F6CB; cvt.rn.f32.f64 %f9, %fd2; mul.f32 %f10, %f9, %f13; mul.f32 %f11, %f10, %f8; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd15, %r21, 4; add.s64 %rd16, %rd11, %rd15; st.global.f32 [%rd16], %f11; BB0_6: ret; } ` ) mumax3-3.10/cuda/copyunpad.cu000066400000000000000000000007741371432437400161360ustar00rootroot00000000000000#include "stencil.h" // Copy src (size S, larger) to dst (size D, smaller) extern "C" __global__ void copyunpad(float* __restrict__ dst, int Dx, int Dy, int Dz, float* __restrict__ src, int Sx, int Sy, int Sz) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int iz = blockIdx.z * blockDim.z + threadIdx.z; if (ix; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, 
[copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` copyunpad_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl copyunpad .visible .entry copyunpad( .param .u64 copyunpad_param_0, .param .u32 copyunpad_param_1, .param .u32 copyunpad_param_2, .param .u32 copyunpad_param_3, .param .u64 copyunpad_param_4, .param .u32 copyunpad_param_5, .param .u32 copyunpad_param_6, .param .u32 copyunpad_param_7 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred 
%p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` copyunpad_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl copyunpad .visible .entry copyunpad( .param .u64 copyunpad_param_0, .param .u32 copyunpad_param_1, .param .u32 copyunpad_param_2, .param .u32 copyunpad_param_3, .param .u64 copyunpad_param_4, .param .u32 copyunpad_param_5, .param .u32 copyunpad_param_6, .param .u32 copyunpad_param_7 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` 
copyunpad_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl copyunpad .visible .entry copyunpad( .param .u64 copyunpad_param_0, .param .u32 copyunpad_param_1, .param .u32 copyunpad_param_2, .param .u32 copyunpad_param_3, .param .u64 copyunpad_param_4, .param .u32 copyunpad_param_5, .param .u32 copyunpad_param_6, .param .u32 copyunpad_param_7 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` copyunpad_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl copyunpad .visible .entry copyunpad( .param .u64 copyunpad_param_0, .param .u32 copyunpad_param_1, .param .u32 copyunpad_param_2, .param .u32 copyunpad_param_3, .param .u64 copyunpad_param_4, .param .u32 copyunpad_param_5, .param .u32 copyunpad_param_6, .param .u32 copyunpad_param_7 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, 
[copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` copyunpad_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl copyunpad .visible .entry copyunpad( .param .u64 copyunpad_param_0, .param .u32 copyunpad_param_1, .param .u32 copyunpad_param_2, .param .u32 copyunpad_param_3, .param .u64 copyunpad_param_4, .param .u32 copyunpad_param_5, .param .u32 copyunpad_param_6, .param .u32 copyunpad_param_7 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 
%r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` copyunpad_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl copyunpad .visible .entry copyunpad( .param .u64 copyunpad_param_0, .param .u32 copyunpad_param_1, .param .u32 copyunpad_param_2, .param .u32 copyunpad_param_3, .param .u64 copyunpad_param_4, .param .u32 copyunpad_param_5, .param .u32 copyunpad_param_6, .param .u32 copyunpad_param_7 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, 
%r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` copyunpad_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl copyunpad .visible .entry copyunpad( .param .u64 copyunpad_param_0, .param .u32 copyunpad_param_1, .param .u32 copyunpad_param_2, .param .u32 copyunpad_param_3, .param .u64 copyunpad_param_4, .param .u32 copyunpad_param_5, .param .u32 copyunpad_param_6, .param .u32 copyunpad_param_7 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` copyunpad_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl copyunpad .visible .entry copyunpad( .param .u64 copyunpad_param_0, .param .u32 copyunpad_param_1, .param .u32 copyunpad_param_2, .param .u32 copyunpad_param_3, .param .u64 copyunpad_param_4, .param .u32 copyunpad_param_5, .param .u32 
copyunpad_param_6, .param .u32 copyunpad_param_7 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` copyunpad_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl copyunpad .visible .entry copyunpad( .param .u64 copyunpad_param_0, .param .u32 copyunpad_param_1, .param .u32 copyunpad_param_2, .param .u32 copyunpad_param_3, .param .u64 copyunpad_param_4, .param .u32 copyunpad_param_5, .param .u32 copyunpad_param_6, .param .u32 copyunpad_param_7 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; 
mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` copyunpad_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl copyunpad .visible .entry copyunpad( .param .u64 copyunpad_param_0, .param .u32 copyunpad_param_1, .param .u32 copyunpad_param_2, .param .u32 copyunpad_param_3, .param .u64 copyunpad_param_4, .param .u32 copyunpad_param_5, .param .u32 copyunpad_param_6, .param .u32 copyunpad_param_7 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, 
%r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` copyunpad_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl copyunpad .visible .entry copyunpad( .param .u64 copyunpad_param_0, .param .u32 copyunpad_param_1, .param .u32 copyunpad_param_2, .param .u32 copyunpad_param_3, .param .u64 copyunpad_param_4, .param .u32 copyunpad_param_5, .param .u32 copyunpad_param_6, .param .u32 copyunpad_param_7 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` copyunpad_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl copyunpad .visible .entry copyunpad( .param .u64 copyunpad_param_0, .param .u32 copyunpad_param_1, .param 
.u32 copyunpad_param_2, .param .u32 copyunpad_param_3, .param .u64 copyunpad_param_4, .param .u32 copyunpad_param_5, .param .u32 copyunpad_param_6, .param .u32 copyunpad_param_7 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [copyunpad_param_0]; ld.param.u32 %r4, [copyunpad_param_1]; ld.param.u32 %r5, [copyunpad_param_2]; ld.param.u32 %r8, [copyunpad_param_3]; ld.param.u64 %rd2, [copyunpad_param_4]; ld.param.u32 %r6, [copyunpad_param_5]; ld.param.u32 %r7, [copyunpad_param_6]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r20, %r3, %r5, %r2; mad.lo.s32 %r21, %r20, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` ) mumax3-3.10/cuda/crop.cu000066400000000000000000000007751371432437400151000ustar00rootroot00000000000000#include "stencil.h" // See crop.go extern "C" __global__ void crop(float* __restrict__ dst, int Dx, int Dy, int Dz, float* __restrict__ src, int Sx, int Sy, int Sz, int Offx, int Offy, int Offz) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int iz = blockIdx.z * blockDim.z + threadIdx.z; if (ix; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; 
ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 %r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r13, %r12, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 %r2, %r15, %r16, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r18, %r19, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.f32 %f1, [%rd5]; mad.lo.s32 %r26, %r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` crop_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl crop .visible .entry crop( .param .u64 crop_param_0, .param .u32 crop_param_1, .param .u32 crop_param_2, .param .u32 crop_param_3, .param .u64 crop_param_4, .param .u32 crop_param_5, .param .u32 crop_param_6, .param .u32 crop_param_7, .param .u32 crop_param_8, .param .u32 crop_param_9, .param .u32 crop_param_10 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 %r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r13, 
%r12, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 %r2, %r15, %r16, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r18, %r19, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r26, %r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` crop_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl crop .visible .entry crop( .param .u64 crop_param_0, .param .u32 crop_param_1, .param .u32 crop_param_2, .param .u32 crop_param_3, .param .u64 crop_param_4, .param .u32 crop_param_5, .param .u32 crop_param_6, .param .u32 crop_param_7, .param .u32 crop_param_8, .param .u32 crop_param_9, .param .u32 crop_param_10 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 %r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r13, %r12, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 %r2, %r15, %r16, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r18, %r19, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; 
setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r26, %r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` crop_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl crop .visible .entry crop( .param .u64 crop_param_0, .param .u32 crop_param_1, .param .u32 crop_param_2, .param .u32 crop_param_3, .param .u64 crop_param_4, .param .u32 crop_param_5, .param .u32 crop_param_6, .param .u32 crop_param_7, .param .u32 crop_param_8, .param .u32 crop_param_9, .param .u32 crop_param_10 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 %r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r13, %r12, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 %r2, %r15, %r16, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r18, %r19, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; 
ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r26, %r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` crop_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl crop .visible .entry crop( .param .u64 crop_param_0, .param .u32 crop_param_1, .param .u32 crop_param_2, .param .u32 crop_param_3, .param .u64 crop_param_4, .param .u32 crop_param_5, .param .u32 crop_param_6, .param .u32 crop_param_7, .param .u32 crop_param_8, .param .u32 crop_param_9, .param .u32 crop_param_10 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 %r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r13, %r12, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 %r2, %r15, %r16, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r18, %r19, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r26, %r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` crop_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl 
crop .visible .entry crop( .param .u64 crop_param_0, .param .u32 crop_param_1, .param .u32 crop_param_2, .param .u32 crop_param_3, .param .u64 crop_param_4, .param .u32 crop_param_5, .param .u32 crop_param_6, .param .u32 crop_param_7, .param .u32 crop_param_8, .param .u32 crop_param_9, .param .u32 crop_param_10 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 %r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r13, %r12, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 %r2, %r15, %r16, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r18, %r19, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r26, %r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` crop_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl crop .visible .entry crop( .param .u64 crop_param_0, .param .u32 crop_param_1, .param .u32 crop_param_2, .param .u32 crop_param_3, .param .u64 crop_param_4, .param .u32 crop_param_5, .param .u32 crop_param_6, .param .u32 crop_param_7, .param .u32 crop_param_8, .param .u32 crop_param_9, .param 
.u32 crop_param_10 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 %r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r13, %r12, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 %r2, %r15, %r16, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r18, %r19, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r26, %r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` crop_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl crop .visible .entry crop( .param .u64 crop_param_0, .param .u32 crop_param_1, .param .u32 crop_param_2, .param .u32 crop_param_3, .param .u64 crop_param_4, .param .u32 crop_param_5, .param .u32 crop_param_6, .param .u32 crop_param_7, .param .u32 crop_param_8, .param .u32 crop_param_9, .param .u32 crop_param_10 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, 
[crop_param_5]; ld.param.u32 %r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r13, %r12, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 %r2, %r15, %r16, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r18, %r19, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r26, %r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` crop_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl crop .visible .entry crop( .param .u64 crop_param_0, .param .u32 crop_param_1, .param .u32 crop_param_2, .param .u32 crop_param_3, .param .u64 crop_param_4, .param .u32 crop_param_5, .param .u32 crop_param_6, .param .u32 crop_param_7, .param .u32 crop_param_8, .param .u32 crop_param_9, .param .u32 crop_param_10 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 %r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r13, %r12, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, 
%ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 %r2, %r15, %r16, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r18, %r19, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r26, %r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` crop_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl crop .visible .entry crop( .param .u64 crop_param_0, .param .u32 crop_param_1, .param .u32 crop_param_2, .param .u32 crop_param_3, .param .u64 crop_param_4, .param .u32 crop_param_5, .param .u32 crop_param_6, .param .u32 crop_param_7, .param .u32 crop_param_8, .param .u32 crop_param_9, .param .u32 crop_param_10 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 %r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r13, %r12, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 %r2, %r15, %r16, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r18, %r19, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, 
%p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r26, %r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` crop_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl crop .visible .entry crop( .param .u64 crop_param_0, .param .u32 crop_param_1, .param .u32 crop_param_2, .param .u32 crop_param_3, .param .u64 crop_param_4, .param .u32 crop_param_5, .param .u32 crop_param_6, .param .u32 crop_param_7, .param .u32 crop_param_8, .param .u32 crop_param_9, .param .u32 crop_param_10 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 %r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r13, %r12, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 %r2, %r15, %r16, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r18, %r19, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r26, 
%r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` crop_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl crop .visible .entry crop( .param .u64 crop_param_0, .param .u32 crop_param_1, .param .u32 crop_param_2, .param .u32 crop_param_3, .param .u64 crop_param_4, .param .u32 crop_param_5, .param .u32 crop_param_6, .param .u32 crop_param_7, .param .u32 crop_param_8, .param .u32 crop_param_9, .param .u32 crop_param_10 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 %r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r13, %r12, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 %r2, %r15, %r16, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r18, %r19, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r26, %r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` crop_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl crop .visible .entry crop( .param .u64 
crop_param_0, .param .u32 crop_param_1, .param .u32 crop_param_2, .param .u32 crop_param_3, .param .u64 crop_param_4, .param .u32 crop_param_5, .param .u32 crop_param_6, .param .u32 crop_param_7, .param .u32 crop_param_8, .param .u32 crop_param_9, .param .u32 crop_param_10 ) { .reg .pred %p<6>; .reg .f32 %f<2>; .reg .b32 %r<28>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [crop_param_0]; ld.param.u32 %r4, [crop_param_1]; ld.param.u32 %r5, [crop_param_2]; ld.param.u32 %r11, [crop_param_3]; ld.param.u64 %rd2, [crop_param_4]; ld.param.u32 %r6, [crop_param_5]; ld.param.u32 %r7, [crop_param_6]; ld.param.u32 %r8, [crop_param_8]; ld.param.u32 %r9, [crop_param_9]; ld.param.u32 %r10, [crop_param_10]; mov.u32 %r12, %ctaid.x; mov.u32 %r13, %ntid.x; mov.u32 %r14, %tid.x; mad.lo.s32 %r1, %r13, %r12, %r14; mov.u32 %r15, %ntid.y; mov.u32 %r16, %ctaid.y; mov.u32 %r17, %tid.y; mad.lo.s32 %r2, %r15, %r16, %r17; mov.u32 %r18, %ntid.z; mov.u32 %r19, %ctaid.z; mov.u32 %r20, %tid.z; mad.lo.s32 %r3, %r18, %r19, %r20; setp.ge.s32 %p1, %r1, %r4; setp.ge.s32 %p2, %r2, %r5; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r11; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; add.s32 %r21, %r3, %r10; add.s32 %r22, %r2, %r9; mad.lo.s32 %r23, %r21, %r7, %r22; add.s32 %r24, %r1, %r8; mad.lo.s32 %r25, %r23, %r6, %r24; mul.wide.s32 %rd4, %r25, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; mad.lo.s32 %r26, %r3, %r5, %r2; mad.lo.s32 %r27, %r26, %r4, %r1; cvta.to.global.u64 %rd6, %rd1; mul.wide.s32 %rd7, %r27, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f1; BB0_2: ret; } ` ) mumax3-3.10/cuda/crossproduct.cu000066400000000000000000000011631371432437400166570ustar00rootroot00000000000000#include "float3.h" extern "C" __global__ void crossproduct(float* __restrict__ dstx, float* __restrict__ dsty, float* __restrict__ dstz, float* __restrict__ ax, float* __restrict__ ay, float* __restrict__ az, float* __restrict__ bx, float* __restrict__ by, float* __restrict__ bz, int N) 
{ int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { float3 A = {ax[i], ay[i], az[i]}; float3 B = {bx[i], by[i], bz[i]}; float3 AxB = cross(A, B); dstx[i] = AxB.x; dsty[i] = AxB.y; dstz[i] = AxB.z; } } mumax3-3.10/cuda/crossproduct.go000066400000000000000000000007051371432437400166560ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/data" "github.com/mumax/3/util" ) func CrossProduct(dst, a, b *data.Slice) { util.Argument(dst.NComp() == 3 && a.NComp() == 3 && b.NComp() == 3) util.Argument(dst.Len() == a.Len() && dst.Len() == b.Len()) N := dst.Len() cfg := make1DConf(N) k_crossproduct_async(dst.DevPtr(X), dst.DevPtr(Y), dst.DevPtr(Z), a.DevPtr(X), a.DevPtr(Y), a.DevPtr(Z), b.DevPtr(X), b.DevPtr(Y), b.DevPtr(Z), N, cfg) } mumax3-3.10/cuda/crossproduct_wrapper.go000066400000000000000000001031341371432437400204160ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for crossproduct kernel var crossproduct_code cu.Function // Stores the arguments for crossproduct kernel invocation type crossproduct_args_t struct { arg_dstx unsafe.Pointer arg_dsty unsafe.Pointer arg_dstz unsafe.Pointer arg_ax unsafe.Pointer arg_ay unsafe.Pointer arg_az unsafe.Pointer arg_bx unsafe.Pointer arg_by unsafe.Pointer arg_bz unsafe.Pointer arg_N int argptr [10]unsafe.Pointer sync.Mutex } // Stores the arguments for crossproduct kernel invocation var crossproduct_args crossproduct_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
crossproduct_args.argptr[0] = unsafe.Pointer(&crossproduct_args.arg_dstx) crossproduct_args.argptr[1] = unsafe.Pointer(&crossproduct_args.arg_dsty) crossproduct_args.argptr[2] = unsafe.Pointer(&crossproduct_args.arg_dstz) crossproduct_args.argptr[3] = unsafe.Pointer(&crossproduct_args.arg_ax) crossproduct_args.argptr[4] = unsafe.Pointer(&crossproduct_args.arg_ay) crossproduct_args.argptr[5] = unsafe.Pointer(&crossproduct_args.arg_az) crossproduct_args.argptr[6] = unsafe.Pointer(&crossproduct_args.arg_bx) crossproduct_args.argptr[7] = unsafe.Pointer(&crossproduct_args.arg_by) crossproduct_args.argptr[8] = unsafe.Pointer(&crossproduct_args.arg_bz) crossproduct_args.argptr[9] = unsafe.Pointer(&crossproduct_args.arg_N) } // Wrapper for crossproduct CUDA kernel, asynchronous. func k_crossproduct_async(dstx unsafe.Pointer, dsty unsafe.Pointer, dstz unsafe.Pointer, ax unsafe.Pointer, ay unsafe.Pointer, az unsafe.Pointer, bx unsafe.Pointer, by unsafe.Pointer, bz unsafe.Pointer, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("crossproduct") } crossproduct_args.Lock() defer crossproduct_args.Unlock() if crossproduct_code == 0 { crossproduct_code = fatbinLoad(crossproduct_map, "crossproduct") } crossproduct_args.arg_dstx = dstx crossproduct_args.arg_dsty = dsty crossproduct_args.arg_dstz = dstz crossproduct_args.arg_ax = ax crossproduct_args.arg_ay = ay crossproduct_args.arg_az = az crossproduct_args.arg_bx = bx crossproduct_args.arg_by = by crossproduct_args.arg_bz = bz crossproduct_args.arg_N = N args := crossproduct_args.argptr[:] cu.LaunchKernel(crossproduct_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("crossproduct") } } // maps compute capability on PTX code for crossproduct kernel. 
// crossproduct_map associates a compute capability (e.g. 30 for sm_30) with
// the PTX source of the crossproduct kernel built for that target.
// Key 0 maps to the empty string.
var crossproduct_map = map[int]string{
	0:  "",
	30: crossproduct_ptx_30,
	32: crossproduct_ptx_32,
	35: crossproduct_ptx_35,
	37: crossproduct_ptx_37,
	50: crossproduct_ptx_50,
	52: crossproduct_ptx_52,
	53: crossproduct_ptx_53,
	60: crossproduct_ptx_60,
	61: crossproduct_ptx_61,
	62: crossproduct_ptx_62,
	70: crossproduct_ptx_70,
	72: crossproduct_ptx_72,
	75: crossproduct_ptx_75,
}
%rd21, %rd11; ld.global.f32 %f1, [%rd22]; ld.global.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.f32 %f4, [%rd20]; ld.global.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; BB0_2: ret; } ` crossproduct_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param .u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 crossproduct_param_5, .param .u64 crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; ld.param.u64 %rd2, [crossproduct_param_1]; ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, [crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, [crossproduct_param_7]; ld.param.u64 %rd9, [crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 
%rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; BB0_2: ret; } ` crossproduct_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param .u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 crossproduct_param_5, .param .u64 crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; ld.param.u64 %rd2, [crossproduct_param_1]; ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, [crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, [crossproduct_param_7]; ld.param.u64 %rd9, [crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 
%r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; BB0_2: ret; } ` crossproduct_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param .u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 crossproduct_param_5, .param .u64 crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; ld.param.u64 %rd2, [crossproduct_param_1]; ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, 
[crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, [crossproduct_param_7]; ld.param.u64 %rd9, [crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; BB0_2: ret; } ` crossproduct_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param .u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 crossproduct_param_5, .param .u64 crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; 
.reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; ld.param.u64 %rd2, [crossproduct_param_1]; ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, [crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, [crossproduct_param_7]; ld.param.u64 %rd9, [crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; BB0_2: ret; } ` crossproduct_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param 
.u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 crossproduct_param_5, .param .u64 crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; ld.param.u64 %rd2, [crossproduct_param_1]; ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, [crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, [crossproduct_param_7]; ld.param.u64 %rd9, [crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; 
st.global.f32 [%rd28], %f15; BB0_2: ret; } ` crossproduct_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param .u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 crossproduct_param_5, .param .u64 crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; ld.param.u64 %rd2, [crossproduct_param_1]; ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, [crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, [crossproduct_param_7]; ld.param.u64 %rd9, [crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; 
sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; BB0_2: ret; } ` crossproduct_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param .u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 crossproduct_param_5, .param .u64 crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; ld.param.u64 %rd2, [crossproduct_param_1]; ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, [crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, [crossproduct_param_7]; ld.param.u64 %rd9, [crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, 
[%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; BB0_2: ret; } ` crossproduct_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param .u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 crossproduct_param_5, .param .u64 crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; ld.param.u64 %rd2, [crossproduct_param_1]; ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, [crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, [crossproduct_param_7]; ld.param.u64 %rd9, [crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 
%rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; BB0_2: ret; } ` crossproduct_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param .u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 crossproduct_param_5, .param .u64 crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; ld.param.u64 %rd2, [crossproduct_param_1]; ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, [crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, [crossproduct_param_7]; ld.param.u64 %rd9, [crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; 
setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; BB0_2: ret; } ` crossproduct_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param .u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 crossproduct_param_5, .param .u64 crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; ld.param.u64 %rd2, [crossproduct_param_1]; ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, [crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, 
[crossproduct_param_7]; ld.param.u64 %rd9, [crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; BB0_2: ret; } ` crossproduct_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param .u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 crossproduct_param_5, .param .u64 crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; 
ld.param.u64 %rd2, [crossproduct_param_1]; ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, [crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, [crossproduct_param_7]; ld.param.u64 %rd9, [crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; BB0_2: ret; } ` crossproduct_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl crossproduct .visible .entry crossproduct( .param .u64 crossproduct_param_0, .param .u64 crossproduct_param_1, .param .u64 crossproduct_param_2, .param .u64 crossproduct_param_3, .param .u64 crossproduct_param_4, .param .u64 
crossproduct_param_5, .param .u64 crossproduct_param_6, .param .u64 crossproduct_param_7, .param .u64 crossproduct_param_8, .param .u32 crossproduct_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [crossproduct_param_0]; ld.param.u64 %rd2, [crossproduct_param_1]; ld.param.u64 %rd3, [crossproduct_param_2]; ld.param.u64 %rd4, [crossproduct_param_3]; ld.param.u64 %rd5, [crossproduct_param_4]; ld.param.u64 %rd6, [crossproduct_param_5]; ld.param.u64 %rd7, [crossproduct_param_6]; ld.param.u64 %rd8, [crossproduct_param_7]; ld.param.u64 %rd9, [crossproduct_param_8]; ld.param.u32 %r2, [crossproduct_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f7; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f12; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f15; BB0_2: ret; } ` ) 
mumax3-3.10/cuda/cu/000077500000000000000000000000001371432437400142025ustar00rootroot00000000000000mumax3-3.10/cuda/cu/Makefile000066400000000000000000000006351371432437400156460ustar00rootroot00000000000000all: 6g gccgo doc 6g: go install -v go tool vet *.go gofmt -w *.go GCCGO=gccgo -gccgoflags '-static-libgcc -O3' gccgo: go build -v -compiler $(GCCGO) test: 6gtest gccgotest 6gtest: go test gccgotest: go test -compiler $(GCCGO) bench: 6gbench gccgobench 6gbench: go test -bench=. gccgobench: go test -bench=. -compiler $(GCCGO) clean: go clean doc: godoc github.com/barnex/cuda5/cu > README mumax3-3.10/cuda/cu/README000066400000000000000000000551521371432437400150720ustar00rootroot00000000000000PACKAGE DOCUMENTATION package cu import "github.com/barnex/cuda5/cu" Go bindings for the CUDA driver API. CONSTANTS const ( // If the number of contexts > number of CPUs, yield to other OS threads when waiting for the GPU, otherwise CUDA spins on the processor. CTX_SCHED_AUTO = C.CU_CTX_SCHED_AUTO // Spin when waiting for results from the GPU. CTX_SCHED_SPIN = C.CU_CTX_SCHED_SPIN // Yield its thread when waiting for results from the GPU. CTX_SCHED_YIELD = C.CU_CTX_SCHED_YIELD // Block the CPU thread on a synchronization primitive when waiting for the GPU to finish work. CTX_BLOCKING_SYNC // Support mapped pinned allocations. This flag must be set in order to allocate pinned host memory that is accessible to the GPU. CTX_MAP_HOST = C.CU_CTX_MAP_HOST //Do not reduce local memory after resizing local memory for a kernel. CTX_LMEM_RESIZE_TO_MAX = C.CU_CTX_LMEM_RESIZE_TO_MAX ) Flags for CtxCreate const ( SIZEOF_FLOAT32 = 4 SIZEOF_FLOAT64 = 8 SIZEOF_COMPLEX64 = 8 SIZEOF_COMPLEX128 = 16 ) Type size in bytes FUNCTIONS func CtxDestroy(ctx *Context) Destroys the CUDA context specified by ctx. If the context usage count is not equal to 1, or the context is current to any CPU thread other than the current one, this function fails.
Floating contexts (detached from a CPU thread via cuCtxPopCurrent()) may be destroyed by this function. func CtxDisablePeerAccess(peer Context) Reverses CtxEnablePeerAccess(). func CtxEnablePeerAccess(peer Context) Make allocations from the peer Context available to the current context. func CtxGetApiVersion(ctx Context) (version int) Returns the API version to create the context. func CtxSetCurrent(ctx Context) Sets the current active context. func CtxSynchronize() Blocks until the device has completed all preceding requested tasks, if the context was created with the CU_CTX_SCHED_BLOCKING_SYNC flag. func DeviceCanAccessPeer(dev, peer Device) bool Returns true if CtxEnablePeerAccess can be called on a context for dev and peerDev. func DeviceComputeCapability(device Device) (major, minor int) Returns the compute capability of the device. func DeviceGetAttribute(attrib DeviceAttribute, dev Device) int Gets the value of a device attribute. func DeviceGetCount() int Returns the number of devices with compute capability greater than or equal to 1.0 that are available for execution. func DeviceGetName(dev Device) string Gets the name of the device. func DeviceTotalMem(device Device) int64 Returns the total amount of memory available on the device in bytes. func FuncGetAttribute(attrib FunctionAttribute, function Function) int func Init(flags int) Initialize the CUDA driver API. Currently, flags must be 0. If Init() has not been called, any function from the driver API will panic with ERROR_NOT_INITIALIZED. func LaunchKernel(f Function, gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer) func MemAllocHost(bytes int64) unsafe.Pointer func MemFree(p DevicePtr) Frees device memory allocated by MemAlloc(). It is safe to double-free. 
func MemFreeHost(ptr unsafe.Pointer) func MemGetAddressRange(ptr DevicePtr) (bytes int64, base DevicePtr) Returns the base address and size of the allocation (by MemAlloc) that contains the input pointer ptr. func MemGetInfo() (free, total int64) Returns the free and total amount of memory in the current Context (in bytes). func MemHostRegister(ptr unsafe.Pointer, bytes int64, flags MemHostRegisterFlag) Page-locks memory specified by the pointer and bytes. The pointer and byte size must be aligned to the host page size (4KB) See also: MemHostUnregister() func MemHostUnregister(ptr unsafe.Pointer) Unmaps memory locked by MemHostRegister(). func Memcpy(dst, src DevicePtr, bytes int64) Copies a number of bytes on the current device. Requires unified addressing to be supported. See also: MemcpyDtoD(). TODO(a): is actually an auto copy for device and/or host memory func MemcpyAsync(dst, src DevicePtr, bytes int64, stream Stream) Asynchronously copies a number of bytes on the current device. func MemcpyDtoD(dst, src DevicePtr, bytes int64) Copies a number of bytes from device to device. func MemcpyDtoDAsync(dst, src DevicePtr, bytes int64, stream Stream) Asynchronously copies a number of bytes from device to device. func MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, bytes int64) Copies a number of bytes from device to host. func MemcpyDtoHAsync(dst unsafe.Pointer, src DevicePtr, bytes int64, stream Stream) Asynchronously copies a number of bytes from device to host. The host memory must be page-locked (see MemHostRegister) func MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, bytes int64) Copies a number of bytes from host to device. func MemcpyHtoDAsync(dst DevicePtr, src unsafe.Pointer, bytes int64, stream Stream) Asynchronously copies a number of bytes from host to device. The host memory must be page-locked (see MemHostRegister) func MemcpyPeer(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64) Copies from device memory in one context (device) to another.
func MemcpyPeerAsync(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64, stream Stream) Asynchronously copies from device memory in one context (device) to another. func MemsetD32(deviceptr DevicePtr, value uint32, N int64) Sets the first N 32-bit values of dst array to value. Asynchronous. func MemsetD32Async(deviceptr DevicePtr, value uint32, N int64, stream Stream) Asynchronously sets the first N 32-bit values of dst array to value. func MemsetD8(deviceptr DevicePtr, value uint8, N int64) Sets the first N 8-bit values of dst array to value. Asynchronous. func MemsetD8Async(deviceptr DevicePtr, value uint8, N int64, stream Stream) Asynchronously sets the first N 8-bit values of dst array to value. func StreamDestroy(stream *Stream) Destroys an asynchronous stream func StreamSynchronize(stream Stream) Blocks until the stream has completed. func Version() int Returns the CUDA driver version. TYPES type Context uintptr CUDA context. func CtxCreate(flags uint, dev Device) Context Create a CUDA context. func CtxGetCurrent() Context Gets the current active context. func (ctx Context) ApiVersion() (version int) Returns the API version used to create the context. func (ctx *Context) Destroy() Destroys the CUDA context. func (peer Context) DisablePeerAccess() Reverses EnablePeerAccess(). func (peer Context) EnablePeerAccess() Make allocations from the peer Context available to the current context. func (ctx Context) SetCurrent() Sets the current active context. type DevProp struct { MaxThreadsPerBlock int MaxThreadsDim [3]int MaxGridSize [3]int SharedMemPerBlock int TotalConstantMemory int SIMDWidth int MemPitch int RegsPerBlock int ClockRate int TextureAlign int } Device properties func DeviceGetProperties(dev Device) (prop DevProp) Returns the device's properties. type Device int CUDA Device number. func CtxGetDevice() Device Returns the ordinal of the current context's device.
func DeviceGet(ordinal int) Device Returns a device handle given an ordinal in the range [0, DeviceGetCount()-1]. func (dev Device) Attribute(attrib DeviceAttribute) int Gets the value of a device attribute. func (dev Device) CanAccessPeer(peer Device) bool Returns true if CtxEnablePeerAccess can be called on a context for dev and peer. func (device Device) ComputeCapability() (major, minor int) Returns the compute capability of the device. func (dev Device) Name() string Gets the name of the device. func (dev Device) Properties() DevProp Returns the device's properties. func (device Device) TotalMem() int64 Returns the total amount of memory available on the device in bytes. type DeviceAttribute int const ( MAX_THREADS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK // Maximum number of threads per block MAX_BLOCK_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X // Maximum block dimension X MAX_BLOCK_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y // Maximum block dimension Y MAX_BLOCK_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z // Maximum block dimension Z MAX_GRID_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X // Maximum grid dimension X MAX_GRID_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y // Maximum grid dimension Y MAX_GRID_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z // Maximum grid dimension Z MAX_SHARED_MEMORY_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK // Maximum shared memory available per block in bytes TOTAL_CONSTANT_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY // Memory available on device for __constant__ variables in a CUDA C kernel in bytes WARP_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_WARP_SIZE // Warp size in threads MAX_PITCH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_PITCH // Maximum pitch in bytes allowed by memory copies MAX_REGISTERS_PER_BLOCK DeviceAttribute = 
C.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK // Maximum number of 32-bit registers available per block CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CLOCK_RATE // Peak clock frequency in kilohertz TEXTURE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT // Alignment requirement for textures MULTIPROCESSOR_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT // Number of multiprocessors on device KERNEL_EXEC_TIMEOUT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT // Specifies whether there is a run time limit on kernels INTEGRATED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_INTEGRATED // Device is integrated with host memory CAN_MAP_HOST_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY // Device can map host memory into CUDA address space COMPUTE_MODE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE // Compute mode (See ::CUcomputemode for details) MAXIMUM_TEXTURE1D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH // Maximum 1D texture width MAXIMUM_TEXTURE2D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH // Maximum 2D texture width MAXIMUM_TEXTURE2D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT // Maximum 2D texture height MAXIMUM_TEXTURE3D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH // Maximum 3D texture width MAXIMUM_TEXTURE3D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT // Maximum 3D texture height MAXIMUM_TEXTURE3D_DEPTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH // Maximum 3D texture depth MAXIMUM_TEXTURE2D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH // Maximum 2D layered texture width MAXIMUM_TEXTURE2D_LAYERED_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT // Maximum 2D layered texture height MAXIMUM_TEXTURE2D_LAYERED_LAYERS DeviceAttribute = 
C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS // Maximum layers in a 2D layered texture SURFACE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT // Alignment requirement for surfaces CONCURRENT_KERNELS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS // Device can possibly execute multiple kernels concurrently ECC_ENABLED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ECC_ENABLED // Device has ECC support enabled PCI_BUS_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID // PCI bus ID of the device PCI_DEVICE_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID // PCI device ID of the device TCC_DRIVER DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TCC_DRIVER // Device is using TCC driver model MEMORY_CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE // Peak memory clock frequency in kilohertz GLOBAL_MEMORY_BUS_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH // Global memory bus width in bits L2_CACHE_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE // Size of L2 cache in bytes MAX_THREADS_PER_MULTIPROCESSOR DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR // Maximum resident threads per multiprocessor ASYNC_ENGINE_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT // Number of asynchronous engines UNIFIED_ADDRESSING DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING // Device shares a unified address space with the host MAXIMUM_TEXTURE1D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH // Maximum 1D layered texture width MAXIMUM_TEXTURE1D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS // Maximum layers in a 1D layered texture ) type DevicePtr uintptr func MemAlloc(bytes int64) DevicePtr Allocates a number of bytes of device memory. func (ptr DevicePtr) Bytes() (bytes int64) Returns the size of the allocation (by MemAlloc) that contains the input pointer ptr.
func (ptr DevicePtr) Free() Frees device memory allocated by MemAlloc(). Overwrites the pointer with NULL. It is safe to double-free. func (ptr DevicePtr) GetAddressRange() (bytes int64, base DevicePtr) Returns the base address and size of the allocation (by MemAlloc) that contains the input pointer ptr. func (ptr DevicePtr) MemoryType() MemoryType Returns the physical memory type that ptr addresses. func (p DevicePtr) String() string type Dim3 struct { X, Y, Z int } type Function uintptr Represents a CUDA CUfunction, a reference to a function within a module. func ModuleGetFunction(module Module, name string) Function Returns a Function handle. func (f Function) GetAttribute(attrib FunctionAttribute) int type FunctionAttribute int const ( FUNC_A_MAX_THREADS_PER_BLOCK FunctionAttribute = C.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK // The maximum number of threads per block, beyond which a launch of the function would fail. FUNC_A_SHARED_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES // The size in bytes of statically-allocated shared memory required by this function. FUNC_A_CONST_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES // The size in bytes of user-allocated constant memory required by this function. FUNC_A_LOCAL_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES // The size in bytes of local memory used by each thread of this function. FUNC_A_NUM_REGS FunctionAttribute = C.CU_FUNC_ATTRIBUTE_NUM_REGS // The number of registers used by each thread of this function. FUNC_A_PTX_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_PTX_VERSION // The PTX virtual architecture version for which the function was compiled. FUNC_A_BINARY_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_BINARY_VERSION // The binary architecture version for which the function was compiled. ) type MemHostRegisterFlag int const ( // Memory is pinned in all CUDA contexts. 
MEMHOSTREGISTER_PORTABLE MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_PORTABLE // Maps the allocation in CUDA address space. TODO(a): cuMemHostGetDevicePointer() MEMHOSTREGISTER_DEVICEMAP MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_DEVICEMAP ) Flag for MemHostRegister type MemoryType uint Physical memory type of device pointer. const ( MemoryTypeHost MemoryType = C.CU_MEMORYTYPE_HOST MemoryTypeDevice MemoryType = C.CU_MEMORYTYPE_DEVICE MemoryTypeArray MemoryType = C.CU_MEMORYTYPE_ARRAY MemoryTypeUnified MemoryType = C.CU_MEMORYTYPE_UNIFIED ) func PointerGetAttributeMemoryType(ptr DevicePtr) (t MemoryType, err Result) Returns the physical memory type that ptr addresses. func (t MemoryType) String() string type Module uintptr Represents a CUDA CUmodule, a reference to executable device code. func ModuleLoad(fname string) Module Loads a compute module from file func ModuleLoadData(image string) Module Loads a compute module from string func (m Module) GetFunction(name string) Function Returns a Function handle. type Result int CUDA error status. CUDA error statuses are not returned by functions but checked and passed to panic() when not successful. If desired, they can be caught by recover(). 
const ( SUCCESS Result = C.CUDA_SUCCESS ERROR_INVALID_VALUE Result = C.CUDA_ERROR_INVALID_VALUE ERROR_OUT_OF_MEMORY Result = C.CUDA_ERROR_OUT_OF_MEMORY ERROR_NOT_INITIALIZED Result = C.CUDA_ERROR_NOT_INITIALIZED ERROR_DEINITIALIZED Result = C.CUDA_ERROR_DEINITIALIZED ERROR_PROFILER_DISABLED Result = C.CUDA_ERROR_PROFILER_DISABLED ERROR_PROFILER_NOT_INITIALIZED Result = C.CUDA_ERROR_PROFILER_NOT_INITIALIZED ERROR_PROFILER_ALREADY_STARTED Result = C.CUDA_ERROR_PROFILER_ALREADY_STARTED ERROR_PROFILER_ALREADY_STOPPED Result = C.CUDA_ERROR_PROFILER_ALREADY_STOPPED ERROR_NO_DEVICE Result = C.CUDA_ERROR_NO_DEVICE ERROR_INVALID_DEVICE Result = C.CUDA_ERROR_INVALID_DEVICE ERROR_INVALID_IMAGE Result = C.CUDA_ERROR_INVALID_IMAGE ERROR_INVALID_CONTEXT Result = C.CUDA_ERROR_INVALID_CONTEXT ERROR_CONTEXT_ALREADY_CURRENT Result = C.CUDA_ERROR_CONTEXT_ALREADY_CURRENT ERROR_MAP_FAILED Result = C.CUDA_ERROR_MAP_FAILED ERROR_UNMAP_FAILED Result = C.CUDA_ERROR_UNMAP_FAILED ERROR_ARRAY_IS_MAPPED Result = C.CUDA_ERROR_ARRAY_IS_MAPPED ERROR_ALREADY_MAPPED Result = C.CUDA_ERROR_ALREADY_MAPPED ERROR_NO_BINARY_FOR_GPU Result = C.CUDA_ERROR_NO_BINARY_FOR_GPU ERROR_ALREADY_ACQUIRED Result = C.CUDA_ERROR_ALREADY_ACQUIRED ERROR_NOT_MAPPED Result = C.CUDA_ERROR_NOT_MAPPED ERROR_NOT_MAPPED_AS_ARRAY Result = C.CUDA_ERROR_NOT_MAPPED_AS_ARRAY ERROR_NOT_MAPPED_AS_POINTER Result = C.CUDA_ERROR_NOT_MAPPED_AS_POINTER ERROR_ECC_UNCORRECTABLE Result = C.CUDA_ERROR_ECC_UNCORRECTABLE ERROR_UNSUPPORTED_LIMIT Result = C.CUDA_ERROR_UNSUPPORTED_LIMIT ERROR_CONTEXT_ALREADY_IN_USE Result = C.CUDA_ERROR_CONTEXT_ALREADY_IN_USE ERROR_INVALID_SOURCE Result = C.CUDA_ERROR_INVALID_SOURCE ERROR_FILE_NOT_FOUND Result = C.CUDA_ERROR_FILE_NOT_FOUND ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND Result = C.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND ERROR_SHARED_OBJECT_INIT_FAILED Result = C.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED ERROR_OPERATING_SYSTEM Result = C.CUDA_ERROR_OPERATING_SYSTEM ERROR_INVALID_HANDLE Result = 
C.CUDA_ERROR_INVALID_HANDLE ERROR_NOT_FOUND Result = C.CUDA_ERROR_NOT_FOUND ERROR_NOT_READY Result = C.CUDA_ERROR_NOT_READY ERROR_LAUNCH_FAILED Result = C.CUDA_ERROR_LAUNCH_FAILED ERROR_LAUNCH_OUT_OF_RESOURCES Result = C.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES ERROR_LAUNCH_TIMEOUT Result = C.CUDA_ERROR_LAUNCH_TIMEOUT ERROR_LAUNCH_INCOMPATIBLE_TEXTURING Result = C.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING ERROR_PEER_ACCESS_ALREADY_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED ERROR_PEER_ACCESS_NOT_ENABLED Result = C.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED ERROR_PRIMARY_CONTEXT_ACTIVE Result = C.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE ERROR_CONTEXT_IS_DESTROYED Result = C.CUDA_ERROR_CONTEXT_IS_DESTROYED ERROR_ASSERT Result = C.CUDA_ERROR_ASSERT ERROR_TOO_MANY_PEERS Result = C.CUDA_ERROR_TOO_MANY_PEERS ERROR_HOST_MEMORY_ALREADY_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED ERROR_HOST_MEMORY_NOT_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED ERROR_HARDWARE_STACK_ERROR Result = 714 //C.CUDA_ERROR_HARDWARE_STACK_ERROR ERROR_ILLEGAL_INSTRUCTION Result = 715 //C.CUDA_ERROR_ILLEGAL_INSTRUCTION ERROR_MISALIGNED_ADDRESS Result = 716 //C.CUDA_ERROR_MISALIGNED_ADDRESS ERROR_INVALID_ADDRESS_SPACE Result = 717 //C.CUDA_ERROR_INVALID_ADDRESS_SPACE ERROR_INVALID_PC Result = 718 //C.CUDA_ERROR_INVALID_PC ERROR_NOT_PERMITTED Result = 800 //C.CUDA_ERROR_NOT_PERMITTED ERROR_NOT_SUPPORTED Result = 801 //C.CUDA_ERROR_NOT_SUPPORTED ERROR_UNKNOWN Result = C.CUDA_ERROR_UNKNOWN ) func StreamQuery(stream Stream) Result Returns Success if all operations have completed, ErrorNotReady otherwise func (err Result) String() string Message string for the error type Stream uintptr CUDA stream. 
func StreamCreate() Stream Creates an asynchronous stream func (stream *Stream) Destroy() Destroys the asynchronous stream func (stream Stream) Query() Result Returns Success if all operations have completed, ErrorNotReady otherwise func (stream Stream) Synchronize() Blocks until the stream has completed. mumax3-3.10/cuda/cu/cgoflags.go000066400000000000000000000010131371432437400163110ustar00rootroot00000000000000package cu // This file provides CGO flags to find CUDA libraries and headers. //#cgo LDFLAGS:-lcuda // ////default location: //#cgo LDFLAGS:-L/usr/local/cuda/lib64/stubs/ //#cgo CFLAGS: -I/usr/local/cuda/include/ // ////Ubuntu 15.04: //#cgo LDFLAGS:-L/usr/lib/x86_64-linux-gnu/ //#cgo CFLAGS: -I/usr/include // ////arch linux: //#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib //#cgo CFLAGS: -I/opt/cuda/include // ////WINDOWS: //#cgo windows LDFLAGS:-LC:/cuda/lib/x64 //#cgo windows CFLAGS: -IC:/cuda/include import "C" mumax3-3.10/cuda/cu/context.go000066400000000000000000000060241371432437400162170ustar00rootroot00000000000000package cu // This file implements CUDA driver context management //#include import "C" import "unsafe" // CUDA context. type Context uintptr // Create a CUDA context. func CtxCreate(flags uint, dev Device) Context { var ctx C.CUcontext err := Result(C.cuCtxCreate(&ctx, C.uint(flags), C.CUdevice(dev))) if err != SUCCESS { panic(err) } return Context(uintptr(unsafe.Pointer(ctx))) } //Destroys the CUDA context specified by ctx. If the context usage count is not equal to 1, or the context is current to any CPU thread other than the current one, this function fails. Floating contexts (detached from a CPU thread via cuCtxPopCurrent()) may be destroyed by this function. func CtxDestroy(ctx *Context) { err := Result(C.cuCtxDestroy(C.CUcontext(unsafe.Pointer(uintptr(*ctx))))) *ctx = 0 if err != SUCCESS { panic(err) } } //Destroys the CUDA context. 
func (ctx *Context) Destroy() { CtxDestroy(ctx) } // Returns the API version to create the context. func CtxGetApiVersion(ctx Context) (version int) { var cversion C.uint err := Result(C.cuCtxGetApiVersion(C.CUcontext(unsafe.Pointer(uintptr(ctx))), &cversion)) if err != SUCCESS { panic(err) } version = int(cversion) return } // Returns the API version to create the context. func (ctx Context) ApiVersion() (version int) { return CtxGetApiVersion(ctx) } // Gets the current active context. func CtxGetCurrent() Context { var ctx C.CUcontext err := Result(C.cuCtxGetCurrent(&ctx)) if err != SUCCESS { panic(err) } return Context(uintptr(unsafe.Pointer(ctx))) } // Returns the ordinal of the current context's device. func CtxGetDevice() Device { var dev C.CUdevice err := Result(C.cuCtxGetDevice(&dev)) if err != SUCCESS { panic(err) } return Device(dev) } // Sets the current active context. func CtxSetCurrent(ctx Context) { err := Result(C.cuCtxSetCurrent(C.CUcontext(unsafe.Pointer(uintptr(ctx))))) if err != SUCCESS { panic(err) } } // Sets the current active context. func (ctx Context) SetCurrent() { CtxSetCurrent(ctx) } // Blocks until the device has completed all preceding requested tasks, if the context was created with the CU_CTX_SCHED_BLOCKING_SYNC flag. func CtxSynchronize() { err := Result(C.cuCtxSynchronize()) if err != SUCCESS { panic(err) } } // Flags for CtxCreate const ( // If the number of contexts > number of CPUs, yield to other OS threads when waiting for the GPU, otherwise CUDA spin on the processor. CTX_SCHED_AUTO = C.CU_CTX_SCHED_AUTO // Spin when waiting for results from the GPU. CTX_SCHED_SPIN = C.CU_CTX_SCHED_SPIN // Yield its thread when waiting for results from the GPU. CTX_SCHED_YIELD = C.CU_CTX_SCHED_YIELD // Bock the CPU thread on a synchronization primitive when waiting for the GPU to finish work. CTX_BLOCKING_SYNC // Support mapped pinned allocations. This flag must be set in order to allocate pinned host memory that is accessible to the GPU. 
CTX_MAP_HOST = C.CU_CTX_MAP_HOST //Do not reduce local memory after resizing local memory for a kernel. CTX_LMEM_RESIZE_TO_MAX = C.CU_CTX_LMEM_RESIZE_TO_MAX ) mumax3-3.10/cuda/cu/context_test.go000066400000000000000000000012111371432437400172470ustar00rootroot00000000000000package cu import ( "fmt" "testing" ) func TestContext(t *testing.T) { fmt.Println("CtxCreate") ctx := CtxCreate(CTX_SCHED_AUTO, 0) fmt.Println("CtxSetCurrent") CtxSetCurrent(ctx) fmt.Println("CtxGetApiVersion:", ctx.ApiVersion()) fmt.Println("CtxGetDevice:", CtxGetDevice()) (&ctx).Destroy() } func BenchmarkGetContext(b *testing.B) { b.StopTimer() ctx := CtxCreate(CTX_SCHED_AUTO, 0) CtxSetCurrent(ctx) b.StartTimer() for i := 0; i < b.N; i++ { CtxGetCurrent() } } func BenchmarkSetContext(b *testing.B) { b.StopTimer() ctx := CtxCreate(CTX_SCHED_AUTO, 0) b.StartTimer() for i := 0; i < b.N; i++ { ctx.SetCurrent() } } mumax3-3.10/cuda/cu/device.go000066400000000000000000000234161371432437400157760ustar00rootroot00000000000000package cu // This file implements CUDA driver device management //#include import "C" // CUDA Device number. type Device int // Returns the compute capability of the device. func DeviceComputeCapability(device Device) (major, minor int) { major = device.Attribute(COMPUTE_CAPABILITY_MAJOR) minor = device.Attribute(COMPUTE_CAPABILITY_MINOR) return } // Returns the compute capability of the device. func (device Device) ComputeCapability() (major, minor int) { return DeviceComputeCapability(device) } // Returns in a device handle given an ordinal in the range [0, DeviceGetCount()-1]. func DeviceGet(ordinal int) Device { var device C.CUdevice err := Result(C.cuDeviceGet(&device, C.int(ordinal))) if err != SUCCESS { panic(err) } return Device(device) } // Gets the value of a device attribute. 
func DeviceGetAttribute(attrib DeviceAttribute, dev Device) int { var attr C.int err := Result(C.cuDeviceGetAttribute(&attr, C.CUdevice_attribute(attrib), C.CUdevice(dev))) if err != SUCCESS { panic(err) } return int(attr) } // Gets the value of a device attribute. func (dev Device) Attribute(attrib DeviceAttribute) int { return DeviceGetAttribute(attrib, dev) } // Returns the number of devices with compute capability greater than or equal to 1.0 that are available for execution. func DeviceGetCount() int { var count C.int err := Result(C.cuDeviceGetCount(&count)) if err != SUCCESS { panic(err) } return int(count) } // Gets the name of the device. func DeviceGetName(dev Device) string { size := 256 buf := make([]byte, size) cstr := C.CString(string(buf)) err := Result(C.cuDeviceGetName(cstr, C.int(size), C.CUdevice(dev))) if err != SUCCESS { panic(err) } return C.GoString(cstr) } // Gets the name of the device. func (dev Device) Name() string { return DeviceGetName(dev) } // Device properties type DevProp struct { MaxThreadsPerBlock int MaxThreadsDim [3]int MaxGridSize [3]int SharedMemPerBlock int TotalConstantMemory int SIMDWidth int MemPitch int RegsPerBlock int ClockRate int TextureAlign int } // Returns the dev's properties. 
func DeviceGetProperties(dev Device) (prop DevProp) { prop.MaxThreadsPerBlock = dev.Attribute(MAX_THREADS_PER_BLOCK) prop.MaxThreadsDim[0] = dev.Attribute(MAX_BLOCK_DIM_X) prop.MaxThreadsDim[1] = dev.Attribute(MAX_BLOCK_DIM_Y) prop.MaxThreadsDim[2] = dev.Attribute(MAX_BLOCK_DIM_Z) prop.MaxGridSize[0] = dev.Attribute(MAX_GRID_DIM_X) prop.MaxGridSize[1] = dev.Attribute(MAX_GRID_DIM_Y) prop.MaxGridSize[2] = dev.Attribute(MAX_GRID_DIM_Z) prop.SharedMemPerBlock = dev.Attribute(MAX_SHARED_MEMORY_PER_BLOCK) prop.TotalConstantMemory = dev.Attribute(TOTAL_CONSTANT_MEMORY) prop.SIMDWidth = dev.Attribute(WARP_SIZE) prop.MemPitch = dev.Attribute(MAX_PITCH) prop.RegsPerBlock = dev.Attribute(MAX_REGISTERS_PER_BLOCK) prop.ClockRate = dev.Attribute(CLOCK_RATE) prop.TextureAlign = dev.Attribute(TEXTURE_ALIGNMENT) return } // Returns the device's properties. func (dev Device) Properties() DevProp { return DeviceGetProperties(dev) } // Returns the total amount of memory available on the device in bytes. func (device Device) TotalMem() int64 { return DeviceTotalMem(device) } // Returns the total amount of memory available on the device in bytes. 
func DeviceTotalMem(device Device) int64 { var bytes C.size_t err := Result(C.cuDeviceTotalMem(&bytes, C.CUdevice(device))) if err != SUCCESS { panic(err) } return int64(bytes) } type DeviceAttribute int const ( MAX_THREADS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK // Maximum number of threads per block MAX_BLOCK_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X // Maximum block dimension X MAX_BLOCK_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y // Maximum block dimension Y MAX_BLOCK_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z // Maximum block dimension Z MAX_GRID_DIM_X DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X // Maximum grid dimension X MAX_GRID_DIM_Y DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y // Maximum grid dimension Y MAX_GRID_DIM_Z DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z // Maximum grid dimension Z MAX_SHARED_MEMORY_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK // Maximum shared memory available per block in bytes TOTAL_CONSTANT_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY // Memory available on device for __constant__ variables in a CUDA C kernel in bytes WARP_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_WARP_SIZE // Warp size in threads MAX_PITCH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_PITCH // Maximum pitch in bytes allowed by memory copies MAX_REGISTERS_PER_BLOCK DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK // Maximum number of 32-bit registers available per block CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CLOCK_RATE // Peak clock frequency in kilohertz TEXTURE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT // Alignment requirement for textures MULTIPROCESSOR_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT // Number of multiprocessors on device KERNEL_EXEC_TIMEOUT DeviceAttribute = 
C.CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT // Specifies whether there is a run time limit on kernels INTEGRATED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_INTEGRATED // Device is integrated with host memory CAN_MAP_HOST_MEMORY DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY // Device can map host memory into CUDA address space COMPUTE_MODE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE // Compute mode (See ::CUcomputemode for details) MAXIMUM_TEXTURE1D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH // Maximum 1D texture width MAXIMUM_TEXTURE2D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH // Maximum 2D texture width MAXIMUM_TEXTURE2D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT // Maximum 2D texture height MAXIMUM_TEXTURE3D_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH // Maximum 3D texture width MAXIMUM_TEXTURE3D_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT // Maximum 3D texture height MAXIMUM_TEXTURE3D_DEPTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH // Maximum 3D texture depth MAXIMUM_TEXTURE2D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH // Maximum 2D layered texture width MAXIMUM_TEXTURE2D_LAYERED_HEIGHT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT // Maximum 2D layered texture height MAXIMUM_TEXTURE2D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS // Maximum layers in a 2D layered texture SURFACE_ALIGNMENT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT // Alignment requirement for surfaces CONCURRENT_KERNELS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS // Device can possibly execute multiple kernels concurrently ECC_ENABLED DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ECC_ENABLED // Device has ECC support enabled PCI_BUS_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID 
// PCI bus ID of the device PCI_DEVICE_ID DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID // PCI device ID of the device TCC_DRIVER DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_TCC_DRIVER // Device is using TCC driver model MEMORY_CLOCK_RATE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE // Peak memory clock frequency in kilohertz GLOBAL_MEMORY_BUS_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH // Global memory bus width in bits L2_CACHE_SIZE DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE // Size of L2 cache in bytes MAX_THREADS_PER_MULTIPROCESSOR DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR // Maximum resident threads per multiprocessor ASYNC_ENGINE_COUNT DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT // Number of asynchronous engines UNIFIED_ADDRESSING DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING // Device uses shares a unified address space with the host MAXIMUM_TEXTURE1D_LAYERED_WIDTH DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH // Maximum 1D layered texture width MAXIMUM_TEXTURE1D_LAYERED_LAYERS DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS // Maximum layers in a 1D layered texture COMPUTE_CAPABILITY_MAJOR DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR // Major compute capability version number COMPUTE_CAPABILITY_MINOR DeviceAttribute = C.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR // Minor compute capability version number ) mumax3-3.10/cuda/cu/device_test.go000066400000000000000000000106541371432437400170350ustar00rootroot00000000000000package cu import ( "fmt" "testing" ) func TestDevice(t *testing.T) { fmt.Println("DeviceGetCount:", DeviceGetCount()) for i := 0; i < DeviceGetCount(); i++ { fmt.Println("DeviceGet", i) dev := DeviceGet(i) major, minor := dev.ComputeCapability() fmt.Println("Name: ", dev.Name()) fmt.Println("ComputeCapability: ", major, minor) fmt.Println("TotalMem: ", 
dev.TotalMem()) fmt.Println("ATTRIBUTE_MAX_THREADS_PER_BLOCK :", dev.Attribute(MAX_THREADS_PER_BLOCK)) fmt.Println("ATTRIBUTE_MAX_BLOCK_DIM_X :", dev.Attribute(MAX_BLOCK_DIM_X)) fmt.Println("ATTRIBUTE_MAX_BLOCK_DIM_Y :", dev.Attribute(MAX_BLOCK_DIM_Y)) fmt.Println("ATTRIBUTE_MAX_BLOCK_DIM_Z :", dev.Attribute(MAX_BLOCK_DIM_Z)) fmt.Println("ATTRIBUTE_MAX_GRID_DIM_X :", dev.Attribute(MAX_GRID_DIM_X)) fmt.Println("ATTRIBUTE_MAX_GRID_DIM_Y :", dev.Attribute(MAX_GRID_DIM_Y)) fmt.Println("ATTRIBUTE_MAX_GRID_DIM_Z :", dev.Attribute(MAX_GRID_DIM_Z)) fmt.Println("ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK :", dev.Attribute(MAX_SHARED_MEMORY_PER_BLOCK)) fmt.Println("ATTRIBUTE_TOTAL_CONSTANT_MEMORY :", dev.Attribute(TOTAL_CONSTANT_MEMORY)) fmt.Println("ATTRIBUTE_WARP_SIZE :", dev.Attribute(WARP_SIZE)) fmt.Println("ATTRIBUTE_MAX_PITCH :", dev.Attribute(MAX_PITCH)) fmt.Println("ATTRIBUTE_MAX_REGISTERS_PER_BLOCK :", dev.Attribute(MAX_REGISTERS_PER_BLOCK)) fmt.Println("ATTRIBUTE_CLOCK_RATE :", dev.Attribute(CLOCK_RATE)) fmt.Println("ATTRIBUTE_TEXTURE_ALIGNMENT :", dev.Attribute(TEXTURE_ALIGNMENT)) fmt.Println("ATTRIBUTE_MULTIPROCESSOR_COUNT :", dev.Attribute(MULTIPROCESSOR_COUNT)) fmt.Println("ATTRIBUTE_KERNEL_EXEC_TIMEOUT :", dev.Attribute(KERNEL_EXEC_TIMEOUT)) fmt.Println("ATTRIBUTE_INTEGRATED :", dev.Attribute(INTEGRATED)) fmt.Println("ATTRIBUTE_CAN_MAP_HOST_MEMORY :", dev.Attribute(CAN_MAP_HOST_MEMORY)) fmt.Println("ATTRIBUTE_COMPUTE_MODE :", dev.Attribute(COMPUTE_MODE)) fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE1D_WIDTH)) fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE2D_WIDTH)) fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT :", dev.Attribute(MAXIMUM_TEXTURE2D_HEIGHT)) fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE3D_WIDTH)) fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT :", dev.Attribute(MAXIMUM_TEXTURE3D_HEIGHT)) fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH :", 
dev.Attribute(MAXIMUM_TEXTURE3D_DEPTH)) fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE2D_LAYERED_WIDTH)) fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT:", dev.Attribute(MAXIMUM_TEXTURE2D_LAYERED_HEIGHT)) fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS:", dev.Attribute(MAXIMUM_TEXTURE2D_LAYERED_LAYERS)) fmt.Println("ATTRIBUTE_SURFACE_ALIGNMENT :", dev.Attribute(SURFACE_ALIGNMENT)) fmt.Println("ATTRIBUTE_CONCURRENT_KERNELS :", dev.Attribute(CONCURRENT_KERNELS)) fmt.Println("ATTRIBUTE_ECC_ENABLED :", dev.Attribute(ECC_ENABLED)) fmt.Println("ATTRIBUTE_PCI_BUS_ID :", dev.Attribute(PCI_BUS_ID)) fmt.Println("ATTRIBUTE_PCI_DEVICE_ID :", dev.Attribute(PCI_DEVICE_ID)) fmt.Println("ATTRIBUTE_TCC_DRIVER :", dev.Attribute(TCC_DRIVER)) fmt.Println("ATTRIBUTE_MEMORY_CLOCK_RATE :", dev.Attribute(MEMORY_CLOCK_RATE)) fmt.Println("ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH :", dev.Attribute(GLOBAL_MEMORY_BUS_WIDTH)) fmt.Println("ATTRIBUTE_L2_CACHE_SIZE :", dev.Attribute(L2_CACHE_SIZE)) fmt.Println("ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR :", dev.Attribute(MAX_THREADS_PER_MULTIPROCESSOR)) fmt.Println("ATTRIBUTE_ASYNC_ENGINE_COUNT :", dev.Attribute(ASYNC_ENGINE_COUNT)) fmt.Println("ATTRIBUTE_UNIFIED_ADDRESSING :", dev.Attribute(UNIFIED_ADDRESSING)) fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH :", dev.Attribute(MAXIMUM_TEXTURE1D_LAYERED_WIDTH)) fmt.Println("ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS:", dev.Attribute(MAXIMUM_TEXTURE1D_LAYERED_LAYERS)) fmt.Printf("Properties:%#v\n", dev.Properties()) } } mumax3-3.10/cuda/cu/dim3.go000066400000000000000000000000561371432437400153660ustar00rootroot00000000000000package cu type Dim3 struct { X, Y, Z int } mumax3-3.10/cuda/cu/doc.go000066400000000000000000000000631371432437400152750ustar00rootroot00000000000000// Go bindings for the CUDA driver API. 
package cu mumax3-3.10/cuda/cu/execution.go000066400000000000000000000025041371432437400165350ustar00rootroot00000000000000package cu // This file implements execution of CUDA kernels //#include import "C" import ( "unsafe" ) const pointerSize = 8 // sorry, 64 bits only. func LaunchKernel(f Function, gridDimX, gridDimY, gridDimZ int, blockDimX, blockDimY, blockDimZ int, sharedMemBytes int, stream Stream, kernelParams []unsafe.Pointer) { // Since Go 1.6, a cgo argument cannot have a Go pointer to Go pointer, // so we copy the argument values go C memory first. argv := C.malloc(C.size_t(len(kernelParams) * pointerSize)) argp := C.malloc(C.size_t(len(kernelParams) * pointerSize)) defer C.free(argv) defer C.free(argp) for i := range kernelParams { *((*unsafe.Pointer)(offset(argp, i))) = offset(argv, i) // argp[i] = &argv[i] *((*uint64)(offset(argv, i))) = *((*uint64)(kernelParams[i])) // argv[i] = *kernelParams[i] } err := Result(C.cuLaunchKernel( C.CUfunction(unsafe.Pointer(uintptr(f))), C.uint(gridDimX), C.uint(gridDimY), C.uint(gridDimZ), C.uint(blockDimX), C.uint(blockDimY), C.uint(blockDimZ), C.uint(sharedMemBytes), C.CUstream(unsafe.Pointer(uintptr(stream))), (*unsafe.Pointer)(argp), (*unsafe.Pointer)(unsafe.Pointer(uintptr(0))))) if err != SUCCESS { panic(err) } } func offset(ptr unsafe.Pointer, i int) unsafe.Pointer { return unsafe.Pointer(uintptr(ptr) + pointerSize*uintptr(i)) } mumax3-3.10/cuda/cu/function.go000066400000000000000000000034451371432437400163640ustar00rootroot00000000000000package cu // This file implements manipulations on CUDA functions //#include import "C" import ( "unsafe" ) // Represents a CUDA CUfunction, a reference to a function within a module. 
type Function uintptr func FuncGetAttribute(attrib FunctionAttribute, function Function) int { var attr C.int err := Result(C.cuFuncGetAttribute(&attr, C.CUfunction_attribute(attrib), C.CUfunction(unsafe.Pointer(uintptr(function))))) if err != SUCCESS { panic(err) } return int(attr) } func (f Function) GetAttribute(attrib FunctionAttribute) int { return FuncGetAttribute(attrib, f) } type FunctionAttribute int const ( FUNC_A_MAX_THREADS_PER_BLOCK FunctionAttribute = C.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK // The maximum number of threads per block, beyond which a launch of the function would fail. FUNC_A_SHARED_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES // The size in bytes of statically-allocated shared memory required by this function. FUNC_A_CONST_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES // The size in bytes of user-allocated constant memory required by this function. FUNC_A_LOCAL_SIZE_BYTES FunctionAttribute = C.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES // The size in bytes of local memory used by each thread of this function. FUNC_A_NUM_REGS FunctionAttribute = C.CU_FUNC_ATTRIBUTE_NUM_REGS // The number of registers used by each thread of this function. FUNC_A_PTX_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_PTX_VERSION // The PTX virtual architecture version for which the function was compiled. FUNC_A_BINARY_VERSION FunctionAttribute = C.CU_FUNC_ATTRIBUTE_BINARY_VERSION // The binary architecture version for which the function was compiled. ) mumax3-3.10/cuda/cu/init.go000066400000000000000000000005621371432437400154770ustar00rootroot00000000000000package cu // This file implements CUDA driver initialization //#include import "C" // Initialize the CUDA driver API. // Currently, flags must be 0. // If Init() has not been called, any function from the driver API will panic with ERROR_NOT_INITIALIZED. 
func Init(flags int) { err := Result(C.cuInit(C.uint(flags))) if err != SUCCESS { panic(err) } } mumax3-3.10/cuda/cu/init_test.go000066400000000000000000000002651371432437400165360ustar00rootroot00000000000000package cu import ( "fmt" ) // needed for all other tests. func init() { Init(0) ctx := CtxCreate(CTX_SCHED_AUTO, 0) CtxSetCurrent(ctx) fmt.Println("Created CUDA context") } mumax3-3.10/cuda/cu/memory.go000066400000000000000000000170371371432437400160510ustar00rootroot00000000000000package cu // This file implements CUDA memory management on the driver level //#include import "C" import ( "fmt" "unsafe" ) type DevicePtr uintptr // Allocates a number of bytes of device memory. func MemAlloc(bytes int64) DevicePtr { var devptr C.CUdeviceptr err := Result(C.cuMemAlloc(&devptr, C.size_t(bytes))) if err != SUCCESS { panic(err) } return DevicePtr(devptr) } // Frees device memory allocated by MemAlloc(). // It is safe to double-free. func MemFree(p DevicePtr) { if p == DevicePtr(uintptr(0)) { return // Allready freed } err := Result(C.cuMemFree(C.CUdeviceptr(p))) if err != SUCCESS { panic(err) } } // Frees device memory allocated by MemAlloc(). // Overwrites the pointer with NULL. // It is safe to double-free. func (ptr DevicePtr) Free() { MemFree(ptr) } // Copies a number of bytes on the current device. // Requires unified addressing to be supported. // See also: MemcpyDtoD(). func Memcpy(dst, src DevicePtr, bytes int64) { err := Result(C.cuMemcpy(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes))) if err != SUCCESS { panic(err) } } // Asynchronously copies a number of bytes on the current device. func MemcpyAsync(dst, src DevicePtr, bytes int64, stream Stream) { err := Result(C.cuMemcpyAsync(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } } // Copies a number of bytes from host to device. 
func MemcpyDtoD(dst, src DevicePtr, bytes int64) { err := Result(C.cuMemcpyDtoD(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes))) if err != SUCCESS { panic(err) } } // Asynchronously copies a number of bytes from host to device. func MemcpyDtoDAsync(dst, src DevicePtr, bytes int64, stream Stream) { err := Result(C.cuMemcpyDtoDAsync(C.CUdeviceptr(dst), C.CUdeviceptr(src), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } } // Copies a number of bytes from host to device. func MemcpyHtoD(dst DevicePtr, src unsafe.Pointer, bytes int64) { err := Result(C.cuMemcpyHtoD(C.CUdeviceptr(dst), src, C.size_t(bytes))) if err != SUCCESS { panic(err) } } // Asynchronously copies a number of bytes from host to device. // The host memory must be page-locked (see MemRegister) func MemcpyHtoDAsync(dst DevicePtr, src unsafe.Pointer, bytes int64, stream Stream) { err := Result(C.cuMemcpyHtoDAsync(C.CUdeviceptr(dst), src, C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } } // Copies a number of bytes from device to host. func MemcpyDtoH(dst unsafe.Pointer, src DevicePtr, bytes int64) { err := Result(C.cuMemcpyDtoH(dst, C.CUdeviceptr(src), C.size_t(bytes))) if err != SUCCESS { panic(err) } } // Asynchronously copies a number of bytes device host to host. // The host memory must be page-locked (see MemRegister) func MemcpyDtoHAsync(dst unsafe.Pointer, src DevicePtr, bytes int64, stream Stream) { err := Result(C.cuMemcpyDtoHAsync(dst, C.CUdeviceptr(src), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } } // Copies from device memory in one context (device) to another. 
func MemcpyPeer(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64) { err := Result(C.cuMemcpyPeer(C.CUdeviceptr(dst), C.CUcontext(unsafe.Pointer(uintptr(dstCtx))), C.CUdeviceptr(src), C.CUcontext(unsafe.Pointer(uintptr(srcCtx))), C.size_t(bytes))) if err != SUCCESS { panic(err) } } // Asynchronously copies from device memory in one context (device) to another. func MemcpyPeerAsync(dst DevicePtr, dstCtx Context, src DevicePtr, srcCtx Context, bytes int64, stream Stream) { err := Result(C.cuMemcpyPeerAsync(C.CUdeviceptr(dst), C.CUcontext(unsafe.Pointer(uintptr(dstCtx))), C.CUdeviceptr(src), C.CUcontext(unsafe.Pointer(uintptr(srcCtx))), C.size_t(bytes), C.CUstream(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } } // Returns the base address and size of the allocation (by MemAlloc) that contains the input pointer ptr. func MemGetAddressRange(ptr DevicePtr) (bytes int64, base DevicePtr) { var cbytes C.size_t var cptr C.CUdeviceptr err := Result(C.cuMemGetAddressRange(&cptr, &cbytes, C.CUdeviceptr(ptr))) if err != SUCCESS { panic(err) } bytes = int64(cbytes) base = DevicePtr(cptr) return } // Returns the base address and size of the allocation (by MemAlloc) that contains the input pointer ptr. func (ptr DevicePtr) GetAddressRange() (bytes int64, base DevicePtr) { return MemGetAddressRange(ptr) } // Returns the size of the allocation (by MemAlloc) that contains the input pointer ptr. func (ptr DevicePtr) Bytes() (bytes int64) { bytes, _ = MemGetAddressRange(ptr) return } // Returns the free and total amount of memroy in the current Context (in bytes). func MemGetInfo() (free, total int64) { var cfree, ctotal C.size_t err := Result(C.cuMemGetInfo(&cfree, &ctotal)) if err != SUCCESS { panic(err) } free = int64(cfree) total = int64(ctotal) return } // Page-locks memory specified by the pointer and bytes. 
// The pointer and byte size must be aligned to the host page size (4KB) // See also: MemHostUnregister() // doesn't link with cuda6.5 //func MemHostRegister(ptr unsafe.Pointer, bytes int64, flags MemHostRegisterFlag) { // err := Result(C.cuMemHostRegister(ptr, C.size_t(bytes), C.uint(flags))) // if err != SUCCESS { // panic(err) // } //} // Unmaps memory locked by MemHostRegister(). // doesn't link with cuda6.5 //func MemHostUnregister(ptr unsafe.Pointer) { // err := Result(C.cuMemHostUnregister(ptr)) // if err != SUCCESS { // panic(err) // } //} func MemAllocHost(bytes int64) unsafe.Pointer { var p unsafe.Pointer err := Result(C.cuMemAllocHost(&p, C.size_t(bytes))) if err != SUCCESS { panic(err) } return p } func MemFreeHost(ptr unsafe.Pointer) { err := Result(C.cuMemFreeHost(ptr)) if err != SUCCESS { panic(err) } } type MemHostRegisterFlag int // Flag for MemHostRegister const ( // Memory is pinned in all CUDA contexts. MEMHOSTREGISTER_PORTABLE MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_PORTABLE // Maps the allocation in CUDA address space. TODO(a): cuMemHostGetDevicePointer() MEMHOSTREGISTER_DEVICEMAP MemHostRegisterFlag = C.CU_MEMHOSTREGISTER_DEVICEMAP ) func (p DevicePtr) String() string { return fmt.Sprint(unsafe.Pointer(uintptr(p))) } // Type size in bytes const ( SIZEOF_FLOAT32 = 4 SIZEOF_FLOAT64 = 8 SIZEOF_COMPLEX64 = 8 SIZEOF_COMPLEX128 = 16 ) // Physical memory type of device pointer. 
type MemoryType uint const ( MemoryTypeHost MemoryType = C.CU_MEMORYTYPE_HOST MemoryTypeDevice MemoryType = C.CU_MEMORYTYPE_DEVICE MemoryTypeArray MemoryType = C.CU_MEMORYTYPE_ARRAY MemoryTypeUnified MemoryType = C.CU_MEMORYTYPE_UNIFIED ) var memorytype = map[MemoryType]string{ MemoryTypeHost: "MemoryTypeHost", MemoryTypeDevice: "MemoryTypeDevice", MemoryTypeArray: "MemoryTypeArray", MemoryTypeUnified: "MemoryTypeUnified"} func (t MemoryType) String() string { if s, ok := memorytype[t]; ok { return s } return "MemoryTypeUnknown" } // Returns the physical memory type that ptr addresses. func PointerGetAttributeMemoryType(ptr DevicePtr) (t MemoryType, err Result) { var typ uint64 // foresee enough memory just to be safe err = Result(C.cuPointerGetAttribute(unsafe.Pointer(&typ), C.CU_POINTER_ATTRIBUTE_MEMORY_TYPE, C.CUdeviceptr(uintptr(ptr)))) return MemoryType(uint(typ)), err } // Returns the physical memory type that ptr addresses. func (ptr DevicePtr) MemoryType() MemoryType { t, err := PointerGetAttributeMemoryType(ptr) if err != SUCCESS { panic(err) } return t } mumax3-3.10/cuda/cu/memory_test.go000066400000000000000000000103671371432437400171070ustar00rootroot00000000000000package cu import ( "fmt" "math" "testing" "unsafe" ) func TestMalloc(t *testing.T) { for i := 0; i < 1024; i++ { pointer := MemAlloc(16 * 1024 * 1024) pointer.Free() } for i := 0; i < 1024; i++ { pointer := MemAlloc(16 * 1024 * 1024) MemFree(pointer) } } func BenchmarkMallocFree1B(b *testing.B) { for i := 0; i < b.N; i++ { m := MemAlloc(1) m.Free() } } func BenchmarkMallocFree1kB(b *testing.B) { for i := 0; i < b.N; i++ { m := MemAlloc(1024) m.Free() } } func BenchmarkMallocFree1MB(b *testing.B) { for i := 0; i < b.N; i++ { m := MemAlloc(1024 * 1024) m.Free() } } func TestMemAddressRange(t *testing.T) { N := 12345 ptr := MemAlloc(int64(N)) size, base := MemGetAddressRange(ptr) if size != int64(N) { t.Fail() } if base != ptr { t.Fail() } size, base = 0, DevicePtr(0) size, base = 
ptr.GetAddressRange() if ptr.Bytes() != int64(N) { t.Fail() } } func TestMemGetInfo(t *testing.T) { free, total := MemGetInfo() fmt.Println("MemGetInfo: ", free, "/", total) if free > total { t.Fail() } if total == 0 { t.Fail() } } func TestMemsetAsync(t *testing.T) { N := int64(32 * 1024) host1 := make([]float32, N) for i := range host1 { host1[i] = float32(i) } host2 := make([]float32, N) dev1 := MemAlloc(int64(4 * N)) MemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N) str := StreamCreate() MemsetD32Async(dev1, math.Float32bits(42), N, str) MemsetD32Async(dev1, math.Float32bits(21), N/2, str) MemcpyDtoH((unsafe.Pointer(&host2[0])), dev1, 4*N) str.Synchronize() (&str).Destroy() for i := 0; i < len(host2)/2; i++ { if host2[i] != 21 { t.Fail() } } for i := len(host2) / 2; i < len(host2); i++ { if host2[i] != 42 { t.Fail() } } dev1.Free() } func TestMemset(t *testing.T) { N := int64(32 * 1024) host1 := make([]float32, N) for i := range host1 { host1[i] = float32(i) } host2 := make([]float32, N) dev1 := MemAlloc(int64(4 * N)) MemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N) MemsetD32(dev1, math.Float32bits(42), N) MemsetD32(dev1, math.Float32bits(21), N/2) MemcpyDtoH((unsafe.Pointer(&host2[0])), dev1, 4*N) for i := 0; i < len(host2)/2; i++ { if host2[i] != 21 { t.Fail() } } for i := len(host2) / 2; i < len(host2); i++ { if host2[i] != 42 { t.Fail() } } dev1.Free() } func TestMemcpy(t *testing.T) { N := int64(32 * 1024) host1 := make([]float32, N) for i := range host1 { host1[i] = float32(i) } host2 := make([]float32, N) dev1 := MemAlloc(int64(4 * N)) dev2 := MemAlloc(int64(4 * N)) MemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N) MemcpyDtoD(dev2, dev1, 4*N) MemcpyDtoH((unsafe.Pointer(&host2[0])), dev2, 4*N) for i := range host2 { if host2[i] != float32(i) { t.Fail() } } dev1.Free() dev2.Free() } func TestMemcpyAsync(t *testing.T) { N := int64(32 * 1024) host1 := make([]float32, N) for i := range host1 { host1[i] = float32(i) } host2 := make([]float32, N) dev1 := 
MemAlloc(int64(4 * N)) dev2 := MemAlloc(int64(4 * N)) stream := StreamCreate() MemcpyHtoDAsync(dev1, (unsafe.Pointer(&host1[0])), 4*N, stream) MemcpyDtoDAsync(dev2, dev1, 4*N, stream) MemcpyDtoHAsync((unsafe.Pointer(&host2[0])), dev2, 4*N, stream) stream.Synchronize() for i := range host2 { if host2[i] != float32(i) { t.Fail() } } dev1.Free() dev2.Free() } func TestMemcpyAsyncRegistered(t *testing.T) { N := int64(32 * 1024) host1 := make([]float32, N) for i := range host1 { host1[i] = float32(i) } host2 := make([]float32, N) dev1 := MemAlloc(int64(4 * N)) dev2 := MemAlloc(int64(4 * N)) stream := StreamCreate() MemcpyHtoDAsync(dev1, (unsafe.Pointer(&host1[0])), 4*N, stream) MemcpyDtoDAsync(dev2, dev1, 4*N, stream) MemcpyDtoHAsync((unsafe.Pointer(&host2[0])), dev2, 4*N, stream) stream.Synchronize() for i := range host2 { if host2[i] != float32(i) { t.Fail() } } dev1.Free() dev2.Free() } func BenchmarkMemcpy(b *testing.B) { b.StopTimer() N := int64(32 * 1024 * 1024) host1 := make([]float32, N) host2 := make([]float32, N) dev1 := MemAlloc(int64(4 * N)) defer dev1.Free() dev2 := MemAlloc(int64(4 * N)) defer dev2.Free() b.SetBytes(4 * N) b.StartTimer() for i := 0; i < b.N; i++ { MemcpyHtoD(dev1, (unsafe.Pointer(&host1[0])), 4*N) MemcpyDtoD(dev2, dev1, 4*N) MemcpyDtoH((unsafe.Pointer(&host2[0])), dev2, 4*N) } } mumax3-3.10/cuda/cu/memset.go000066400000000000000000000024001371432437400160170ustar00rootroot00000000000000package cu // This file implements CUDA memset functions. //#include import "C" import ( "unsafe" ) // Sets the first N 32-bit values of dst array to value. // Asynchronous. func MemsetD32(deviceptr DevicePtr, value uint32, N int64) { err := Result(C.cuMemsetD32(C.CUdeviceptr(deviceptr), C.uint(value), C.size_t(N))) if err != SUCCESS { panic(err) } } // Asynchronously sets the first N 32-bit values of dst array to value. 
func MemsetD32Async(deviceptr DevicePtr, value uint32, N int64, stream Stream) { err := Result(C.cuMemsetD32Async(C.CUdeviceptr(deviceptr), C.uint(value), C.size_t(N), C.CUstream(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } } // Sets the first N 8-bit values of dst array to value. // Asynchronous. func MemsetD8(deviceptr DevicePtr, value uint8, N int64) { err := Result(C.cuMemsetD8(C.CUdeviceptr(deviceptr), C.uchar(value), C.size_t(N))) if err != SUCCESS { panic(err) } } // Asynchronously sets the first N 32-bit values of dst array to value. func MemsetD8Async(deviceptr DevicePtr, value uint8, N int64, stream Stream) { err := Result(C.cuMemsetD8Async(C.CUdeviceptr(deviceptr), C.uchar(value), C.size_t(N), C.CUstream(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } } mumax3-3.10/cuda/cu/module.go000066400000000000000000000022711371432437400160200ustar00rootroot00000000000000package cu // This file implements loading of CUDA ptx modules //#include import "C" import ( "unsafe" ) // Represents a CUDA CUmodule, a reference to executable device code. type Module uintptr // Loads a compute module from file func ModuleLoad(fname string) Module { //fmt.Fprintln(os.Stderr, "driver.ModuleLoad", fname) var mod C.CUmodule err := Result(C.cuModuleLoad(&mod, C.CString(fname))) if err != SUCCESS { panic(err) } return Module(uintptr(unsafe.Pointer(mod))) } // Loads a compute module from string func ModuleLoadData(image string) Module { var mod C.CUmodule err := Result(C.cuModuleLoadData(&mod, unsafe.Pointer(C.CString(image)))) if err != SUCCESS { panic(err) } return Module(uintptr(unsafe.Pointer(mod))) } // Returns a Function handle. 
func ModuleGetFunction(module Module, name string) Function { var function C.CUfunction err := Result(C.cuModuleGetFunction( &function, C.CUmodule(unsafe.Pointer(uintptr(module))), C.CString(name))) if err != SUCCESS { panic(err) } return Function(uintptr(unsafe.Pointer(function))) } // Returns a Function handle. func (m Module) GetFunction(name string) Function { return ModuleGetFunction(m, name) } mumax3-3.10/cuda/cu/module_test.go000066400000000000000000000015161371432437400170600ustar00rootroot00000000000000package cu import ( "testing" "unsafe" //"fmt" ) func TestModule(test *testing.T) { mod := ModuleLoad("/testdata/testmodule.ptx") f := mod.GetFunction("testMemset") N := 1000 N4 := 4 * int64(N) a := make([]float32, N) A := MemAlloc(N4) defer A.Free() aptr := unsafe.Pointer(&a[0]) MemcpyHtoD(A, aptr, N4) var value float32 value = 42 var n int n = N / 2 block := 128 grid := DivUp(N, block) shmem := 0 args := []unsafe.Pointer{unsafe.Pointer(&A), unsafe.Pointer(&value), unsafe.Pointer(&n)} LaunchKernel(f, grid, 1, 1, block, 1, 1, shmem, 0, args) MemcpyDtoH(aptr, A, N4) for i := 0; i < N/2; i++ { if a[i] != 42 { test.Fail() } } for i := N / 2; i < N; i++ { if a[i] != 0 { test.Fail() } } //fmt.Println(a) } // Integer division rounded up. func DivUp(x, y int) int { return ((x - 1) / y) + 1 } mumax3-3.10/cuda/cu/peer.go000066400000000000000000000024571371432437400154740ustar00rootroot00000000000000package cu // This file implements CUDA unified addressing. //#include import "C" import ( "unsafe" ) // Make allocations from the peer Context available to the current context. func CtxEnablePeerAccess(peer Context) { err := Result(C.cuCtxEnablePeerAccess(C.CUcontext(unsafe.Pointer(uintptr(peer))), C.uint(0))) if err != SUCCESS { panic(err) } } // Make allocations from the peer Context available to the current context. func (peer Context) EnablePeerAccess() { CtxEnablePeerAccess(peer) } // Reverses CtxEnablePeerAccess(). 
func CtxDisablePeerAccess(peer Context) { err := Result(C.cuCtxDisablePeerAccess(C.CUcontext(unsafe.Pointer(uintptr(peer))))) if err != SUCCESS { panic(err) } } // Reverses EnablePeerAccess(). func (peer Context) DisablePeerAccess() { CtxDisablePeerAccess(peer) } // Returns true if CtxEnablePeerAccess can be called on a context for dev and peerDev. func DeviceCanAccessPeer(dev, peer Device) bool { var canAccessPeer C.int err := Result(C.cuDeviceCanAccessPeer(&canAccessPeer, C.CUdevice(dev), C.CUdevice(peer))) if err != SUCCESS { panic(err) } return int(canAccessPeer) != 0 } // Returns true if CtxEnablePeerAccess can be called on a context for dev and peerDev. func (dev Device) CanAccessPeer(peer Device) bool { return DeviceCanAccessPeer(dev, peer) } mumax3-3.10/cuda/cu/result.go000066400000000000000000000207721371432437400160570ustar00rootroot00000000000000package cu // This file provides access to CUDA driver error statuses (type CUresult). //#include import "C" import ( "fmt" ) // CUDA error status. // CUDA error statuses are not returned by functions but checked and passed to // panic() when not successful. If desired, they can be caught by // recover(). 
type Result int

// Message string for the error.
// Looks the status up in errorString; values missing from the table are
// rendered as "Unknown CUresult: <number>".
func (err Result) String() string {
	str, ok := errorString[err]
	if !ok {
		return "Unknown CUresult: " + fmt.Sprint(int(err))
	}
	return str
}

// Go names for the CUresult status codes.
// Most take their value from the corresponding C constant. A few are spelled
// as numeric literals with the C name kept in a trailing comment.
// NOTE(review): presumably those C names are missing from older cuda.h
// versions this package still has to build against; confirm before
// replacing the literals.
const (
	SUCCESS                              Result = C.CUDA_SUCCESS
	ERROR_INVALID_VALUE                  Result = C.CUDA_ERROR_INVALID_VALUE
	ERROR_OUT_OF_MEMORY                  Result = C.CUDA_ERROR_OUT_OF_MEMORY
	ERROR_NOT_INITIALIZED                Result = C.CUDA_ERROR_NOT_INITIALIZED
	ERROR_DEINITIALIZED                  Result = C.CUDA_ERROR_DEINITIALIZED
	ERROR_PROFILER_DISABLED              Result = C.CUDA_ERROR_PROFILER_DISABLED
	ERROR_PROFILER_NOT_INITIALIZED       Result = C.CUDA_ERROR_PROFILER_NOT_INITIALIZED
	ERROR_PROFILER_ALREADY_STARTED       Result = C.CUDA_ERROR_PROFILER_ALREADY_STARTED
	ERROR_PROFILER_ALREADY_STOPPED       Result = C.CUDA_ERROR_PROFILER_ALREADY_STOPPED
	ERROR_NO_DEVICE                      Result = C.CUDA_ERROR_NO_DEVICE
	ERROR_INVALID_DEVICE                 Result = C.CUDA_ERROR_INVALID_DEVICE
	ERROR_INVALID_IMAGE                  Result = C.CUDA_ERROR_INVALID_IMAGE
	ERROR_INVALID_CONTEXT                Result = C.CUDA_ERROR_INVALID_CONTEXT
	ERROR_CONTEXT_ALREADY_CURRENT        Result = C.CUDA_ERROR_CONTEXT_ALREADY_CURRENT
	ERROR_MAP_FAILED                     Result = C.CUDA_ERROR_MAP_FAILED
	ERROR_UNMAP_FAILED                   Result = C.CUDA_ERROR_UNMAP_FAILED
	ERROR_ARRAY_IS_MAPPED                Result = C.CUDA_ERROR_ARRAY_IS_MAPPED
	ERROR_ALREADY_MAPPED                 Result = C.CUDA_ERROR_ALREADY_MAPPED
	ERROR_NO_BINARY_FOR_GPU              Result = C.CUDA_ERROR_NO_BINARY_FOR_GPU
	ERROR_ALREADY_ACQUIRED               Result = C.CUDA_ERROR_ALREADY_ACQUIRED
	ERROR_NOT_MAPPED                     Result = C.CUDA_ERROR_NOT_MAPPED
	ERROR_NOT_MAPPED_AS_ARRAY            Result = C.CUDA_ERROR_NOT_MAPPED_AS_ARRAY
	ERROR_NOT_MAPPED_AS_POINTER          Result = C.CUDA_ERROR_NOT_MAPPED_AS_POINTER
	ERROR_ECC_UNCORRECTABLE              Result = C.CUDA_ERROR_ECC_UNCORRECTABLE
	ERROR_UNSUPPORTED_LIMIT              Result = C.CUDA_ERROR_UNSUPPORTED_LIMIT
	ERROR_CONTEXT_ALREADY_IN_USE         Result = C.CUDA_ERROR_CONTEXT_ALREADY_IN_USE
	ERROR_INVALID_SOURCE                 Result = C.CUDA_ERROR_INVALID_SOURCE
	ERROR_FILE_NOT_FOUND                 Result = C.CUDA_ERROR_FILE_NOT_FOUND
	ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND Result = C.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND
	ERROR_SHARED_OBJECT_INIT_FAILED      Result = C.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
	ERROR_OPERATING_SYSTEM               Result = C.CUDA_ERROR_OPERATING_SYSTEM
	ERROR_INVALID_HANDLE                 Result = C.CUDA_ERROR_INVALID_HANDLE
	ERROR_NOT_FOUND                      Result = C.CUDA_ERROR_NOT_FOUND
	ERROR_NOT_READY                      Result = C.CUDA_ERROR_NOT_READY
	ERROR_LAUNCH_FAILED                  Result = C.CUDA_ERROR_LAUNCH_FAILED
	ERROR_LAUNCH_OUT_OF_RESOURCES        Result = C.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES
	ERROR_LAUNCH_TIMEOUT                 Result = C.CUDA_ERROR_LAUNCH_TIMEOUT
	ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  Result = C.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING
	ERROR_PEER_ACCESS_ALREADY_ENABLED    Result = C.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED
	ERROR_PEER_ACCESS_NOT_ENABLED        Result = C.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED
	ERROR_PRIMARY_CONTEXT_ACTIVE         Result = C.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE
	ERROR_CONTEXT_IS_DESTROYED           Result = C.CUDA_ERROR_CONTEXT_IS_DESTROYED
	ERROR_ASSERT                         Result = C.CUDA_ERROR_ASSERT
	ERROR_TOO_MANY_PEERS                 Result = C.CUDA_ERROR_TOO_MANY_PEERS
	ERROR_HOST_MEMORY_ALREADY_REGISTERED Result = C.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED
	ERROR_HOST_MEMORY_NOT_REGISTERED     Result = C.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED
	ERROR_HARDWARE_STACK_ERROR           Result = 714 //C.CUDA_ERROR_HARDWARE_STACK_ERROR
	ERROR_ILLEGAL_INSTRUCTION            Result = 715 //C.CUDA_ERROR_ILLEGAL_INSTRUCTION
	ERROR_MISALIGNED_ADDRESS             Result = 716 //C.CUDA_ERROR_MISALIGNED_ADDRESS
	ERROR_INVALID_ADDRESS_SPACE          Result = 717 //C.CUDA_ERROR_INVALID_ADDRESS_SPACE
	ERROR_INVALID_PC                     Result = 718 //C.CUDA_ERROR_INVALID_PC
	ERROR_NOT_PERMITTED                  Result = 800 //C.CUDA_ERROR_NOT_PERMITTED
	ERROR_NOT_SUPPORTED                  Result = 801 //C.CUDA_ERROR_NOT_SUPPORTED
	ERROR_UNKNOWN                        Result = C.CUDA_ERROR_UNKNOWN
)

// Map with error strings for Result error numbers.
// Each value is the name of the C constant; Result.String reads this table.
var errorString map[Result]string = map[Result]string{
	SUCCESS:                              "CUDA_SUCCESS",
	ERROR_INVALID_VALUE:                  "CUDA_ERROR_INVALID_VALUE",
	ERROR_OUT_OF_MEMORY:                  "CUDA_ERROR_OUT_OF_MEMORY",
	ERROR_NOT_INITIALIZED:                "CUDA_ERROR_NOT_INITIALIZED",
	ERROR_DEINITIALIZED:                  "CUDA_ERROR_DEINITIALIZED",
	ERROR_PROFILER_DISABLED:              "CUDA_ERROR_PROFILER_DISABLED",
	ERROR_PROFILER_NOT_INITIALIZED:       "CUDA_ERROR_PROFILER_NOT_INITIALIZED",
	ERROR_PROFILER_ALREADY_STARTED:       "CUDA_ERROR_PROFILER_ALREADY_STARTED",
	ERROR_PROFILER_ALREADY_STOPPED:       "CUDA_ERROR_PROFILER_ALREADY_STOPPED",
	ERROR_NO_DEVICE:                      "CUDA_ERROR_NO_DEVICE",
	ERROR_INVALID_DEVICE:                 "CUDA_ERROR_INVALID_DEVICE",
	ERROR_INVALID_IMAGE:                  "CUDA_ERROR_INVALID_IMAGE",
	ERROR_INVALID_CONTEXT:                "CUDA_ERROR_INVALID_CONTEXT",
	ERROR_CONTEXT_ALREADY_CURRENT:        "CUDA_ERROR_CONTEXT_ALREADY_CURRENT",
	ERROR_MAP_FAILED:                     "CUDA_ERROR_MAP_FAILED",
	ERROR_UNMAP_FAILED:                   "CUDA_ERROR_UNMAP_FAILED",
	ERROR_ARRAY_IS_MAPPED:                "CUDA_ERROR_ARRAY_IS_MAPPED",
	ERROR_ALREADY_MAPPED:                 "CUDA_ERROR_ALREADY_MAPPED",
	ERROR_NO_BINARY_FOR_GPU:              "CUDA_ERROR_NO_BINARY_FOR_GPU",
	ERROR_ALREADY_ACQUIRED:               "CUDA_ERROR_ALREADY_ACQUIRED",
	ERROR_NOT_MAPPED:                     "CUDA_ERROR_NOT_MAPPED",
	ERROR_NOT_MAPPED_AS_ARRAY:            "CUDA_ERROR_NOT_MAPPED_AS_ARRAY",
	ERROR_NOT_MAPPED_AS_POINTER:          "CUDA_ERROR_NOT_MAPPED_AS_POINTER",
	ERROR_ECC_UNCORRECTABLE:              "CUDA_ERROR_ECC_UNCORRECTABLE",
	ERROR_UNSUPPORTED_LIMIT:              "CUDA_ERROR_UNSUPPORTED_LIMIT",
	ERROR_CONTEXT_ALREADY_IN_USE:         "CUDA_ERROR_CONTEXT_ALREADY_IN_USE",
	ERROR_INVALID_SOURCE:                 "CUDA_ERROR_INVALID_SOURCE",
	ERROR_FILE_NOT_FOUND:                 "CUDA_ERROR_FILE_NOT_FOUND",
	ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND",
	ERROR_SHARED_OBJECT_INIT_FAILED:      "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED",
	ERROR_OPERATING_SYSTEM:               "CUDA_ERROR_OPERATING_SYSTEM",
	ERROR_INVALID_HANDLE:                 "CUDA_ERROR_INVALID_HANDLE",
	ERROR_NOT_FOUND:                      "CUDA_ERROR_NOT_FOUND",
	ERROR_NOT_READY:                      "CUDA_ERROR_NOT_READY",
	ERROR_LAUNCH_OUT_OF_RESOURCES:        "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES",
	ERROR_LAUNCH_TIMEOUT:                 "CUDA_ERROR_LAUNCH_TIMEOUT",
	ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:  "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING",
	ERROR_PEER_ACCESS_ALREADY_ENABLED:    "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED",
	ERROR_PEER_ACCESS_NOT_ENABLED:        "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED",
	ERROR_PRIMARY_CONTEXT_ACTIVE:         "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE",
	ERROR_CONTEXT_IS_DESTROYED:           "CUDA_ERROR_CONTEXT_IS_DESTROYED",
	ERROR_ASSERT:                         "CUDA_ERROR_ASSERT",
	ERROR_TOO_MANY_PEERS:                 "CUDA_ERROR_TOO_MANY_PEERS",
	ERROR_HOST_MEMORY_ALREADY_REGISTERED: "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED",
	ERROR_HOST_MEMORY_NOT_REGISTERED:     "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED",
	ERROR_HARDWARE_STACK_ERROR:           "CUDA_ERROR_HARDWARE_STACK_ERROR",
	ERROR_ILLEGAL_INSTRUCTION:            "CUDA_ERROR_ILLEGAL_INSTRUCTION",
	ERROR_MISALIGNED_ADDRESS:             "CUDA_ERROR_MISALIGNED_ADDRESS",
	ERROR_INVALID_ADDRESS_SPACE:          "CUDA_ERROR_INVALID_ADDRESS_SPACE",
	ERROR_INVALID_PC:                     "CUDA_ERROR_INVALID_PC",
	ERROR_LAUNCH_FAILED:                  "CUDA_ERROR_LAUNCH_FAILED",
	ERROR_NOT_PERMITTED:                  "CUDA_ERROR_NOT_PERMITTED",
	ERROR_NOT_SUPPORTED:                  "CUDA_ERROR_NOT_SUPPORTED",
	ERROR_UNKNOWN:                        "CUDA_ERROR_UNKNOWN"}
mumax3-3.10/cuda/cu/stream.go000066400000000000000000000024751371432437400160340ustar00rootroot00000000000000package cu

// This file implements CUDA streams

//#include <cuda.h>
import "C"
import "unsafe"

// CUDA stream.
// The driver's CUstream handle is stored as an integer.
type Stream uintptr

// Creates an asynchronous stream.
// Panics with the Result when the driver reports an error.
func StreamCreate() Stream {
	var stream C.CUstream
	err := Result(C.cuStreamCreate(&stream, C.uint(0))) // flags has to be zero
	if err != SUCCESS {
		panic(err)
	}
	return Stream(uintptr(unsafe.Pointer(stream)))
}

// Destroys the asynchronous stream.
// The handle is reset to 0 whether or not the driver call succeeded;
// a failure is then reported by panicking with the Result.
func (stream *Stream) Destroy() {
	str := *stream
	err := Result(C.cuStreamDestroy(C.CUstream(unsafe.Pointer(uintptr(str)))))
	*stream = 0 // clear the caller's handle before the error check
	if err != SUCCESS {
		panic(err)
	}
}

// Destroys an asynchronous stream.
// Function form of (*Stream).Destroy.
func StreamDestroy(stream *Stream) {
	stream.Destroy()
}

// Blocks until the stream has completed.
func (stream Stream) Synchronize() { err := Result(C.cuStreamSynchronize(C.CUstream(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } } // Returns Success if all operations have completed, ErrorNotReady otherwise func (stream Stream) Query() Result { return Result(C.cuStreamQuery(C.CUstream(unsafe.Pointer(uintptr(stream))))) } // Returns Success if all operations have completed, ErrorNotReady otherwise func StreamQuery(stream Stream) Result { return stream.Query() } // Blocks until the stream has completed. func StreamSynchronize(stream Stream) { stream.Synchronize() } mumax3-3.10/cuda/cu/testdata/000077500000000000000000000000001371432437400160135ustar00rootroot00000000000000mumax3-3.10/cuda/cu/testdata/testmodule.cu000066400000000000000000000006731371432437400205370ustar00rootroot00000000000000/* * Module to test CUDA module loading and execution. * To be compiled with: * nvcc -ptx testmodule.cu */ #ifdef __cplusplus extern "C" { #endif #define threadindex ( ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x ) /// Sets the first N elements of array to value. 
__global__ void testMemset(float* array, float value, int N){ int i = threadindex; if(i < N){ array[i] = value; } } #ifdef __cplusplus } #endif mumax3-3.10/cuda/cu/testdata/testmodule.ptx000066400000000000000000000062551371432437400207450ustar00rootroot00000000000000 .version 1.4 .target sm_10, map_f64_to_f32 // compiled with /usr/local/cuda/open64/lib//be // nvopencc 4.0 built on 2011-02-18 //----------------------------------------------------------- // Compiling /tmp/tmpxft_00000e56_00000000-9_testmodule.cpp3.i (/tmp/ccBI#.rDLD4T) //----------------------------------------------------------- //----------------------------------------------------------- // Options: //----------------------------------------------------------- // Target:ptx, ISA:sm_10, Endian:little, Pointer Size:64 // -O3 (Optimization level) // -g0 (Debug level) // -m2 (Report advisories) //----------------------------------------------------------- .file 1 "" .file 2 "/tmp/tmpxft_00000e56_00000000-8_testmodule.cudafe2.gpu" .file 3 "/usr/lib/gcc/x86_64-linux-gnu/4.4.3/include/stddef.h" .file 4 "/usr/local/cuda/bin/../include/crt/device_runtime.h" .file 5 "/usr/local/cuda/bin/../include/host_defines.h" .file 6 "/usr/local/cuda/bin/../include/builtin_types.h" .file 7 "/usr/local/cuda/bin/../include/device_types.h" .file 8 "/usr/local/cuda/bin/../include/driver_types.h" .file 9 "/usr/local/cuda/bin/../include/surface_types.h" .file 10 "/usr/local/cuda/bin/../include/texture_types.h" .file 11 "/usr/local/cuda/bin/../include/vector_types.h" .file 12 "/usr/local/cuda/bin/../include/device_launch_parameters.h" .file 13 "/usr/local/cuda/bin/../include/crt/storage_class.h" .file 14 "/usr/include/bits/types.h" .file 15 "/usr/include/time.h" .file 16 "testmodule.cu" .file 17 "/usr/local/cuda/bin/../include/common_functions.h" .file 18 "/usr/local/cuda/bin/../include/math_functions.h" .file 19 "/usr/local/cuda/bin/../include/math_constants.h" .file 20 "/usr/local/cuda/bin/../include/device_functions.h" 
.file 21 "/usr/local/cuda/bin/../include/sm_11_atomic_functions.h" .file 22 "/usr/local/cuda/bin/../include/sm_12_atomic_functions.h" .file 23 "/usr/local/cuda/bin/../include/sm_13_double_functions.h" .file 24 "/usr/local/cuda/bin/../include/sm_20_atomic_functions.h" .file 25 "/usr/local/cuda/bin/../include/sm_20_intrinsics.h" .file 26 "/usr/local/cuda/bin/../include/surface_functions.h" .file 27 "/usr/local/cuda/bin/../include/texture_fetch_functions.h" .file 28 "/usr/local/cuda/bin/../include/math_functions_dbl_ptx1.h" .entry testMemset ( .param .u64 __cudaparm_testMemset_array, .param .f32 __cudaparm_testMemset_value, .param .s32 __cudaparm_testMemset_N) { .reg .u16 %rh<4>; .reg .u32 %r<10>; .reg .u64 %rd<6>; .reg .f32 %f<3>; .reg .pred %p<3>; .loc 16 7 0 $LDWbegin_testMemset: mov.u16 %rh1, %nctaid.x; mov.u16 %rh2, %ctaid.y; mul.wide.u16 %r1, %rh1, %rh2; cvt.u32.u16 %r2, %ctaid.x; add.u32 %r3, %r2, %r1; cvt.u32.u16 %r4, %ntid.x; mul.lo.u32 %r5, %r4, %r3; cvt.u32.u16 %r6, %tid.x; add.u32 %r7, %r6, %r5; ld.param.s32 %r8, [__cudaparm_testMemset_N]; setp.le.s32 %p1, %r8, %r7; @%p1 bra $Lt_0_1026; .loc 16 10 0 ld.param.f32 %f1, [__cudaparm_testMemset_value]; ld.param.u64 %rd1, [__cudaparm_testMemset_array]; cvt.s64.s32 %rd2, %r7; mul.wide.s32 %rd3, %r7, 4; add.u64 %rd4, %rd1, %rd3; st.global.f32 [%rd4+0], %f1; $Lt_0_1026: .loc 16 12 0 exit; $LDWend_testMemset: } // testMemset mumax3-3.10/cuda/cu/version.go000066400000000000000000000005001371432437400162110ustar00rootroot00000000000000package cu // This file implements CUDA driver version management //#include import "C" const CUDA_VERSION = C.CUDA_VERSION // Returns the CUDA driver version. 
func Version() int { var version C.int err := Result(C.cuDriverGetVersion(&version)) if err != SUCCESS { panic(err) } return int(version) } mumax3-3.10/cuda/cu/version_test.go000066400000000000000000000001761371432437400172610ustar00rootroot00000000000000package cu import ( "fmt" "testing" ) func TestVersion(t *testing.T) { fmt.Println("CUDA driver version: ", Version()) } mumax3-3.10/cuda/cubicanisotropy2.cu000066400000000000000000000051231371432437400174240ustar00rootroot00000000000000#include "amul.h" #include "float3.h" #include // add cubic anisotropy field to B. // B: effective field in T // m: reduced magnetization (unit length) // Ms: saturation magnetization in A/m. // K1: Kc1 in J/m3 // K2: Kc2 in T/m3 // C1, C2: anisotropy axes // // based on http://www.southampton.ac.uk/~fangohr/software/oxs_cubic8.html extern "C" __global__ void addcubicanisotropy2(float* __restrict__ Bx, float* __restrict__ By, float* __restrict__ Bz, float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz, float* __restrict__ Ms_, float Ms_mul, float* __restrict__ k1_, float k1_mul, float* __restrict__ k2_, float k2_mul, float* __restrict__ k3_, float k3_mul, float* __restrict__ c1x_, float c1x_mul, float* __restrict__ c1y_, float c1y_mul, float* __restrict__ c1z_, float c1z_mul, float* __restrict__ c2x_, float c2x_mul, float* __restrict__ c2y_, float c2y_mul, float* __restrict__ c2z_, float c2z_mul, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { float invMs = inv_Msat(Ms_, Ms_mul, i); float k1 = amul(k1_, k1_mul, i) * invMs; float k2 = amul(k2_, k2_mul, i) * invMs; float k3 = amul(k3_, k3_mul, i) * invMs; float3 u1 = normalized(vmul(c1x_, c1y_, c1z_, c1x_mul, c1y_mul, c1z_mul, i)); float3 u2 = normalized(vmul(c2x_, c2y_, c2z_, c2x_mul, c2y_mul, c2z_mul, i)); float3 u3 = cross(u1, u2); // 3rd axis perpendicular to u1,u2 float3 m = make_float3(mx[i], my[i], mz[i]); float u1m = dot(u1, m); float u2m = dot(u2, m); float u3m 
= dot(u3, m); float3 B = -2.0f*k1*((pow2(u2m) + pow2(u3m)) * ( (u1m) * u1) + (pow2(u1m) + pow2(u3m)) * ( (u2m) * u2) + (pow2(u1m) + pow2(u2m)) * ( (u3m) * u3))- 2.0f*k2*((pow2(u2m) * pow2(u3m)) * ( (u1m) * u1) + (pow2(u1m) * pow2(u3m)) * ( (u2m) * u2) + (pow2(u1m) * pow2(u2m)) * ( (u3m) * u3))- 4.0f*k3*((pow4(u2m) + pow4(u3m)) * (pow3(u1m) * u1) + (pow4(u1m) + pow4(u3m)) * (pow3(u2m) * u2) + (pow4(u1m) + pow4(u2m)) * (pow3(u3m) * u3)); Bx[i] += B.x; By[i] += B.y; Bz[i] += B.z; } } mumax3-3.10/cuda/cubicanisotropy2_wrapper.go000066400000000000000000004154211371432437400211700ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for addcubicanisotropy2 kernel var addcubicanisotropy2_code cu.Function // Stores the arguments for addcubicanisotropy2 kernel invocation type addcubicanisotropy2_args_t struct { arg_Bx unsafe.Pointer arg_By unsafe.Pointer arg_Bz unsafe.Pointer arg_mx unsafe.Pointer arg_my unsafe.Pointer arg_mz unsafe.Pointer arg_Ms_ unsafe.Pointer arg_Ms_mul float32 arg_k1_ unsafe.Pointer arg_k1_mul float32 arg_k2_ unsafe.Pointer arg_k2_mul float32 arg_k3_ unsafe.Pointer arg_k3_mul float32 arg_c1x_ unsafe.Pointer arg_c1x_mul float32 arg_c1y_ unsafe.Pointer arg_c1y_mul float32 arg_c1z_ unsafe.Pointer arg_c1z_mul float32 arg_c2x_ unsafe.Pointer arg_c2x_mul float32 arg_c2y_ unsafe.Pointer arg_c2y_mul float32 arg_c2z_ unsafe.Pointer arg_c2z_mul float32 arg_N int argptr [27]unsafe.Pointer sync.Mutex } // Stores the arguments for addcubicanisotropy2 kernel invocation var addcubicanisotropy2_args addcubicanisotropy2_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
addcubicanisotropy2_args.argptr[0] = unsafe.Pointer(&addcubicanisotropy2_args.arg_Bx) addcubicanisotropy2_args.argptr[1] = unsafe.Pointer(&addcubicanisotropy2_args.arg_By) addcubicanisotropy2_args.argptr[2] = unsafe.Pointer(&addcubicanisotropy2_args.arg_Bz) addcubicanisotropy2_args.argptr[3] = unsafe.Pointer(&addcubicanisotropy2_args.arg_mx) addcubicanisotropy2_args.argptr[4] = unsafe.Pointer(&addcubicanisotropy2_args.arg_my) addcubicanisotropy2_args.argptr[5] = unsafe.Pointer(&addcubicanisotropy2_args.arg_mz) addcubicanisotropy2_args.argptr[6] = unsafe.Pointer(&addcubicanisotropy2_args.arg_Ms_) addcubicanisotropy2_args.argptr[7] = unsafe.Pointer(&addcubicanisotropy2_args.arg_Ms_mul) addcubicanisotropy2_args.argptr[8] = unsafe.Pointer(&addcubicanisotropy2_args.arg_k1_) addcubicanisotropy2_args.argptr[9] = unsafe.Pointer(&addcubicanisotropy2_args.arg_k1_mul) addcubicanisotropy2_args.argptr[10] = unsafe.Pointer(&addcubicanisotropy2_args.arg_k2_) addcubicanisotropy2_args.argptr[11] = unsafe.Pointer(&addcubicanisotropy2_args.arg_k2_mul) addcubicanisotropy2_args.argptr[12] = unsafe.Pointer(&addcubicanisotropy2_args.arg_k3_) addcubicanisotropy2_args.argptr[13] = unsafe.Pointer(&addcubicanisotropy2_args.arg_k3_mul) addcubicanisotropy2_args.argptr[14] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c1x_) addcubicanisotropy2_args.argptr[15] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c1x_mul) addcubicanisotropy2_args.argptr[16] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c1y_) addcubicanisotropy2_args.argptr[17] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c1y_mul) addcubicanisotropy2_args.argptr[18] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c1z_) addcubicanisotropy2_args.argptr[19] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c1z_mul) addcubicanisotropy2_args.argptr[20] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c2x_) addcubicanisotropy2_args.argptr[21] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c2x_mul) addcubicanisotropy2_args.argptr[22] = 
unsafe.Pointer(&addcubicanisotropy2_args.arg_c2y_) addcubicanisotropy2_args.argptr[23] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c2y_mul) addcubicanisotropy2_args.argptr[24] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c2z_) addcubicanisotropy2_args.argptr[25] = unsafe.Pointer(&addcubicanisotropy2_args.arg_c2z_mul) addcubicanisotropy2_args.argptr[26] = unsafe.Pointer(&addcubicanisotropy2_args.arg_N) } // Wrapper for addcubicanisotropy2 CUDA kernel, asynchronous. func k_addcubicanisotropy2_async(Bx unsafe.Pointer, By unsafe.Pointer, Bz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, k1_ unsafe.Pointer, k1_mul float32, k2_ unsafe.Pointer, k2_mul float32, k3_ unsafe.Pointer, k3_mul float32, c1x_ unsafe.Pointer, c1x_mul float32, c1y_ unsafe.Pointer, c1y_mul float32, c1z_ unsafe.Pointer, c1z_mul float32, c2x_ unsafe.Pointer, c2x_mul float32, c2y_ unsafe.Pointer, c2y_mul float32, c2z_ unsafe.Pointer, c2z_mul float32, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("addcubicanisotropy2") } addcubicanisotropy2_args.Lock() defer addcubicanisotropy2_args.Unlock() if addcubicanisotropy2_code == 0 { addcubicanisotropy2_code = fatbinLoad(addcubicanisotropy2_map, "addcubicanisotropy2") } addcubicanisotropy2_args.arg_Bx = Bx addcubicanisotropy2_args.arg_By = By addcubicanisotropy2_args.arg_Bz = Bz addcubicanisotropy2_args.arg_mx = mx addcubicanisotropy2_args.arg_my = my addcubicanisotropy2_args.arg_mz = mz addcubicanisotropy2_args.arg_Ms_ = Ms_ addcubicanisotropy2_args.arg_Ms_mul = Ms_mul addcubicanisotropy2_args.arg_k1_ = k1_ addcubicanisotropy2_args.arg_k1_mul = k1_mul addcubicanisotropy2_args.arg_k2_ = k2_ addcubicanisotropy2_args.arg_k2_mul = k2_mul addcubicanisotropy2_args.arg_k3_ = k3_ addcubicanisotropy2_args.arg_k3_mul = k3_mul addcubicanisotropy2_args.arg_c1x_ = c1x_ addcubicanisotropy2_args.arg_c1x_mul = c1x_mul addcubicanisotropy2_args.arg_c1y_ = c1y_ 
addcubicanisotropy2_args.arg_c1y_mul = c1y_mul addcubicanisotropy2_args.arg_c1z_ = c1z_ addcubicanisotropy2_args.arg_c1z_mul = c1z_mul addcubicanisotropy2_args.arg_c2x_ = c2x_ addcubicanisotropy2_args.arg_c2x_mul = c2x_mul addcubicanisotropy2_args.arg_c2y_ = c2y_ addcubicanisotropy2_args.arg_c2y_mul = c2y_mul addcubicanisotropy2_args.arg_c2z_ = c2z_ addcubicanisotropy2_args.arg_c2z_mul = c2z_mul addcubicanisotropy2_args.arg_N = N args := addcubicanisotropy2_args.argptr[:] cu.LaunchKernel(addcubicanisotropy2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("addcubicanisotropy2") } } // maps compute capability on PTX code for addcubicanisotropy2 kernel. var addcubicanisotropy2_map = map[int]string{0: "", 30: addcubicanisotropy2_ptx_30, 32: addcubicanisotropy2_ptx_32, 35: addcubicanisotropy2_ptx_35, 37: addcubicanisotropy2_ptx_37, 50: addcubicanisotropy2_ptx_50, 52: addcubicanisotropy2_ptx_52, 53: addcubicanisotropy2_ptx_53, 60: addcubicanisotropy2_ptx_60, 61: addcubicanisotropy2_ptx_61, 62: addcubicanisotropy2_ptx_62, 70: addcubicanisotropy2_ptx_70, 72: addcubicanisotropy2_ptx_72, 75: addcubicanisotropy2_ptx_75} // addcubicanisotropy2 PTX code for various compute capabilities. 
const ( addcubicanisotropy2_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, .param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, .param .f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred %p<15>; .reg .f32 %f<187>; .reg .b32 %r<86>; .reg .b64 %rd<60>; ld.param.u64 %rd1, [addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, [addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; ld.param.u64 %rd10, 
[addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 %rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, [addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r1, [addcubicanisotropy2_param_26]; mov.u32 %r2, %nctaid.x; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %ctaid.x; mad.lo.s32 %r5, %r2, %r3, %r4; mov.u32 %r6, %ntid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r8, %r5, %r6, %r7; setp.ge.s32 %p1, %r8, %r1; @%p1 bra BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r8, 4; add.s64 %rd19, %rd17, %rd18; ld.global.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f175, %f174; BB0_5: setp.eq.s64 %p4, %rd8, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r8, 4; add.s64 %rd22, %rd20, %rd21; ld.global.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r8, 4; add.s64 %rd25, %rd23, %rd24; ld.global.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r8, 4; add.s64 %rd28, %rd26, %rd27; ld.global.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; BB0_11: setp.eq.s64 %p7, %rd11, 0; mul.f32 %f11, %f175, %f176; mul.f32 %f12, %f175, %f177; @%p7 bra BB0_13; cvta.to.global.u64 %rd29, 
%rd11; mul.wide.s32 %rd30, %r8, 4; add.s64 %rd31, %rd29, %rd30; ld.global.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; BB0_13: setp.eq.s64 %p8, %rd12, 0; mul.f32 %f15, %f175, %f178; @%p8 bra BB0_15; cvta.to.global.u64 %rd32, %rd12; mul.wide.s32 %rd33, %r8, 4; add.s64 %rd34, %rd32, %rd33; ld.global.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r8, 4; add.s64 %rd37, %rd35, %rd36; ld.global.f32 %f52, [%rd37]; mul.f32 %f181, %f52, %f181; BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; mov.f32 %f182, 0f00000000; setp.eq.f32 %p10, %f20, 0f00000000; @%p10 bra BB0_19; rcp.rn.f32 %f182, %f20; BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r8, 4; add.s64 %rd40, %rd38, %rd39; ld.global.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; BB0_21: mul.f32 %f25, %f179, %f182; mul.f32 %f26, %f180, %f182; mul.f32 %f27, %f181, %f182; setp.eq.s64 %p12, %rd15, 0; @%p12 bra BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r8, 4; add.s64 %rd43, %rd41, %rd42; ld.global.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra BB0_25; cvta.to.global.u64 %rd44, %rd16; mul.wide.s32 %rd45, %r8, 4; add.s64 %rd46, %rd44, %rd45; ld.global.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; BB0_25: mul.f32 %f61, %f184, %f184; fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; mov.f32 %f186, 0f00000000; setp.eq.f32 %p14, %f32, 0f00000000; @%p14 bra BB0_27; rcp.rn.f32 %f186, %f32; BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f26, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f27, %f66; sub.f32 %f68, %f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f27, %f69; mul.f32 %f71, %f25, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f25, %f66; mul.f32 %f74, 
%f26, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r8, 4; add.s64 %rd49, %rd47, %rd48; cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.f32 %f76, [%rd49]; ld.global.f32 %f77, [%rd51]; mul.f32 %f78, %f26, %f77; fma.rn.f32 %f79, %f25, %f76, %f78; ld.global.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f27, %f80, %f79; mul.f32 %f82, %f66, %f77; fma.rn.f32 %f83, %f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f25, %f81; mul.f32 %f92, %f26, %f81; mul.f32 %f93, %f27, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, %f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f105, %f106, %f102; fma.rn.f32 %f110, %f105, %f107, %f103; fma.rn.f32 %f111, %f105, %f108, %f104; mul.f32 %f112, %f11, 0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, %f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f124, %f106, %f121; fma.rn.f32 %f126, %f124, %f107, %f122; fma.rn.f32 %f127, %f124, %f108, %f123; add.f32 %f128, %f12, %f12; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, 
%f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, %f94; mul.f32 %f139, %f25, %f138; mul.f32 %f140, %f26, %f138; mul.f32 %f141, %f27, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, %f146, %f142; fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f15, 0f40800000; mul.f32 %f162, %f161, %f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 [%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; st.global.f32 [%rd59], %f173; BB0_28: ret; } ` addcubicanisotropy2_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, 
.param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, .param .f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred %p<15>; .reg .f32 %f<187>; .reg .b32 %r<86>; .reg .b64 %rd<60>; ld.param.u64 %rd1, [addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, [addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; ld.param.u64 %rd10, [addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 %rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; 
ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, [addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r1, [addcubicanisotropy2_param_26]; mov.u32 %r2, %nctaid.x; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %ctaid.x; mad.lo.s32 %r5, %r2, %r3, %r4; mov.u32 %r6, %ntid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r8, %r5, %r6, %r7; setp.ge.s32 %p1, %r8, %r1; @%p1 bra BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r8, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f175, %f174; BB0_5: setp.eq.s64 %p4, %rd8, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r8, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r8, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r8, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; BB0_11: setp.eq.s64 %p7, %rd11, 0; mul.f32 %f11, %f175, %f176; mul.f32 %f12, %f175, %f177; @%p7 bra BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r8, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; BB0_13: setp.eq.s64 %p8, %rd12, 0; mul.f32 %f15, %f175, %f178; @%p8 bra BB0_15; cvta.to.global.u64 %rd32, %rd12; mul.wide.s32 %rd33, %r8, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r8, 4; add.s64 %rd37, %rd35, 
%rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, %f181; BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; mov.f32 %f182, 0f00000000; setp.eq.f32 %p10, %f20, 0f00000000; @%p10 bra BB0_19; rcp.rn.f32 %f182, %f20; BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r8, 4; add.s64 %rd40, %rd38, %rd39; ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; BB0_21: mul.f32 %f25, %f179, %f182; mul.f32 %f26, %f180, %f182; mul.f32 %f27, %f181, %f182; setp.eq.s64 %p12, %rd15, 0; @%p12 bra BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r8, 4; add.s64 %rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra BB0_25; cvta.to.global.u64 %rd44, %rd16; mul.wide.s32 %rd45, %r8, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; BB0_25: mul.f32 %f61, %f184, %f184; fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; mov.f32 %f186, 0f00000000; setp.eq.f32 %p14, %f32, 0f00000000; @%p14 bra BB0_27; rcp.rn.f32 %f186, %f32; BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f26, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f27, %f66; sub.f32 %f68, %f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f27, %f69; mul.f32 %f71, %f25, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f25, %f66; mul.f32 %f74, %f26, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r8, 4; add.s64 %rd49, %rd47, %rd48; cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.nc.f32 %f76, [%rd49]; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f26, %f77; fma.rn.f32 %f79, %f25, %f76, %f78; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f27, %f80, %f79; mul.f32 %f82, %f66, %f77; 
fma.rn.f32 %f83, %f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f25, %f81; mul.f32 %f92, %f26, %f81; mul.f32 %f93, %f27, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, %f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f105, %f106, %f102; fma.rn.f32 %f110, %f105, %f107, %f103; fma.rn.f32 %f111, %f105, %f108, %f104; mul.f32 %f112, %f11, 0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, %f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f124, %f106, %f121; fma.rn.f32 %f126, %f124, %f107, %f122; fma.rn.f32 %f127, %f124, %f108, %f123; add.f32 %f128, %f12, %f12; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, %f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, %f94; mul.f32 %f139, %f25, %f138; mul.f32 %f140, %f26, %f138; mul.f32 %f141, %f27, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, 
%f146, %f142; fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f15, 0f40800000; mul.f32 %f162, %f161, %f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 [%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; st.global.f32 [%rd59], %f173; BB0_28: ret; } ` addcubicanisotropy2_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, .param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, 
.param .f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred %p<15>; .reg .f32 %f<187>; .reg .b32 %r<86>; .reg .b64 %rd<60>; ld.param.u64 %rd1, [addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, [addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; ld.param.u64 %rd10, [addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 %rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, [addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r1, [addcubicanisotropy2_param_26]; mov.u32 %r2, %nctaid.x; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %ctaid.x; mad.lo.s32 %r5, %r2, %r3, %r4; mov.u32 %r6, %ntid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r8, %r5, %r6, %r7; setp.ge.s32 %p1, %r8, %r1; 
@%p1 bra BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r8, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f175, %f174; BB0_5: setp.eq.s64 %p4, %rd8, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r8, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r8, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r8, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; BB0_11: setp.eq.s64 %p7, %rd11, 0; mul.f32 %f11, %f175, %f176; mul.f32 %f12, %f175, %f177; @%p7 bra BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r8, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; BB0_13: setp.eq.s64 %p8, %rd12, 0; mul.f32 %f15, %f175, %f178; @%p8 bra BB0_15; cvta.to.global.u64 %rd32, %rd12; mul.wide.s32 %rd33, %r8, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r8, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, %f181; BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; mov.f32 %f182, 0f00000000; setp.eq.f32 %p10, %f20, 0f00000000; @%p10 bra BB0_19; rcp.rn.f32 %f182, %f20; BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r8, 4; add.s64 %rd40, %rd38, %rd39; 
ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; BB0_21: mul.f32 %f25, %f179, %f182; mul.f32 %f26, %f180, %f182; mul.f32 %f27, %f181, %f182; setp.eq.s64 %p12, %rd15, 0; @%p12 bra BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r8, 4; add.s64 %rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra BB0_25; cvta.to.global.u64 %rd44, %rd16; mul.wide.s32 %rd45, %r8, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; BB0_25: mul.f32 %f61, %f184, %f184; fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; mov.f32 %f186, 0f00000000; setp.eq.f32 %p14, %f32, 0f00000000; @%p14 bra BB0_27; rcp.rn.f32 %f186, %f32; BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f26, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f27, %f66; sub.f32 %f68, %f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f27, %f69; mul.f32 %f71, %f25, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f25, %f66; mul.f32 %f74, %f26, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r8, 4; add.s64 %rd49, %rd47, %rd48; cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.nc.f32 %f76, [%rd49]; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f26, %f77; fma.rn.f32 %f79, %f25, %f76, %f78; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f27, %f80, %f79; mul.f32 %f82, %f66, %f77; fma.rn.f32 %f83, %f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f25, %f81; mul.f32 %f92, %f26, %f81; mul.f32 %f93, %f27, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, %f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; 
mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f105, %f106, %f102; fma.rn.f32 %f110, %f105, %f107, %f103; fma.rn.f32 %f111, %f105, %f108, %f104; mul.f32 %f112, %f11, 0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, %f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f124, %f106, %f121; fma.rn.f32 %f126, %f124, %f107, %f122; fma.rn.f32 %f127, %f124, %f108, %f123; add.f32 %f128, %f12, %f12; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, %f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, %f94; mul.f32 %f139, %f25, %f138; mul.f32 %f140, %f26, %f138; mul.f32 %f141, %f27, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, %f146, %f142; fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f15, 0f40800000; mul.f32 %f162, %f161, 
%f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 [%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; st.global.f32 [%rd59], %f173; BB0_28: ret; } ` addcubicanisotropy2_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, .param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, .param .f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred %p<15>; .reg .f32 %f<187>; .reg .b32 %r<86>; .reg .b64 %rd<60>; ld.param.u64 %rd1, 
[addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, [addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; ld.param.u64 %rd10, [addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 %rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, [addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r1, [addcubicanisotropy2_param_26]; mov.u32 %r2, %nctaid.x; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %ctaid.x; mad.lo.s32 %r5, %r2, %r3, %r4; mov.u32 %r6, %ntid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r8, %r5, %r6, %r7; setp.ge.s32 %p1, %r8, %r1; @%p1 bra BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r8, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f175, %f174; BB0_5: setp.eq.s64 %p4, %rd8, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r8, 4; 
add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r8, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r8, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; BB0_11: setp.eq.s64 %p7, %rd11, 0; mul.f32 %f11, %f175, %f176; mul.f32 %f12, %f175, %f177; @%p7 bra BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r8, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; BB0_13: setp.eq.s64 %p8, %rd12, 0; mul.f32 %f15, %f175, %f178; @%p8 bra BB0_15; cvta.to.global.u64 %rd32, %rd12; mul.wide.s32 %rd33, %r8, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r8, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, %f181; BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; mov.f32 %f182, 0f00000000; setp.eq.f32 %p10, %f20, 0f00000000; @%p10 bra BB0_19; rcp.rn.f32 %f182, %f20; BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r8, 4; add.s64 %rd40, %rd38, %rd39; ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; BB0_21: mul.f32 %f25, %f179, %f182; mul.f32 %f26, %f180, %f182; mul.f32 %f27, %f181, %f182; setp.eq.s64 %p12, %rd15, 0; @%p12 bra BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r8, 4; add.s64 %rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra BB0_25; cvta.to.global.u64 %rd44, %rd16; 
mul.wide.s32 %rd45, %r8, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; BB0_25: mul.f32 %f61, %f184, %f184; fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; mov.f32 %f186, 0f00000000; setp.eq.f32 %p14, %f32, 0f00000000; @%p14 bra BB0_27; rcp.rn.f32 %f186, %f32; BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f26, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f27, %f66; sub.f32 %f68, %f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f27, %f69; mul.f32 %f71, %f25, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f25, %f66; mul.f32 %f74, %f26, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r8, 4; add.s64 %rd49, %rd47, %rd48; cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.nc.f32 %f76, [%rd49]; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f26, %f77; fma.rn.f32 %f79, %f25, %f76, %f78; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f27, %f80, %f79; mul.f32 %f82, %f66, %f77; fma.rn.f32 %f83, %f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f25, %f81; mul.f32 %f92, %f26, %f81; mul.f32 %f93, %f27, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, %f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f105, %f106, %f102; fma.rn.f32 %f110, %f105, %f107, %f103; fma.rn.f32 %f111, %f105, %f108, %f104; mul.f32 %f112, %f11, 
0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, %f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f124, %f106, %f121; fma.rn.f32 %f126, %f124, %f107, %f122; fma.rn.f32 %f127, %f124, %f108, %f123; add.f32 %f128, %f12, %f12; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, %f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, %f94; mul.f32 %f139, %f25, %f138; mul.f32 %f140, %f26, %f138; mul.f32 %f141, %f27, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, %f146, %f142; fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f15, 0f40800000; mul.f32 %f162, %f161, %f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 
[%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; st.global.f32 [%rd59], %f173; BB0_28: ret; } ` addcubicanisotropy2_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, .param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, .param .f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred %p<15>; .reg .f32 %f<187>; .reg .b32 %r<86>; .reg .b64 %rd<60>; ld.param.u64 %rd1, [addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, [addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, 
[addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; ld.param.u64 %rd10, [addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 %rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, [addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r1, [addcubicanisotropy2_param_26]; mov.u32 %r2, %nctaid.x; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %ctaid.x; mad.lo.s32 %r5, %r2, %r3, %r4; mov.u32 %r6, %ntid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r8, %r5, %r6, %r7; setp.ge.s32 %p1, %r8, %r1; @%p1 bra BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r8, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f175, %f174; BB0_5: setp.eq.s64 %p4, %rd8, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r8, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r8, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r8, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, 
[%rd28]; mul.f32 %f178, %f49, %f178; BB0_11: setp.eq.s64 %p7, %rd11, 0; mul.f32 %f11, %f175, %f176; mul.f32 %f12, %f175, %f177; @%p7 bra BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r8, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; BB0_13: setp.eq.s64 %p8, %rd12, 0; mul.f32 %f15, %f175, %f178; @%p8 bra BB0_15; cvta.to.global.u64 %rd32, %rd12; mul.wide.s32 %rd33, %r8, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r8, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, %f181; BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; mov.f32 %f182, 0f00000000; setp.eq.f32 %p10, %f20, 0f00000000; @%p10 bra BB0_19; rcp.rn.f32 %f182, %f20; BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r8, 4; add.s64 %rd40, %rd38, %rd39; ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; BB0_21: mul.f32 %f25, %f179, %f182; mul.f32 %f26, %f180, %f182; mul.f32 %f27, %f181, %f182; setp.eq.s64 %p12, %rd15, 0; @%p12 bra BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r8, 4; add.s64 %rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra BB0_25; cvta.to.global.u64 %rd44, %rd16; mul.wide.s32 %rd45, %r8, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; BB0_25: mul.f32 %f61, %f184, %f184; fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; mov.f32 %f186, 0f00000000; setp.eq.f32 %p14, %f32, 0f00000000; @%p14 bra BB0_27; rcp.rn.f32 %f186, %f32; BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f26, %f64; mul.f32 %f66, %f184, %f186; mul.f32 
%f67, %f27, %f66; sub.f32 %f68, %f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f27, %f69; mul.f32 %f71, %f25, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f25, %f66; mul.f32 %f74, %f26, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r8, 4; add.s64 %rd49, %rd47, %rd48; cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.nc.f32 %f76, [%rd49]; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f26, %f77; fma.rn.f32 %f79, %f25, %f76, %f78; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f27, %f80, %f79; mul.f32 %f82, %f66, %f77; fma.rn.f32 %f83, %f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f25, %f81; mul.f32 %f92, %f26, %f81; mul.f32 %f93, %f27, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, %f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f105, %f106, %f102; fma.rn.f32 %f110, %f105, %f107, %f103; fma.rn.f32 %f111, %f105, %f108, %f104; mul.f32 %f112, %f11, 0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, %f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f124, %f106, %f121; fma.rn.f32 %f126, %f124, %f107, 
%f122; fma.rn.f32 %f127, %f124, %f108, %f123; add.f32 %f128, %f12, %f12; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, %f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, %f94; mul.f32 %f139, %f25, %f138; mul.f32 %f140, %f26, %f138; mul.f32 %f141, %f27, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, %f146, %f142; fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f15, 0f40800000; mul.f32 %f162, %f161, %f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 [%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; st.global.f32 [%rd59], %f173; BB0_28: ret; } ` addcubicanisotropy2_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, 
.param .u64 addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, .param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, .param .f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred %p<15>; .reg .f32 %f<187>; .reg .b32 %r<86>; .reg .b64 %rd<60>; ld.param.u64 %rd1, [addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, [addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; ld.param.u64 %rd10, [addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; 
ld.param.u64 %rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, [addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r1, [addcubicanisotropy2_param_26]; mov.u32 %r2, %nctaid.x; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %ctaid.x; mad.lo.s32 %r5, %r2, %r3, %r4; mov.u32 %r6, %ntid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r8, %r5, %r6, %r7; setp.ge.s32 %p1, %r8, %r1; @%p1 bra BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r8, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f175, %f174; BB0_5: setp.eq.s64 %p4, %rd8, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r8, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r8, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r8, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; BB0_11: setp.eq.s64 %p7, %rd11, 0; mul.f32 %f11, %f175, %f176; mul.f32 %f12, %f175, %f177; @%p7 bra BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r8, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; BB0_13: setp.eq.s64 %p8, %rd12, 0; mul.f32 %f15, %f175, %f178; @%p8 bra BB0_15; cvta.to.global.u64 %rd32, %rd12; mul.wide.s32 %rd33, %r8, 4; add.s64 %rd34, 
%rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r8, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, %f181; BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; mov.f32 %f182, 0f00000000; setp.eq.f32 %p10, %f20, 0f00000000; @%p10 bra BB0_19; rcp.rn.f32 %f182, %f20; BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r8, 4; add.s64 %rd40, %rd38, %rd39; ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; BB0_21: mul.f32 %f25, %f179, %f182; mul.f32 %f26, %f180, %f182; mul.f32 %f27, %f181, %f182; setp.eq.s64 %p12, %rd15, 0; @%p12 bra BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r8, 4; add.s64 %rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra BB0_25; cvta.to.global.u64 %rd44, %rd16; mul.wide.s32 %rd45, %r8, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; BB0_25: mul.f32 %f61, %f184, %f184; fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; mov.f32 %f186, 0f00000000; setp.eq.f32 %p14, %f32, 0f00000000; @%p14 bra BB0_27; rcp.rn.f32 %f186, %f32; BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f26, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f27, %f66; sub.f32 %f68, %f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f27, %f69; mul.f32 %f71, %f25, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f25, %f66; mul.f32 %f74, %f26, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r8, 4; add.s64 %rd49, %rd47, %rd48; cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; 
ld.global.nc.f32 %f76, [%rd49]; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f26, %f77; fma.rn.f32 %f79, %f25, %f76, %f78; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f27, %f80, %f79; mul.f32 %f82, %f66, %f77; fma.rn.f32 %f83, %f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f25, %f81; mul.f32 %f92, %f26, %f81; mul.f32 %f93, %f27, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, %f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f105, %f106, %f102; fma.rn.f32 %f110, %f105, %f107, %f103; fma.rn.f32 %f111, %f105, %f108, %f104; mul.f32 %f112, %f11, 0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, %f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f124, %f106, %f121; fma.rn.f32 %f126, %f124, %f107, %f122; fma.rn.f32 %f127, %f124, %f108, %f123; add.f32 %f128, %f12, %f12; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, %f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, %f94; mul.f32 %f139, %f25, %f138; mul.f32 %f140, %f26, %f138; mul.f32 %f141, %f27, %f138; 
fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, %f146, %f142; fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f15, 0f40800000; mul.f32 %f162, %f161, %f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 [%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; st.global.f32 [%rd59], %f173; BB0_28: ret; } ` addcubicanisotropy2_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, .param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 
addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, .param .f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred %p<15>; .reg .f32 %f<187>; .reg .b32 %r<86>; .reg .b64 %rd<60>; ld.param.u64 %rd1, [addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, [addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; ld.param.u64 %rd10, [addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 %rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, [addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 
%r1, [addcubicanisotropy2_param_26]; mov.u32 %r2, %nctaid.x; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %ctaid.x; mad.lo.s32 %r5, %r2, %r3, %r4; mov.u32 %r6, %ntid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r8, %r5, %r6, %r7; setp.ge.s32 %p1, %r8, %r1; @%p1 bra BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r8, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f175, %f174; BB0_5: setp.eq.s64 %p4, %rd8, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r8, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r8, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r8, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; BB0_11: setp.eq.s64 %p7, %rd11, 0; mul.f32 %f11, %f175, %f176; mul.f32 %f12, %f175, %f177; @%p7 bra BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r8, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; BB0_13: setp.eq.s64 %p8, %rd12, 0; mul.f32 %f15, %f175, %f178; @%p8 bra BB0_15; cvta.to.global.u64 %rd32, %rd12; mul.wide.s32 %rd33, %r8, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r8, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, %f181; BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; mov.f32 %f182, 
0f00000000; setp.eq.f32 %p10, %f20, 0f00000000; @%p10 bra BB0_19; rcp.rn.f32 %f182, %f20; BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r8, 4; add.s64 %rd40, %rd38, %rd39; ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; BB0_21: mul.f32 %f25, %f179, %f182; mul.f32 %f26, %f180, %f182; mul.f32 %f27, %f181, %f182; setp.eq.s64 %p12, %rd15, 0; @%p12 bra BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r8, 4; add.s64 %rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra BB0_25; cvta.to.global.u64 %rd44, %rd16; mul.wide.s32 %rd45, %r8, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; BB0_25: mul.f32 %f61, %f184, %f184; fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; mov.f32 %f186, 0f00000000; setp.eq.f32 %p14, %f32, 0f00000000; @%p14 bra BB0_27; rcp.rn.f32 %f186, %f32; BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f26, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f27, %f66; sub.f32 %f68, %f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f27, %f69; mul.f32 %f71, %f25, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f25, %f66; mul.f32 %f74, %f26, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r8, 4; add.s64 %rd49, %rd47, %rd48; cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.nc.f32 %f76, [%rd49]; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f26, %f77; fma.rn.f32 %f79, %f25, %f76, %f78; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f27, %f80, %f79; mul.f32 %f82, %f66, %f77; fma.rn.f32 %f83, %f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; 
add.f32 %f90, %f88, %f89; mul.f32 %f91, %f25, %f81; mul.f32 %f92, %f26, %f81; mul.f32 %f93, %f27, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, %f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f105, %f106, %f102; fma.rn.f32 %f110, %f105, %f107, %f103; fma.rn.f32 %f111, %f105, %f108, %f104; mul.f32 %f112, %f11, 0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, %f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f124, %f106, %f121; fma.rn.f32 %f126, %f124, %f107, %f122; fma.rn.f32 %f127, %f124, %f108, %f123; add.f32 %f128, %f12, %f12; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, %f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, %f94; mul.f32 %f139, %f25, %f138; mul.f32 %f140, %f26, %f138; mul.f32 %f141, %f27, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, %f146, %f142; fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, 
%f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f15, 0f40800000; mul.f32 %f162, %f161, %f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 [%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; st.global.f32 [%rd59], %f173; BB0_28: ret; } ` addcubicanisotropy2_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, .param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, .param .f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, 
.param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred %p<15>; .reg .f32 %f<187>; .reg .b32 %r<86>; .reg .b64 %rd<60>; ld.param.u64 %rd1, [addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, [addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; ld.param.u64 %rd10, [addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 %rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, [addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r1, [addcubicanisotropy2_param_26]; mov.u32 %r2, %nctaid.x; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %ctaid.x; mad.lo.s32 %r5, %r2, %r3, %r4; mov.u32 %r6, %ntid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r8, %r5, %r6, %r7; setp.ge.s32 %p1, %r8, %r1; @%p1 bra BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r8, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; 
BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f175, %f174; BB0_5: setp.eq.s64 %p4, %rd8, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r8, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r8, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r8, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; BB0_11: setp.eq.s64 %p7, %rd11, 0; mul.f32 %f11, %f175, %f176; mul.f32 %f12, %f175, %f177; @%p7 bra BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r8, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; BB0_13: setp.eq.s64 %p8, %rd12, 0; mul.f32 %f15, %f175, %f178; @%p8 bra BB0_15; cvta.to.global.u64 %rd32, %rd12; mul.wide.s32 %rd33, %r8, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r8, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, %f181; BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; mov.f32 %f182, 0f00000000; setp.eq.f32 %p10, %f20, 0f00000000; @%p10 bra BB0_19; rcp.rn.f32 %f182, %f20; BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r8, 4; add.s64 %rd40, %rd38, %rd39; ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; BB0_21: mul.f32 %f25, %f179, %f182; mul.f32 %f26, %f180, %f182; mul.f32 %f27, %f181, %f182; setp.eq.s64 %p12, %rd15, 0; @%p12 bra BB0_23; cvta.to.global.u64 
%rd41, %rd15; mul.wide.s32 %rd42, %r8, 4; add.s64 %rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra BB0_25; cvta.to.global.u64 %rd44, %rd16; mul.wide.s32 %rd45, %r8, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; BB0_25: mul.f32 %f61, %f184, %f184; fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; mov.f32 %f186, 0f00000000; setp.eq.f32 %p14, %f32, 0f00000000; @%p14 bra BB0_27; rcp.rn.f32 %f186, %f32; BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f26, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f27, %f66; sub.f32 %f68, %f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f27, %f69; mul.f32 %f71, %f25, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f25, %f66; mul.f32 %f74, %f26, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r8, 4; add.s64 %rd49, %rd47, %rd48; cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.nc.f32 %f76, [%rd49]; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f26, %f77; fma.rn.f32 %f79, %f25, %f76, %f78; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f27, %f80, %f79; mul.f32 %f82, %f66, %f77; fma.rn.f32 %f83, %f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f25, %f81; mul.f32 %f92, %f26, %f81; mul.f32 %f93, %f27, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, %f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; 
mul.f32 %f106, %f68, %f87; mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f105, %f106, %f102; fma.rn.f32 %f110, %f105, %f107, %f103; fma.rn.f32 %f111, %f105, %f108, %f104; mul.f32 %f112, %f11, 0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, %f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f124, %f106, %f121; fma.rn.f32 %f126, %f124, %f107, %f122; fma.rn.f32 %f127, %f124, %f108, %f123; add.f32 %f128, %f12, %f12; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, %f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, %f94; mul.f32 %f139, %f25, %f138; mul.f32 %f140, %f26, %f138; mul.f32 %f141, %f27, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, %f146, %f142; fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f15, 0f40800000; mul.f32 %f162, %f161, %f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; 
ld.global.f32 %f168, [%rd55]; add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 [%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; st.global.f32 [%rd59], %f173; BB0_28: ret; } ` addcubicanisotropy2_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, .param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, .param .f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred %p<15>; .reg .f32 %f<187>; .reg .b32 %r<86>; .reg .b64 %rd<60>; ld.param.u64 %rd1, [addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; 
ld.param.u64 %rd6, [addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; ld.param.u64 %rd10, [addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 %rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, [addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r1, [addcubicanisotropy2_param_26]; mov.u32 %r2, %nctaid.x; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %ctaid.x; mad.lo.s32 %r5, %r2, %r3, %r4; mov.u32 %r6, %ntid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r8, %r5, %r6, %r7; setp.ge.s32 %p1, %r8, %r1; @%p1 bra BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r8, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f175, %f174; BB0_5: setp.eq.s64 %p4, %rd8, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r8, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r8, 4; add.s64 %rd25, %rd23, %rd24; 
ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r8, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; BB0_11: setp.eq.s64 %p7, %rd11, 0; mul.f32 %f11, %f175, %f176; mul.f32 %f12, %f175, %f177; @%p7 bra BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r8, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; BB0_13: setp.eq.s64 %p8, %rd12, 0; mul.f32 %f15, %f175, %f178; @%p8 bra BB0_15; cvta.to.global.u64 %rd32, %rd12; mul.wide.s32 %rd33, %r8, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r8, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, %f181; BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; mov.f32 %f182, 0f00000000; setp.eq.f32 %p10, %f20, 0f00000000; @%p10 bra BB0_19; rcp.rn.f32 %f182, %f20; BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r8, 4; add.s64 %rd40, %rd38, %rd39; ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; BB0_21: mul.f32 %f25, %f179, %f182; mul.f32 %f26, %f180, %f182; mul.f32 %f27, %f181, %f182; setp.eq.s64 %p12, %rd15, 0; @%p12 bra BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r8, 4; add.s64 %rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra BB0_25; cvta.to.global.u64 %rd44, %rd16; mul.wide.s32 %rd45, %r8, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; BB0_25: mul.f32 %f61, %f184, %f184; fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; 
sqrt.rn.f32 %f32, %f63; mov.f32 %f186, 0f00000000; setp.eq.f32 %p14, %f32, 0f00000000; @%p14 bra BB0_27; rcp.rn.f32 %f186, %f32; BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f26, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f27, %f66; sub.f32 %f68, %f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f27, %f69; mul.f32 %f71, %f25, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f25, %f66; mul.f32 %f74, %f26, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r8, 4; add.s64 %rd49, %rd47, %rd48; cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.nc.f32 %f76, [%rd49]; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f26, %f77; fma.rn.f32 %f79, %f25, %f76, %f78; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f27, %f80, %f79; mul.f32 %f82, %f66, %f77; fma.rn.f32 %f83, %f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f25, %f81; mul.f32 %f92, %f26, %f81; mul.f32 %f93, %f27, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, %f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f105, %f106, %f102; fma.rn.f32 %f110, %f105, %f107, %f103; fma.rn.f32 %f111, %f105, %f108, %f104; mul.f32 %f112, %f11, 0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, %f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, 
%f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f124, %f106, %f121; fma.rn.f32 %f126, %f124, %f107, %f122; fma.rn.f32 %f127, %f124, %f108, %f123; add.f32 %f128, %f12, %f12; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, %f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, %f94; mul.f32 %f139, %f25, %f138; mul.f32 %f140, %f26, %f138; mul.f32 %f141, %f27, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, %f146, %f142; fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f15, 0f40800000; mul.f32 %f162, %f161, %f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 [%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; st.global.f32 [%rd59], %f173; BB0_28: ret; } ` addcubicanisotropy2_ptx_62 = ` .version 
6.5 .target sm_62 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, .param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, .param .f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred %p<15>; .reg .f32 %f<187>; .reg .b32 %r<86>; .reg .b64 %rd<60>; ld.param.u64 %rd1, [addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, [addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; ld.param.u64 %rd10, [addcubicanisotropy2_param_12]; ld.param.f32 %f178, 
[addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 %rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, [addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r1, [addcubicanisotropy2_param_26]; mov.u32 %r2, %nctaid.x; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %ctaid.x; mad.lo.s32 %r5, %r2, %r3, %r4; mov.u32 %r6, %ntid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r8, %r5, %r6, %r7; setp.ge.s32 %p1, %r8, %r1; @%p1 bra BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r8, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f175, %f174; BB0_5: setp.eq.s64 %p4, %rd8, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r8, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r8, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r8, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; BB0_11: setp.eq.s64 %p7, %rd11, 0; mul.f32 %f11, %f175, %f176; mul.f32 %f12, %f175, %f177; @%p7 bra BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r8, 4; 
add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; BB0_13: setp.eq.s64 %p8, %rd12, 0; mul.f32 %f15, %f175, %f178; @%p8 bra BB0_15; cvta.to.global.u64 %rd32, %rd12; mul.wide.s32 %rd33, %r8, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r8, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, %f181; BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; mov.f32 %f182, 0f00000000; setp.eq.f32 %p10, %f20, 0f00000000; @%p10 bra BB0_19; rcp.rn.f32 %f182, %f20; BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r8, 4; add.s64 %rd40, %rd38, %rd39; ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; BB0_21: mul.f32 %f25, %f179, %f182; mul.f32 %f26, %f180, %f182; mul.f32 %f27, %f181, %f182; setp.eq.s64 %p12, %rd15, 0; @%p12 bra BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r8, 4; add.s64 %rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra BB0_25; cvta.to.global.u64 %rd44, %rd16; mul.wide.s32 %rd45, %r8, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; BB0_25: mul.f32 %f61, %f184, %f184; fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; mov.f32 %f186, 0f00000000; setp.eq.f32 %p14, %f32, 0f00000000; @%p14 bra BB0_27; rcp.rn.f32 %f186, %f32; BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f26, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f27, %f66; sub.f32 %f68, %f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f27, %f69; mul.f32 %f71, %f25, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f25, %f66; mul.f32 %f74, %f26, %f69; 
sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r8, 4; add.s64 %rd49, %rd47, %rd48; cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.nc.f32 %f76, [%rd49]; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f26, %f77; fma.rn.f32 %f79, %f25, %f76, %f78; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f27, %f80, %f79; mul.f32 %f82, %f66, %f77; fma.rn.f32 %f83, %f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f25, %f81; mul.f32 %f92, %f26, %f81; mul.f32 %f93, %f27, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, %f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f105, %f106, %f102; fma.rn.f32 %f110, %f105, %f107, %f103; fma.rn.f32 %f111, %f105, %f108, %f104; mul.f32 %f112, %f11, 0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, %f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f124, %f106, %f121; fma.rn.f32 %f126, %f124, %f107, %f122; fma.rn.f32 %f127, %f124, %f108, %f123; add.f32 %f128, %f12, %f12; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, 
%f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, %f94; mul.f32 %f139, %f25, %f138; mul.f32 %f140, %f26, %f138; mul.f32 %f141, %f27, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, %f146, %f142; fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f15, 0f40800000; mul.f32 %f162, %f161, %f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 [%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; st.global.f32 [%rd59], %f173; BB0_28: ret; } ` addcubicanisotropy2_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, 
.param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, .param .f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred %p<15>; .reg .f32 %f<187>; .reg .b32 %r<86>; .reg .b64 %rd<60>; ld.param.u64 %rd1, [addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, [addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; ld.param.u64 %rd10, [addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 %rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; 
ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, [addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r1, [addcubicanisotropy2_param_26]; mov.u32 %r2, %nctaid.x; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %ctaid.x; mad.lo.s32 %r5, %r2, %r3, %r4; mov.u32 %r6, %ntid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r8, %r5, %r6, %r7; setp.ge.s32 %p1, %r8, %r1; @%p1 bra BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r8, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f175, %f174; BB0_5: setp.eq.s64 %p4, %rd8, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r8, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r8, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r8, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; BB0_11: setp.eq.s64 %p7, %rd11, 0; mul.f32 %f11, %f175, %f176; mul.f32 %f12, %f175, %f177; @%p7 bra BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r8, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; BB0_13: setp.eq.s64 %p8, %rd12, 0; mul.f32 %f15, %f175, %f178; @%p8 bra BB0_15; cvta.to.global.u64 %rd32, %rd12; mul.wide.s32 %rd33, %r8, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r8, 4; add.s64 %rd37, %rd35, 
%rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, %f181; BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; mov.f32 %f182, 0f00000000; setp.eq.f32 %p10, %f20, 0f00000000; @%p10 bra BB0_19; rcp.rn.f32 %f182, %f20; BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r8, 4; add.s64 %rd40, %rd38, %rd39; ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; BB0_21: mul.f32 %f25, %f179, %f182; mul.f32 %f26, %f180, %f182; mul.f32 %f27, %f181, %f182; setp.eq.s64 %p12, %rd15, 0; @%p12 bra BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r8, 4; add.s64 %rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra BB0_25; cvta.to.global.u64 %rd44, %rd16; mul.wide.s32 %rd45, %r8, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; BB0_25: mul.f32 %f61, %f184, %f184; fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; mov.f32 %f186, 0f00000000; setp.eq.f32 %p14, %f32, 0f00000000; @%p14 bra BB0_27; rcp.rn.f32 %f186, %f32; BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f26, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f27, %f66; sub.f32 %f68, %f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f27, %f69; mul.f32 %f71, %f25, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f25, %f66; mul.f32 %f74, %f26, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r8, 4; add.s64 %rd49, %rd47, %rd48; cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.nc.f32 %f76, [%rd49]; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f26, %f77; fma.rn.f32 %f79, %f25, %f76, %f78; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f27, %f80, %f79; mul.f32 %f82, %f66, %f77; 
fma.rn.f32 %f83, %f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f25, %f81; mul.f32 %f92, %f26, %f81; mul.f32 %f93, %f27, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, %f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f105, %f106, %f102; fma.rn.f32 %f110, %f105, %f107, %f103; fma.rn.f32 %f111, %f105, %f108, %f104; mul.f32 %f112, %f11, 0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, %f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f124, %f106, %f121; fma.rn.f32 %f126, %f124, %f107, %f122; fma.rn.f32 %f127, %f124, %f108, %f123; add.f32 %f128, %f12, %f12; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, %f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, %f94; mul.f32 %f139, %f25, %f138; mul.f32 %f140, %f26, %f138; mul.f32 %f141, %f27, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, 
%f146, %f142; fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f15, 0f40800000; mul.f32 %f162, %f161, %f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 [%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; st.global.f32 [%rd59], %f173; BB0_28: ret; } ` addcubicanisotropy2_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, .param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, 
.param .f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred %p<15>; .reg .f32 %f<187>; .reg .b32 %r<86>; .reg .b64 %rd<60>; ld.param.u64 %rd1, [addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, [addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; ld.param.u64 %rd10, [addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 %rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, [addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r1, [addcubicanisotropy2_param_26]; mov.u32 %r2, %nctaid.x; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %ctaid.x; mad.lo.s32 %r5, %r2, %r3, %r4; mov.u32 %r6, %ntid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r8, %r5, %r6, %r7; setp.ge.s32 %p1, %r8, %r1; 
@%p1 bra BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r8, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f175, %f174; BB0_5: setp.eq.s64 %p4, %rd8, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r8, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r8, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r8, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; BB0_11: setp.eq.s64 %p7, %rd11, 0; mul.f32 %f11, %f175, %f176; mul.f32 %f12, %f175, %f177; @%p7 bra BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r8, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; BB0_13: setp.eq.s64 %p8, %rd12, 0; mul.f32 %f15, %f175, %f178; @%p8 bra BB0_15; cvta.to.global.u64 %rd32, %rd12; mul.wide.s32 %rd33, %r8, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r8, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, %f181; BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; mov.f32 %f182, 0f00000000; setp.eq.f32 %p10, %f20, 0f00000000; @%p10 bra BB0_19; rcp.rn.f32 %f182, %f20; BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r8, 4; add.s64 %rd40, %rd38, %rd39; 
ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; BB0_21: mul.f32 %f25, %f179, %f182; mul.f32 %f26, %f180, %f182; mul.f32 %f27, %f181, %f182; setp.eq.s64 %p12, %rd15, 0; @%p12 bra BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r8, 4; add.s64 %rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra BB0_25; cvta.to.global.u64 %rd44, %rd16; mul.wide.s32 %rd45, %r8, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; BB0_25: mul.f32 %f61, %f184, %f184; fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; mov.f32 %f186, 0f00000000; setp.eq.f32 %p14, %f32, 0f00000000; @%p14 bra BB0_27; rcp.rn.f32 %f186, %f32; BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f26, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f27, %f66; sub.f32 %f68, %f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f27, %f69; mul.f32 %f71, %f25, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f25, %f66; mul.f32 %f74, %f26, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r8, 4; add.s64 %rd49, %rd47, %rd48; cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.nc.f32 %f76, [%rd49]; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f26, %f77; fma.rn.f32 %f79, %f25, %f76, %f78; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f27, %f80, %f79; mul.f32 %f82, %f66, %f77; fma.rn.f32 %f83, %f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f25, %f81; mul.f32 %f92, %f26, %f81; mul.f32 %f93, %f27, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, %f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; 
mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f105, %f106, %f102; fma.rn.f32 %f110, %f105, %f107, %f103; fma.rn.f32 %f111, %f105, %f108, %f104; mul.f32 %f112, %f11, 0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, %f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f124, %f106, %f121; fma.rn.f32 %f126, %f124, %f107, %f122; fma.rn.f32 %f127, %f124, %f108, %f123; add.f32 %f128, %f12, %f12; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, %f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, %f94; mul.f32 %f139, %f25, %f138; mul.f32 %f140, %f26, %f138; mul.f32 %f141, %f27, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, %f146, %f142; fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f15, 0f40800000; mul.f32 %f162, %f161, 
%f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 [%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; st.global.f32 [%rd59], %f173; BB0_28: ret; } ` addcubicanisotropy2_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl addcubicanisotropy2 .visible .entry addcubicanisotropy2( .param .u64 addcubicanisotropy2_param_0, .param .u64 addcubicanisotropy2_param_1, .param .u64 addcubicanisotropy2_param_2, .param .u64 addcubicanisotropy2_param_3, .param .u64 addcubicanisotropy2_param_4, .param .u64 addcubicanisotropy2_param_5, .param .u64 addcubicanisotropy2_param_6, .param .f32 addcubicanisotropy2_param_7, .param .u64 addcubicanisotropy2_param_8, .param .f32 addcubicanisotropy2_param_9, .param .u64 addcubicanisotropy2_param_10, .param .f32 addcubicanisotropy2_param_11, .param .u64 addcubicanisotropy2_param_12, .param .f32 addcubicanisotropy2_param_13, .param .u64 addcubicanisotropy2_param_14, .param .f32 addcubicanisotropy2_param_15, .param .u64 addcubicanisotropy2_param_16, .param .f32 addcubicanisotropy2_param_17, .param .u64 addcubicanisotropy2_param_18, .param .f32 addcubicanisotropy2_param_19, .param .u64 addcubicanisotropy2_param_20, .param .f32 addcubicanisotropy2_param_21, .param .u64 addcubicanisotropy2_param_22, .param .f32 addcubicanisotropy2_param_23, .param .u64 addcubicanisotropy2_param_24, .param .f32 addcubicanisotropy2_param_25, .param .u32 addcubicanisotropy2_param_26 ) { .reg .pred %p<15>; .reg .f32 %f<187>; .reg .b32 %r<86>; .reg .b64 %rd<60>; ld.param.u64 %rd1, 
[addcubicanisotropy2_param_0]; ld.param.u64 %rd2, [addcubicanisotropy2_param_1]; ld.param.u64 %rd3, [addcubicanisotropy2_param_2]; ld.param.u64 %rd4, [addcubicanisotropy2_param_3]; ld.param.u64 %rd5, [addcubicanisotropy2_param_4]; ld.param.u64 %rd6, [addcubicanisotropy2_param_5]; ld.param.u64 %rd7, [addcubicanisotropy2_param_6]; ld.param.f32 %f174, [addcubicanisotropy2_param_7]; ld.param.u64 %rd8, [addcubicanisotropy2_param_8]; ld.param.f32 %f176, [addcubicanisotropy2_param_9]; ld.param.u64 %rd9, [addcubicanisotropy2_param_10]; ld.param.f32 %f177, [addcubicanisotropy2_param_11]; ld.param.u64 %rd10, [addcubicanisotropy2_param_12]; ld.param.f32 %f178, [addcubicanisotropy2_param_13]; ld.param.u64 %rd11, [addcubicanisotropy2_param_14]; ld.param.f32 %f179, [addcubicanisotropy2_param_15]; ld.param.u64 %rd12, [addcubicanisotropy2_param_16]; ld.param.f32 %f180, [addcubicanisotropy2_param_17]; ld.param.u64 %rd13, [addcubicanisotropy2_param_18]; ld.param.f32 %f181, [addcubicanisotropy2_param_19]; ld.param.u64 %rd14, [addcubicanisotropy2_param_20]; ld.param.f32 %f183, [addcubicanisotropy2_param_21]; ld.param.u64 %rd15, [addcubicanisotropy2_param_22]; ld.param.f32 %f184, [addcubicanisotropy2_param_23]; ld.param.u64 %rd16, [addcubicanisotropy2_param_24]; ld.param.f32 %f185, [addcubicanisotropy2_param_25]; ld.param.u32 %r1, [addcubicanisotropy2_param_26]; mov.u32 %r2, %nctaid.x; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %ctaid.x; mad.lo.s32 %r5, %r2, %r3, %r4; mov.u32 %r6, %ntid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r8, %r5, %r6, %r7; setp.ge.s32 %p1, %r8, %r1; @%p1 bra BB0_28; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd17, %rd7; mul.wide.s32 %rd18, %r8, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f45, [%rd19]; mul.f32 %f174, %f45, %f174; BB0_3: setp.eq.f32 %p3, %f174, 0f00000000; mov.f32 %f175, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f175, %f174; BB0_5: setp.eq.s64 %p4, %rd8, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd20, %rd8; mul.wide.s32 %rd21, %r8, 4; 
add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f47, [%rd22]; mul.f32 %f176, %f47, %f176; BB0_7: setp.eq.s64 %p5, %rd9, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd23, %rd9; mul.wide.s32 %rd24, %r8, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f48, [%rd25]; mul.f32 %f177, %f48, %f177; BB0_9: setp.eq.s64 %p6, %rd10, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd26, %rd10; mul.wide.s32 %rd27, %r8, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f49, [%rd28]; mul.f32 %f178, %f49, %f178; BB0_11: setp.eq.s64 %p7, %rd11, 0; mul.f32 %f11, %f175, %f176; mul.f32 %f12, %f175, %f177; @%p7 bra BB0_13; cvta.to.global.u64 %rd29, %rd11; mul.wide.s32 %rd30, %r8, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f50, [%rd31]; mul.f32 %f179, %f50, %f179; BB0_13: setp.eq.s64 %p8, %rd12, 0; mul.f32 %f15, %f175, %f178; @%p8 bra BB0_15; cvta.to.global.u64 %rd32, %rd12; mul.wide.s32 %rd33, %r8, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f51, [%rd34]; mul.f32 %f180, %f51, %f180; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd35, %rd13; mul.wide.s32 %rd36, %r8, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f52, [%rd37]; mul.f32 %f181, %f52, %f181; BB0_17: mul.f32 %f54, %f180, %f180; fma.rn.f32 %f55, %f179, %f179, %f54; fma.rn.f32 %f56, %f181, %f181, %f55; sqrt.rn.f32 %f20, %f56; mov.f32 %f182, 0f00000000; setp.eq.f32 %p10, %f20, 0f00000000; @%p10 bra BB0_19; rcp.rn.f32 %f182, %f20; BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd38, %rd14; mul.wide.s32 %rd39, %r8, 4; add.s64 %rd40, %rd38, %rd39; ld.global.nc.f32 %f57, [%rd40]; mul.f32 %f183, %f57, %f183; BB0_21: mul.f32 %f25, %f179, %f182; mul.f32 %f26, %f180, %f182; mul.f32 %f27, %f181, %f182; setp.eq.s64 %p12, %rd15, 0; @%p12 bra BB0_23; cvta.to.global.u64 %rd41, %rd15; mul.wide.s32 %rd42, %r8, 4; add.s64 %rd43, %rd41, %rd42; ld.global.nc.f32 %f58, [%rd43]; mul.f32 %f184, %f58, %f184; BB0_23: setp.eq.s64 %p13, %rd16, 0; @%p13 bra BB0_25; cvta.to.global.u64 %rd44, %rd16; 
mul.wide.s32 %rd45, %r8, 4; add.s64 %rd46, %rd44, %rd45; ld.global.nc.f32 %f59, [%rd46]; mul.f32 %f185, %f59, %f185; BB0_25: mul.f32 %f61, %f184, %f184; fma.rn.f32 %f62, %f183, %f183, %f61; fma.rn.f32 %f63, %f185, %f185, %f62; sqrt.rn.f32 %f32, %f63; mov.f32 %f186, 0f00000000; setp.eq.f32 %p14, %f32, 0f00000000; @%p14 bra BB0_27; rcp.rn.f32 %f186, %f32; BB0_27: mul.f32 %f64, %f185, %f186; mul.f32 %f65, %f26, %f64; mul.f32 %f66, %f184, %f186; mul.f32 %f67, %f27, %f66; sub.f32 %f68, %f65, %f67; mul.f32 %f69, %f183, %f186; mul.f32 %f70, %f27, %f69; mul.f32 %f71, %f25, %f64; sub.f32 %f72, %f70, %f71; mul.f32 %f73, %f25, %f66; mul.f32 %f74, %f26, %f69; sub.f32 %f75, %f73, %f74; cvta.to.global.u64 %rd47, %rd4; mul.wide.s32 %rd48, %r8, 4; add.s64 %rd49, %rd47, %rd48; cvta.to.global.u64 %rd50, %rd5; add.s64 %rd51, %rd50, %rd48; cvta.to.global.u64 %rd52, %rd6; add.s64 %rd53, %rd52, %rd48; ld.global.nc.f32 %f76, [%rd49]; ld.global.nc.f32 %f77, [%rd51]; mul.f32 %f78, %f26, %f77; fma.rn.f32 %f79, %f25, %f76, %f78; ld.global.nc.f32 %f80, [%rd53]; fma.rn.f32 %f81, %f27, %f80, %f79; mul.f32 %f82, %f66, %f77; fma.rn.f32 %f83, %f69, %f76, %f82; fma.rn.f32 %f84, %f64, %f80, %f83; mul.f32 %f85, %f77, %f72; fma.rn.f32 %f86, %f76, %f68, %f85; fma.rn.f32 %f87, %f75, %f80, %f86; mul.f32 %f88, %f84, %f84; mul.f32 %f89, %f87, %f87; add.f32 %f90, %f88, %f89; mul.f32 %f91, %f25, %f81; mul.f32 %f92, %f26, %f81; mul.f32 %f93, %f27, %f81; mul.f32 %f94, %f81, %f81; add.f32 %f95, %f94, %f89; mul.f32 %f96, %f69, %f84; mul.f32 %f97, %f66, %f84; mul.f32 %f98, %f64, %f84; mul.f32 %f99, %f96, %f95; mul.f32 %f100, %f97, %f95; mul.f32 %f101, %f98, %f95; fma.rn.f32 %f102, %f91, %f90, %f99; fma.rn.f32 %f103, %f92, %f90, %f100; fma.rn.f32 %f104, %f93, %f90, %f101; add.f32 %f105, %f94, %f88; mul.f32 %f106, %f68, %f87; mul.f32 %f107, %f72, %f87; mul.f32 %f108, %f75, %f87; fma.rn.f32 %f109, %f105, %f106, %f102; fma.rn.f32 %f110, %f105, %f107, %f103; fma.rn.f32 %f111, %f105, %f108, %f104; mul.f32 %f112, %f11, 
0fC0000000; mul.f32 %f113, %f112, %f109; mul.f32 %f114, %f112, %f110; mul.f32 %f115, %f112, %f111; mul.f32 %f116, %f88, %f89; mul.f32 %f117, %f94, %f89; mul.f32 %f118, %f96, %f117; mul.f32 %f119, %f97, %f117; mul.f32 %f120, %f98, %f117; fma.rn.f32 %f121, %f91, %f116, %f118; fma.rn.f32 %f122, %f92, %f116, %f119; fma.rn.f32 %f123, %f93, %f116, %f120; mul.f32 %f124, %f94, %f88; fma.rn.f32 %f125, %f124, %f106, %f121; fma.rn.f32 %f126, %f124, %f107, %f122; fma.rn.f32 %f127, %f124, %f108, %f123; add.f32 %f128, %f12, %f12; mul.f32 %f129, %f128, %f125; mul.f32 %f130, %f128, %f126; mul.f32 %f131, %f128, %f127; sub.f32 %f132, %f113, %f129; sub.f32 %f133, %f114, %f130; sub.f32 %f134, %f115, %f131; mul.f32 %f135, %f88, %f88; mul.f32 %f136, %f89, %f89; add.f32 %f137, %f135, %f136; mul.f32 %f138, %f81, %f94; mul.f32 %f139, %f25, %f138; mul.f32 %f140, %f26, %f138; mul.f32 %f141, %f27, %f138; fma.rn.f32 %f142, %f94, %f94, %f136; mul.f32 %f143, %f84, %f88; mul.f32 %f144, %f69, %f143; mul.f32 %f145, %f66, %f143; mul.f32 %f146, %f64, %f143; mul.f32 %f147, %f144, %f142; mul.f32 %f148, %f145, %f142; mul.f32 %f149, %f146, %f142; fma.rn.f32 %f150, %f139, %f137, %f147; fma.rn.f32 %f151, %f140, %f137, %f148; fma.rn.f32 %f152, %f141, %f137, %f149; fma.rn.f32 %f153, %f94, %f94, %f135; mul.f32 %f154, %f87, %f89; mul.f32 %f155, %f68, %f154; mul.f32 %f156, %f72, %f154; mul.f32 %f157, %f75, %f154; fma.rn.f32 %f158, %f153, %f155, %f150; fma.rn.f32 %f159, %f153, %f156, %f151; fma.rn.f32 %f160, %f153, %f157, %f152; mul.f32 %f161, %f15, 0f40800000; mul.f32 %f162, %f161, %f158; mul.f32 %f163, %f161, %f159; mul.f32 %f164, %f161, %f160; sub.f32 %f165, %f132, %f162; sub.f32 %f166, %f133, %f163; sub.f32 %f167, %f134, %f164; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd55, %rd54, %rd48; ld.global.f32 %f168, [%rd55]; add.f32 %f169, %f168, %f165; st.global.f32 [%rd55], %f169; cvta.to.global.u64 %rd56, %rd2; add.s64 %rd57, %rd56, %rd48; ld.global.f32 %f170, [%rd57]; add.f32 %f171, %f170, %f166; st.global.f32 
[%rd57], %f171; cvta.to.global.u64 %rd58, %rd3; add.s64 %rd59, %rd58, %rd48; ld.global.f32 %f172, [%rd59]; add.f32 %f173, %f172, %f167; st.global.f32 [%rd59], %f173; BB0_28: ret; } ` ) mumax3-3.10/cuda/cuda2go.go000066400000000000000000000125601371432437400154520ustar00rootroot00000000000000// +build ignore // This program generates Go wrappers for cuda sources. // The cuda file should contain exactly one __global__ void. package main import ( "bufio" "bytes" "flag" "fmt" "io" "log" "os" "regexp" "strconv" "text/scanner" "text/template" "github.com/mumax/3/util" ) func main() { flag.Parse() for _, fname := range flag.Args() { cuda2go(fname) } } // generate cuda wrapper for file. func cuda2go(fname string) { // open cuda file f, err := os.Open(fname) util.PanicErr(err) defer f.Close() // read tokens var token []string var s scanner.Scanner s.Init(f) tok := s.Scan() for tok != scanner.EOF { if !filter(s.TokenText()) { token = append(token, s.TokenText()) } tok = s.Scan() } // find function name and arguments funcname := "" argstart, argstop := -1, -1 for i := 0; i < len(token); i++ { if token[i] == "__global__" { funcname = token[i+2] argstart = i + 4 } if argstart > 0 && token[i] == ")" { argstop = i + 1 break } } argl := token[argstart:argstop] // isolate individual arguments var args [][]string start := 0 for i, a := range argl { if a == "," || a == ")" { args = append(args, argl[start:i]) start = i + 1 } } // separate arg names/types and make pointers Go-style argn := make([]string, len(args)) argt := make([]string, len(args)) for i := range args { if args[i][1] == "*" { args[i] = []string{args[i][0] + "*", args[i][2]} } argt[i] = typemap(args[i][0]) argn[i] = args[i][1] } wrapgen(fname, funcname, argt, argn) } // translate C type to Go type. 
func typemap(ctype string) string { if gotype, ok := tm[ctype]; ok { return gotype } panic(fmt.Errorf("unsupported cuda type: %v", ctype)) } var tm = map[string]string{"float*": "unsafe.Pointer", "float": "float32", "int": "int", "uint8_t*": "unsafe.Pointer", "uint8_t": "byte"} // template data type Kernel struct { Name string ArgT []string ArgN []string PTX map[int]string } var ls []string // generate wrapper code from template func wrapgen(filename, funcname string, argt, argn []string) { kernel := &Kernel{funcname, argt, argn, make(map[int]string)} // find corresponding .PTX files if ls == nil { dir, errd := os.Open(".") defer dir.Close() util.PanicErr(errd) var errls error ls, errls = dir.Readdirnames(-1) util.PanicErr(errls) } basename := util.NoExt(filename) for _, f := range ls { match, e := regexp.MatchString("^"+basename+"_*[0-9]..ptx", f) util.PanicErr(e) if match { cc, ei := strconv.Atoi(f[len(f)-len("00.ptx") : len(f)-len(".ptx")]) util.PanicErr(ei) fmt.Println(basename, cc) kernel.PTX[cc] = filterptx(f) } } if len(kernel.PTX) == 0 { log.Fatal("no PTX files for ", filename) } wrapfname := basename + "_wrapper.go" wrapout, err := os.OpenFile(wrapfname, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666) util.PanicErr(err) defer wrapout.Close() util.PanicErr(templ.Execute(wrapout, kernel)) } // wrapper code template text const templText = `package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import( "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" ) // CUDA handle for {{.Name}} kernel var {{.Name}}_code cu.Function // Stores the arguments for {{.Name}} kernel invocation type {{.Name}}_args_t struct{ {{range $i, $_ := .ArgN}} arg_{{.}} {{index $.ArgT $i}} {{end}} argptr [{{len .ArgN}}]unsafe.Pointer sync.Mutex } // Stores the arguments for {{.Name}} kernel invocation var {{.Name}}_args {{.Name}}_args_t func init(){ // CUDA driver kernel call wants pointers to arguments, set them up once. 
{{range $i, $t := .ArgN}} {{$.Name}}_args.argptr[{{$i}}] = unsafe.Pointer(&{{$.Name}}_args.arg_{{.}}) {{end}} } // Wrapper for {{.Name}} CUDA kernel, asynchronous. func k_{{.Name}}_async ( {{range $i, $t := .ArgT}}{{index $.ArgN $i}} {{$t}}, {{end}} cfg *config) { if Synchronous{ // debug Sync() timer.Start("{{.Name}}") } {{.Name}}_args.Lock() defer {{.Name}}_args.Unlock() if {{.Name}}_code == 0{ {{.Name}}_code = fatbinLoad({{.Name}}_map, "{{.Name}}") } {{range $i, $t := .ArgN}} {{$.Name}}_args.arg_{{.}} = {{.}} {{end}} args := {{.Name}}_args.argptr[:] cu.LaunchKernel({{.Name}}_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous{ // debug Sync() timer.Stop("{{.Name}}") } } // maps compute capability on PTX code for {{.Name}} kernel. var {{.Name}}_map = map[int]string{ 0: "" {{range $k, $v := .PTX}}, {{$k}}: {{$.Name}}_ptx_{{$k}} {{end}} } // {{.Name}} PTX code for various compute capabilities. const( {{range $k, $v := .PTX}} {{$.Name}}_ptx_{{$k}} = {{$v}} {{end}}) ` // wrapper code template var templ = template.Must(template.New("wrap").Parse(templText)) // should token be filtered out of stream? func filter(token string) bool { switch token { case "__restrict__": return true } return false } // Filter comments and ".file" entries from ptx code. // They spoil the git history. 
func filterptx(fname string) string { f, err := os.Open(fname) util.PanicErr(err) defer f.Close() in := bufio.NewReader(f) var out bytes.Buffer out.Write(([]byte)("`")) line, err := in.ReadBytes('\n') for err != io.EOF { util.PanicErr(err) if !bytes.HasPrefix(line, []byte("//")) && !bytes.HasPrefix(line, []byte(" .file")) { out.Write(line) } line, err = in.ReadBytes('\n') } out.Write(([]byte)("`")) return out.String() } mumax3-3.10/cuda/cufft/000077500000000000000000000000001371432437400147025ustar00rootroot00000000000000mumax3-3.10/cuda/cufft/Makefile000066400000000000000000000006401371432437400163420ustar00rootroot00000000000000all: 6g gccgo doc 6g: go install -v go tool vet *.go gofmt -w *.go GCCGO=gccgo -gccgoflags '-static-libgcc -O3' gccgo: go build -v -compiler $(GCCGO) test: 6gtest gccgotest 6gtest: go test gccgotest: go test -compiler $(GCCGO) bench: 6gbench gccgobench 6gbench: go test -bench=. gccgobench: go test -bench=. -compiler $(GCCGO) clean: go clean doc: godoc github.com/barnex/cuda5/cufft > README mumax3-3.10/cuda/cufft/README000066400000000000000000000056461371432437400155750ustar00rootroot00000000000000PACKAGE DOCUMENTATION package cufft import "github.com/barnex/cuda5/cufft" Go bindings for the CUDA CUFFT API. CONSTANTS const ( FORWARD = -1 // Forward FFT INVERSE = 1 // Inverse FFT ) TYPES type CompatibilityMode int CUFFT compatibility mode const ( COMPATIBILITY_FFTW_PADDING CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_PADDING ) func (t CompatibilityMode) String() string type Handle uintptr FFT plan handle, reference type to a plan func Plan1d(nx int, typ Type, batch int) Handle 1D FFT plan func Plan2d(nx, ny int, typ Type) Handle 2D FFT plan func Plan3d(nx, ny, nz int, typ Type) Handle 3D FFT plan func PlanMany(n []int, inembed []int, istride int, oembed []int, ostride int, typ Type, batch int) Handle 1D,2D or 3D FFT plan func (plan *Handle) Destroy() Destroys the plan. 
func (plan Handle) ExecC2C(idata, odata cu.DevicePtr, direction int) Execute Complex-to-Complex plan func (plan Handle) ExecC2R(idata, odata cu.DevicePtr) Execute Complex-to-Real plan func (plan Handle) ExecD2Z(idata, odata cu.DevicePtr) Execute Double Real-to-Complex plan func (plan Handle) ExecR2C(idata, odata cu.DevicePtr) Execute Real-to-Complex plan func (plan Handle) ExecZ2D(idata, odata cu.DevicePtr) Execute Double Complex-to-Real plan func (plan Handle) ExecZ2Z(idata, odata cu.DevicePtr, direction int) Execute Double Complex-to-Complex plan func (plan Handle) SetCompatibilityMode(mode CompatibilityMode) Sets the FFTW compatibility mode func (plan Handle) SetStream(stream cu.Stream) Sets the cuda stream for this plan type Result int FFT result const ( SUCCESS Result = C.CUFFT_SUCCESS INVALID_PLAN Result = C.CUFFT_INVALID_PLAN ALLOC_FAILED Result = C.CUFFT_ALLOC_FAILED INVALID_TYPE Result = C.CUFFT_INVALID_TYPE INVALID_VALUE Result = C.CUFFT_INVALID_VALUE INTERNAL_ERROR Result = C.CUFFT_INTERNAL_ERROR EXEC_FAILED Result = C.CUFFT_EXEC_FAILED SETUP_FAILED Result = C.CUFFT_SETUP_FAILED INVALID_SIZE Result = C.CUFFT_INVALID_SIZE UNALIGNED_DATA Result = C.CUFFT_UNALIGNED_DATA INCOMPLETE_PARAMETER_LIST Result = 0xA // cuda6 values copied to avoid dependency on cuda6/cufft.h INVALID_DEVICE Result = 0xB PARSE_ERROR Result = 0xC NO_WORKSPACE Result = 0xD ) FFT result value func (r Result) String() string type Type int FFT type const ( R2C Type = C.CUFFT_R2C // Real to Complex (interleaved) C2R Type = C.CUFFT_C2R // Complex (interleaved) to Real C2C Type = C.CUFFT_C2C // Complex to Complex, interleaved D2Z Type = C.CUFFT_D2Z // Double to Double-Complex Z2D Type = C.CUFFT_Z2D // Double-Complex to Double Z2Z Type = C.CUFFT_Z2Z // Double-Complex to Double-Complex ) func (t Type) String() string mumax3-3.10/cuda/cufft/cgoflags.go000066400000000000000000000010411371432437400170120ustar00rootroot00000000000000package cufft // This file provides CGO flags to find CUDA 
libraries and headers. //#cgo LDFLAGS:-lcufft // ////default location: //#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib //#cgo CFLAGS: -I/usr/local/cuda/include/ // ////Ubuntu 15.04: //#cgo LDFLAGS:-L/usr/lib/x86_64-linux-gnu/ //#cgo CFLAGS: -I/usr/include // ////arch linux: //#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib //#cgo CFLAGS: -I/opt/cuda/include // ////WINDOWS: //#cgo windows LDFLAGS:-LC:/cuda/lib/x64 //#cgo windows CFLAGS: -IC:/cuda/include -w import "C" mumax3-3.10/cuda/cufft/doc.go000066400000000000000000000000651371432437400157770ustar00rootroot00000000000000// Go bindings for the CUDA CUFFT API. package cufft mumax3-3.10/cuda/cufft/fft_test.go000066400000000000000000000014441371432437400170520ustar00rootroot00000000000000package cufft import ( "fmt" "testing" "unsafe" "github.com/mumax/3/cuda/cu" ) func TestExampleFFT1D(t *testing.T) { N := 8 hostIn := make([]float32, N) hostIn[0] = 1 devIn := cu.MemAlloc(int64(len(hostIn)) * cu.SIZEOF_FLOAT32) defer cu.MemFree(devIn) cu.MemcpyHtoD(devIn, unsafe.Pointer(&hostIn[0]), devIn.Bytes()) hostOut := make([]complex64, N/2+1) devOut := cu.MemAlloc(int64(len(hostOut)) * cu.SIZEOF_COMPLEX64) defer cu.MemFree(devOut) plan := Plan1d(N, R2C, 1) defer plan.Destroy() plan.ExecR2C(devIn, devOut) cu.MemcpyDtoH(unsafe.Pointer(&hostOut[0]), devOut, devOut.Bytes()) fmt.Println("hostIn:", hostIn) fmt.Println("hostOut:", hostOut) for i := 0; i < N; i++ { if hostOut[0] != 1+0i { t.Errorf("hostOut[%d]: got %f, want %f", i, hostOut[0], 1+0i) } } } mumax3-3.10/cuda/cufft/init_test.go000066400000000000000000000003421371432437400172320ustar00rootroot00000000000000package cufft import ( "fmt" "github.com/mumax/3/cuda/cu" ) // needed for all other tests. 
func init() { cu.Init(0) ctx := cu.CtxCreate(cu.CTX_SCHED_AUTO, 0) cu.CtxSetCurrent(ctx) fmt.Println("Created CUDA context") } mumax3-3.10/cuda/cufft/mode.go000066400000000000000000000010571371432437400161600ustar00rootroot00000000000000package cufft //#include import "C" import ( "fmt" ) // CUFFT compatibility mode type CompatibilityMode int const ( COMPATIBILITY_FFTW_PADDING CompatibilityMode = C.CUFFT_COMPATIBILITY_FFTW_PADDING ) func (t CompatibilityMode) String() string { if str, ok := compatibilityModeString[t]; ok { return str } return fmt.Sprint("CUFFT Compatibility mode with unknown number:", int(t)) } var compatibilityModeString map[CompatibilityMode]string = map[CompatibilityMode]string{ COMPATIBILITY_FFTW_PADDING: "CUFFT_COMPATIBILITY_FFTW_PADDING"} mumax3-3.10/cuda/cufft/plan.go000066400000000000000000000103541371432437400161660ustar00rootroot00000000000000// Copyright 2011 Arne Vansteenkiste (barnex@gmail.com). All rights reserved. // Use of this source code is governed by a freeBSD // license that can be found in the LICENSE.txt file. 
package cufft //#include import "C" import ( "unsafe" "github.com/mumax/3/cuda/cu" ) // FFT plan handle, reference type to a plan type Handle uintptr // 1D FFT plan func Plan1d(nx int, typ Type, batch int) Handle { var handle C.cufftHandle err := Result(C.cufftPlan1d( &handle, C.int(nx), C.cufftType(typ), C.int(batch))) if err != SUCCESS { panic(err) } return Handle(handle) } // 2D FFT plan func Plan2d(nx, ny int, typ Type) Handle { var handle C.cufftHandle err := Result(C.cufftPlan2d( &handle, C.int(nx), C.int(ny), C.cufftType(typ))) if err != SUCCESS { panic(err) } return Handle(handle) } // 3D FFT plan func Plan3d(nx, ny, nz int, typ Type) Handle { var handle C.cufftHandle err := Result(C.cufftPlan3d( &handle, C.int(nx), C.int(ny), C.int(nz), C.cufftType(typ))) if err != SUCCESS { panic(err) } return Handle(handle) } //cufftPlanMany( // cufftHandle *plan, int rank, int *n, int *inembed, // int istride, int idist, int *onembed, int ostride, // int odist, cufftType type, int batch ); // 1D,2D or 3D FFT plan func PlanMany(n []int, inembed []int, istride int, oembed []int, ostride int, typ Type, batch int) Handle { var handle C.cufftHandle NULL := (*C.int)(unsafe.Pointer(uintptr(0))) inembedptr := NULL idist := 0 if inembed != nil { inembedptr = (*C.int)(unsafe.Pointer(&inembed[0])) idist = inembed[0] } oembedptr := NULL odist := 0 if oembed != nil { oembedptr = (*C.int)(unsafe.Pointer(&oembed[0])) odist = oembed[0] } err := Result(C.cufftPlanMany( &handle, C.int(len(n)), // rank (*C.int)(unsafe.Pointer(&n[0])), // n inembedptr, C.int(istride), C.int(idist), oembedptr, C.int(ostride), C.int(odist), C.cufftType(typ), C.int(batch))) if err != SUCCESS { panic(err) } return Handle(handle) } // Execute Complex-to-Complex plan func (plan Handle) ExecC2C(idata, odata cu.DevicePtr, direction int) { err := Result(C.cufftExecC2C( C.cufftHandle(plan), (*C.cufftComplex)(unsafe.Pointer(uintptr(idata))), (*C.cufftComplex)(unsafe.Pointer(uintptr(odata))), C.int(direction))) if err 
!= SUCCESS { panic(err) } } // Execute Real-to-Complex plan func (plan Handle) ExecR2C(idata, odata cu.DevicePtr) { err := Result(C.cufftExecR2C( C.cufftHandle(plan), (*C.cufftReal)(unsafe.Pointer(uintptr(idata))), (*C.cufftComplex)(unsafe.Pointer(uintptr(odata))))) if err != SUCCESS { panic(err) } } // Execute Complex-to-Real plan func (plan Handle) ExecC2R(idata, odata cu.DevicePtr) { err := Result(C.cufftExecC2R( C.cufftHandle(plan), (*C.cufftComplex)(unsafe.Pointer(uintptr(idata))), (*C.cufftReal)(unsafe.Pointer(uintptr(odata))))) if err != SUCCESS { panic(err) } } // Execute Double Complex-to-Complex plan func (plan Handle) ExecZ2Z(idata, odata cu.DevicePtr, direction int) { err := Result(C.cufftExecZ2Z( C.cufftHandle(plan), (*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(idata))), (*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(odata))), C.int(direction))) if err != SUCCESS { panic(err) } } // Execute Double Real-to-Complex plan func (plan Handle) ExecD2Z(idata, odata cu.DevicePtr) { err := Result(C.cufftExecD2Z( C.cufftHandle(plan), (*C.cufftDoubleReal)(unsafe.Pointer(uintptr(idata))), (*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(odata))))) if err != SUCCESS { panic(err) } } // Execute Double Complex-to-Real plan func (plan Handle) ExecZ2D(idata, odata cu.DevicePtr) { err := Result(C.cufftExecZ2D( C.cufftHandle(plan), (*C.cufftDoubleComplex)(unsafe.Pointer(uintptr(idata))), (*C.cufftDoubleReal)(unsafe.Pointer(uintptr(odata))))) if err != SUCCESS { panic(err) } } // Destroys the plan. 
func (plan *Handle) Destroy() { err := Result(C.cufftDestroy(C.cufftHandle(*plan))) *plan = 0 // make sure plan is not used anymore if err != SUCCESS { panic(err) } } // Sets the cuda stream for this plan func (plan Handle) SetStream(stream cu.Stream) { err := Result(C.cufftSetStream( C.cufftHandle(plan), C.cudaStream_t(unsafe.Pointer(uintptr(stream))))) if err != SUCCESS { panic(err) } } mumax3-3.10/cuda/cufft/result.go000066400000000000000000000034671371432437400165610ustar00rootroot00000000000000package cufft //#include import "C" import ( "fmt" ) // FFT result type Result int // FFT result value const ( SUCCESS Result = C.CUFFT_SUCCESS INVALID_PLAN Result = C.CUFFT_INVALID_PLAN ALLOC_FAILED Result = C.CUFFT_ALLOC_FAILED INVALID_TYPE Result = C.CUFFT_INVALID_TYPE INVALID_VALUE Result = C.CUFFT_INVALID_VALUE INTERNAL_ERROR Result = C.CUFFT_INTERNAL_ERROR EXEC_FAILED Result = C.CUFFT_EXEC_FAILED SETUP_FAILED Result = C.CUFFT_SETUP_FAILED INVALID_SIZE Result = C.CUFFT_INVALID_SIZE UNALIGNED_DATA Result = C.CUFFT_UNALIGNED_DATA INCOMPLETE_PARAMETER_LIST Result = 0xA // cuda6 values copied to avoid dependency on cuda6/cufft.h INVALID_DEVICE Result = 0xB PARSE_ERROR Result = 0xC NO_WORKSPACE Result = 0xD ) func (r Result) String() string { if str, ok := resultString[r]; ok { return str } return fmt.Sprint("CUFFT Result with unknown error number:", int(r)) } var resultString map[Result]string = map[Result]string{ SUCCESS: "CUFFT_SUCCESS", INVALID_PLAN: "CUFFT_INVALID_PLAN", ALLOC_FAILED: "CUFFT_ALLOC_FAILED", INVALID_TYPE: "CUFFT_INVALID_TYPE", INVALID_VALUE: "CUFFT_INVALID_VALUE", INTERNAL_ERROR: "CUFFT_INTERNAL_ERROR", EXEC_FAILED: "CUFFT_EXEC_FAILED", SETUP_FAILED: "CUFFT_SETUP_FAILED", INVALID_SIZE: "CUFFT_INVALID_SIZE", UNALIGNED_DATA: "CUFFT_UNALIGNED_DATA", INCOMPLETE_PARAMETER_LIST: "CUFFT_INCOMPLETE_PARAMETER_LIST", INVALID_DEVICE: "CUFFT_INVALID_DEVICE", PARSE_ERROR: "CUFFT_PARSE_ERROR", NO_WORKSPACE: "CUFFT_NO_WORKSPACE"} 
mumax3-3.10/cuda/cufft/type.go000066400000000000000000000014671371432437400162220ustar00rootroot00000000000000package cufft //#include import "C" import ( "fmt" ) // FFT type type Type int const ( R2C Type = C.CUFFT_R2C // Real to Complex (interleaved) C2R Type = C.CUFFT_C2R // Complex (interleaved) to Real C2C Type = C.CUFFT_C2C // Complex to Complex, interleaved D2Z Type = C.CUFFT_D2Z // Double to Double-Complex Z2D Type = C.CUFFT_Z2D // Double-Complex to Double Z2Z Type = C.CUFFT_Z2Z // Double-Complex to Double-Complex ) const ( FORWARD = -1 // Forward FFT INVERSE = 1 // Inverse FFT ) func (t Type) String() string { if str, ok := typeString[t]; ok { return str } return fmt.Sprint("CUFFT Type with unknown number:", int(t)) } var typeString map[Type]string = map[Type]string{ R2C: "CUFFT_R2C", C2R: "CUFFT_C2R", C2C: "CUFFT_C2C", D2Z: "CUFFT_D2Z", Z2D: "CUFFT_Z2D", Z2Z: "CUFFT_Z2Z"} mumax3-3.10/cuda/curand/000077500000000000000000000000001371432437400150475ustar00rootroot00000000000000mumax3-3.10/cuda/curand/Makefile000066400000000000000000000006171371432437400165130ustar00rootroot00000000000000all: 6g gccgo doc 6g: go install -v gofmt -w *.go GCCGO=gccgo -gccgoflags '-static-libgcc -O3' gccgo: go build -v -compiler $(GCCGO) test: 6gtest gccgotest 6gtest: go test gccgotest: go test -compiler $(GCCGO) bench: 6gbench gccgobench 6gbench: go test -bench=. gccgobench: go test -bench=. 
-compiler $(GCCGO) clean: go clean doc: godoc github.com/barnex/cuda5/curand > README mumax3-3.10/cuda/curand/README000066400000000000000000000045651371432437400157410ustar00rootroot00000000000000PACKAGE DOCUMENTATION package curand import "github.com/barnex/cuda5/curand" TYPES type Generator uintptr func CreateGenerator(rngType RngType) Generator func (g Generator) GenerateNormal(output uintptr, n int64, mean, stddev float32) func (g Generator) SetSeed(seed int64) type RngType int const ( PSEUDO_DEFAULT RngType = C.CURAND_RNG_PSEUDO_DEFAULT // Default pseudorandom generator PSEUDO_XORWOW RngType = C.CURAND_RNG_PSEUDO_XORWOW // XORWOW pseudorandom generator QUASI_DEFAULT RngType = C.CURAND_RNG_QUASI_DEFAULT // Default quasirandom generator QUASI_SOBOL32 RngType = C.CURAND_RNG_QUASI_SOBOL32 // Sobol32 quasirandom generator QUASI_SCRAMBLED_SOBOL32 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 // Scrambled Sobol32 quasirandom generator QUASI_SOBOL64 RngType = C.CURAND_RNG_QUASI_SOBOL64 // Sobol64 quasirandom generator QUASI_SCRAMBLED_SOBOL64 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 // Scrambled Sobol64 quasirandom generator ) type Status int const ( SUCCESS Status = C.CURAND_STATUS_SUCCESS // No errors VERSION_MISMATCH Status = C.CURAND_STATUS_VERSION_MISMATCH // Header file and linked library version do not match NOT_INITIALIZED Status = C.CURAND_STATUS_NOT_INITIALIZED // Generator not initialized ALLOCATION_FAILED Status = C.CURAND_STATUS_ALLOCATION_FAILED // Memory allocation failed TYPE_ERROR Status = C.CURAND_STATUS_TYPE_ERROR // Generator is wrong type OUT_OF_RANGE Status = C.CURAND_STATUS_OUT_OF_RANGE // Argument out of range LENGTH_NOT_MULTIPLE Status = C.CURAND_STATUS_LENGTH_NOT_MULTIPLE // Length requested is not a multple of dimension LAUNCH_FAILURE Status = C.CURAND_STATUS_LAUNCH_FAILURE // Kernel launch failure PREEXISTING_FAILURE Status = C.CURAND_STATUS_PREEXISTING_FAILURE // Preexisting failure on library entry INITIALIZATION_FAILED Status = 
C.CURAND_STATUS_INITIALIZATION_FAILED // Initialization of CUDA failed ARCH_MISMATCH Status = C.CURAND_STATUS_ARCH_MISMATCH // Architecture mismatch, GPU does not support requested feature INTERNAL_ERROR Status = C.CURAND_STATUS_INTERNAL_ERROR // Internal library error ) mumax3-3.10/cuda/curand/cgoflags.go000066400000000000000000000010431371432437400171610ustar00rootroot00000000000000package curand // This file provides CGO flags to find CUDA libraries and headers. //#cgo LDFLAGS:-lcurand // ////default location: //#cgo LDFLAGS:-L/usr/local/cuda/lib64 -L/usr/local/cuda/lib //#cgo CFLAGS: -I/usr/local/cuda/include/ // ////Ubuntu 15.04: //#cgo LDFLAGS:-L/usr/lib/x86_64-linux-gnu/ //#cgo CFLAGS: -I/usr/include // ////arch linux: //#cgo LDFLAGS:-L/opt/cuda/lib64 -L/opt/cuda/lib //#cgo CFLAGS: -I/opt/cuda/include // ////WINDOWS: //#cgo windows LDFLAGS:-LC:/cuda/lib/x64 //#cgo windows CFLAGS: -IC:/cuda/include -w import "C" mumax3-3.10/cuda/curand/generator.go000066400000000000000000000032261371432437400173670ustar00rootroot00000000000000package curand //#include import "C" import ( "unsafe" ) type Generator uintptr type RngType int const ( PSEUDO_DEFAULT RngType = C.CURAND_RNG_PSEUDO_DEFAULT // Default pseudorandom generator PSEUDO_XORWOW RngType = C.CURAND_RNG_PSEUDO_XORWOW // XORWOW pseudorandom generator QUASI_DEFAULT RngType = C.CURAND_RNG_QUASI_DEFAULT // Default quasirandom generator QUASI_SOBOL32 RngType = C.CURAND_RNG_QUASI_SOBOL32 // Sobol32 quasirandom generator QUASI_SCRAMBLED_SOBOL32 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 // Scrambled Sobol32 quasirandom generator QUASI_SOBOL64 RngType = C.CURAND_RNG_QUASI_SOBOL64 // Sobol64 quasirandom generator QUASI_SCRAMBLED_SOBOL64 RngType = C.CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 // Scrambled Sobol64 quasirandom generator ) func CreateGenerator(rngType RngType) Generator { var rng C.curandGenerator_t err := Status(C.curandCreateGenerator(&rng, C.curandRngType_t(rngType))) if err != SUCCESS { panic(err) } return 
Generator(uintptr(unsafe.Pointer(rng))) // cgo } func (g Generator) GenerateNormal(output uintptr, n int64, mean, stddev float32) { err := Status(C.curandGenerateNormal( C.curandGenerator_t(unsafe.Pointer(uintptr(g))), (*C.float)(unsafe.Pointer(output)), C.size_t(n), C.float(mean), C.float(stddev))) if err != SUCCESS { panic(err) } } func (g Generator) SetSeed(seed int64) { err := Status(C.curandSetPseudoRandomGeneratorSeed(C.curandGenerator_t(unsafe.Pointer(uintptr(g))), C.ulonglong(seed))) if err != SUCCESS { panic(err) } } // Documentation was taken from the curand headers. mumax3-3.10/cuda/curand/status.go000066400000000000000000000043411371432437400167230ustar00rootroot00000000000000package curand //#include import "C" import ( "fmt" ) type Status int const ( SUCCESS Status = C.CURAND_STATUS_SUCCESS // No errors VERSION_MISMATCH Status = C.CURAND_STATUS_VERSION_MISMATCH // Header file and linked library version do not match NOT_INITIALIZED Status = C.CURAND_STATUS_NOT_INITIALIZED // Generator not initialized ALLOCATION_FAILED Status = C.CURAND_STATUS_ALLOCATION_FAILED // Memory allocation failed TYPE_ERROR Status = C.CURAND_STATUS_TYPE_ERROR // Generator is wrong type OUT_OF_RANGE Status = C.CURAND_STATUS_OUT_OF_RANGE // Argument out of range LENGTH_NOT_MULTIPLE Status = C.CURAND_STATUS_LENGTH_NOT_MULTIPLE // Length requested is not a multple of dimension LAUNCH_FAILURE Status = C.CURAND_STATUS_LAUNCH_FAILURE // Kernel launch failure PREEXISTING_FAILURE Status = C.CURAND_STATUS_PREEXISTING_FAILURE // Preexisting failure on library entry INITIALIZATION_FAILED Status = C.CURAND_STATUS_INITIALIZATION_FAILED // Initialization of CUDA failed ARCH_MISMATCH Status = C.CURAND_STATUS_ARCH_MISMATCH // Architecture mismatch, GPU does not support requested feature INTERNAL_ERROR Status = C.CURAND_STATUS_INTERNAL_ERROR // Internal library error ) func (s Status) String() string { if str, ok := statusStr[s]; ok { return str } else { return fmt.Sprint("CURAND ERROR NUMBER ", 
int(s)) } } var statusStr = map[Status]string{ SUCCESS: "CURAND_STATUS_SUCCESS", VERSION_MISMATCH: "CURAND_STATUS_VERSION_MISMATCH", NOT_INITIALIZED: "CURAND_STATUS_NOT_INITIALIZED", ALLOCATION_FAILED: "CURAND_STATUS_ALLOCATION_FAILED", TYPE_ERROR: "CURAND_STATUS_TYPE_ERROR", OUT_OF_RANGE: "CURAND_STATUS_OUT_OF_RANGE", LENGTH_NOT_MULTIPLE: "CURAND_STATUS_LENGTH_NOT_MULTIPLE", LAUNCH_FAILURE: "CURAND_STATUS_LAUNCH_FAILURE", PREEXISTING_FAILURE: "CURAND_STATUS_PREEXISTING_FAILURE", INITIALIZATION_FAILED: "CURAND_STATUS_INITIALIZATION_FAILED", ARCH_MISMATCH: "CURAND_STATUS_ARCH_MISMATCH", INTERNAL_ERROR: "CURAND_STATUS_INTERNAL_ERROR", } // Documentation was taken from the curand headers. mumax3-3.10/cuda/div.cu000066400000000000000000000005611371432437400147100ustar00rootroot00000000000000// dst[i] = a[i] / b[i] extern "C" __global__ void pointwise_div(float* __restrict__ dst, float* __restrict__ a, float* __restrict__ b, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if(i < N) { if (b[i] != 0.0f) { dst[i] = a[i] / b[i]; } else { dst[i] = 0.0f; } } } mumax3-3.10/cuda/div_wrapper.go000066400000000000000000000447511371432437400164570ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for pointwise_div kernel var pointwise_div_code cu.Function // Stores the arguments for pointwise_div kernel invocation type pointwise_div_args_t struct { arg_dst unsafe.Pointer arg_a unsafe.Pointer arg_b unsafe.Pointer arg_N int argptr [4]unsafe.Pointer sync.Mutex } // Stores the arguments for pointwise_div kernel invocation var pointwise_div_args pointwise_div_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
pointwise_div_args.argptr[0] = unsafe.Pointer(&pointwise_div_args.arg_dst) pointwise_div_args.argptr[1] = unsafe.Pointer(&pointwise_div_args.arg_a) pointwise_div_args.argptr[2] = unsafe.Pointer(&pointwise_div_args.arg_b) pointwise_div_args.argptr[3] = unsafe.Pointer(&pointwise_div_args.arg_N) } // Wrapper for pointwise_div CUDA kernel, asynchronous. func k_pointwise_div_async(dst unsafe.Pointer, a unsafe.Pointer, b unsafe.Pointer, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("pointwise_div") } pointwise_div_args.Lock() defer pointwise_div_args.Unlock() if pointwise_div_code == 0 { pointwise_div_code = fatbinLoad(pointwise_div_map, "pointwise_div") } pointwise_div_args.arg_dst = dst pointwise_div_args.arg_a = a pointwise_div_args.arg_b = b pointwise_div_args.arg_N = N args := pointwise_div_args.argptr[:] cu.LaunchKernel(pointwise_div_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("pointwise_div") } } // maps compute capability on PTX code for pointwise_div kernel. var pointwise_div_map = map[int]string{0: "", 30: pointwise_div_ptx_30, 32: pointwise_div_ptx_32, 35: pointwise_div_ptx_35, 37: pointwise_div_ptx_37, 50: pointwise_div_ptx_50, 52: pointwise_div_ptx_52, 53: pointwise_div_ptx_53, 60: pointwise_div_ptx_60, 61: pointwise_div_ptx_61, 62: pointwise_div_ptx_62, 70: pointwise_div_ptx_70, 72: pointwise_div_ptx_72, 75: pointwise_div_ptx_75} // pointwise_div PTX code for various compute capabilities. 
const ( pointwise_div_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<12>; ld.param.u64 %rd2, [pointwise_div_param_0]; ld.param.u64 %rd3, [pointwise_div_param_1]; ld.param.u64 %rd4, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd5, %rd4; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.f32 %f1, [%rd7]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd8, %rd2; add.s64 %rd1, %rd8, %rd6; @%p2 bra BB0_3; bra.uni BB0_2; BB0_3: cvta.to.global.u64 %rd9, %rd3; add.s64 %rd11, %rd9, %rd6; ld.global.f32 %f2, [%rd11]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd1], %f3; bra.uni BB0_4; BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd1], %r9; BB0_4: ret; } ` pointwise_div_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<12>; ld.param.u64 %rd2, [pointwise_div_param_0]; ld.param.u64 %rd3, [pointwise_div_param_1]; ld.param.u64 %rd4, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd5, %rd4; mul.wide.s32 %rd6, %r1, 4; add.s64 
%rd7, %rd5, %rd6; ld.global.nc.f32 %f1, [%rd7]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd8, %rd2; add.s64 %rd1, %rd8, %rd6; @%p2 bra BB0_3; bra.uni BB0_2; BB0_3: cvta.to.global.u64 %rd9, %rd3; add.s64 %rd11, %rd9, %rd6; ld.global.nc.f32 %f2, [%rd11]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd1], %f3; bra.uni BB0_4; BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd1], %r9; BB0_4: ret; } ` pointwise_div_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<12>; ld.param.u64 %rd2, [pointwise_div_param_0]; ld.param.u64 %rd3, [pointwise_div_param_1]; ld.param.u64 %rd4, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd5, %rd4; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f1, [%rd7]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd8, %rd2; add.s64 %rd1, %rd8, %rd6; @%p2 bra BB0_3; bra.uni BB0_2; BB0_3: cvta.to.global.u64 %rd9, %rd3; add.s64 %rd11, %rd9, %rd6; ld.global.nc.f32 %f2, [%rd11]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd1], %f3; bra.uni BB0_4; BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd1], %r9; BB0_4: ret; } ` pointwise_div_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<12>; ld.param.u64 %rd2, [pointwise_div_param_0]; ld.param.u64 
%rd3, [pointwise_div_param_1]; ld.param.u64 %rd4, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd5, %rd4; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f1, [%rd7]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd8, %rd2; add.s64 %rd1, %rd8, %rd6; @%p2 bra BB0_3; bra.uni BB0_2; BB0_3: cvta.to.global.u64 %rd9, %rd3; add.s64 %rd11, %rd9, %rd6; ld.global.nc.f32 %f2, [%rd11]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd1], %f3; bra.uni BB0_4; BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd1], %r9; BB0_4: ret; } ` pointwise_div_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<12>; ld.param.u64 %rd2, [pointwise_div_param_0]; ld.param.u64 %rd3, [pointwise_div_param_1]; ld.param.u64 %rd4, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd5, %rd4; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f1, [%rd7]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd8, %rd2; add.s64 %rd1, %rd8, %rd6; @%p2 bra BB0_3; bra.uni BB0_2; BB0_3: cvta.to.global.u64 %rd9, %rd3; add.s64 %rd11, %rd9, %rd6; ld.global.nc.f32 %f2, [%rd11]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd1], %f3; bra.uni BB0_4; BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd1], %r9; BB0_4: ret; } 
` pointwise_div_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<12>; ld.param.u64 %rd2, [pointwise_div_param_0]; ld.param.u64 %rd3, [pointwise_div_param_1]; ld.param.u64 %rd4, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd5, %rd4; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f1, [%rd7]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd8, %rd2; add.s64 %rd1, %rd8, %rd6; @%p2 bra BB0_3; bra.uni BB0_2; BB0_3: cvta.to.global.u64 %rd9, %rd3; add.s64 %rd11, %rd9, %rd6; ld.global.nc.f32 %f2, [%rd11]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd1], %f3; bra.uni BB0_4; BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd1], %r9; BB0_4: ret; } ` pointwise_div_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<12>; ld.param.u64 %rd2, [pointwise_div_param_0]; ld.param.u64 %rd3, [pointwise_div_param_1]; ld.param.u64 %rd4, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd5, %rd4; mul.wide.s32 %rd6, %r1, 4; add.s64 
%rd7, %rd5, %rd6; ld.global.nc.f32 %f1, [%rd7]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd8, %rd2; add.s64 %rd1, %rd8, %rd6; @%p2 bra BB0_3; bra.uni BB0_2; BB0_3: cvta.to.global.u64 %rd9, %rd3; add.s64 %rd11, %rd9, %rd6; ld.global.nc.f32 %f2, [%rd11]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd1], %f3; bra.uni BB0_4; BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd1], %r9; BB0_4: ret; } ` pointwise_div_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<12>; ld.param.u64 %rd2, [pointwise_div_param_0]; ld.param.u64 %rd3, [pointwise_div_param_1]; ld.param.u64 %rd4, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd5, %rd4; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f1, [%rd7]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd8, %rd2; add.s64 %rd1, %rd8, %rd6; @%p2 bra BB0_3; bra.uni BB0_2; BB0_3: cvta.to.global.u64 %rd9, %rd3; add.s64 %rd11, %rd9, %rd6; ld.global.nc.f32 %f2, [%rd11]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd1], %f3; bra.uni BB0_4; BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd1], %r9; BB0_4: ret; } ` pointwise_div_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<12>; ld.param.u64 %rd2, [pointwise_div_param_0]; ld.param.u64 
%rd3, [pointwise_div_param_1]; ld.param.u64 %rd4, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd5, %rd4; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f1, [%rd7]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd8, %rd2; add.s64 %rd1, %rd8, %rd6; @%p2 bra BB0_3; bra.uni BB0_2; BB0_3: cvta.to.global.u64 %rd9, %rd3; add.s64 %rd11, %rd9, %rd6; ld.global.nc.f32 %f2, [%rd11]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd1], %f3; bra.uni BB0_4; BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd1], %r9; BB0_4: ret; } ` pointwise_div_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<12>; ld.param.u64 %rd2, [pointwise_div_param_0]; ld.param.u64 %rd3, [pointwise_div_param_1]; ld.param.u64 %rd4, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd5, %rd4; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f1, [%rd7]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd8, %rd2; add.s64 %rd1, %rd8, %rd6; @%p2 bra BB0_3; bra.uni BB0_2; BB0_3: cvta.to.global.u64 %rd9, %rd3; add.s64 %rd11, %rd9, %rd6; ld.global.nc.f32 %f2, [%rd11]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd1], %f3; bra.uni BB0_4; BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd1], %r9; BB0_4: ret; } 
` pointwise_div_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<12>; ld.param.u64 %rd2, [pointwise_div_param_0]; ld.param.u64 %rd3, [pointwise_div_param_1]; ld.param.u64 %rd4, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd5, %rd4; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f1, [%rd7]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd8, %rd2; add.s64 %rd1, %rd8, %rd6; @%p2 bra BB0_3; bra.uni BB0_2; BB0_3: cvta.to.global.u64 %rd9, %rd3; add.s64 %rd11, %rd9, %rd6; ld.global.nc.f32 %f2, [%rd11]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd1], %f3; bra.uni BB0_4; BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd1], %r9; BB0_4: ret; } ` pointwise_div_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<12>; ld.param.u64 %rd2, [pointwise_div_param_0]; ld.param.u64 %rd3, [pointwise_div_param_1]; ld.param.u64 %rd4, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd5, %rd4; mul.wide.s32 %rd6, %r1, 4; add.s64 
%rd7, %rd5, %rd6; ld.global.nc.f32 %f1, [%rd7]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd8, %rd2; add.s64 %rd1, %rd8, %rd6; @%p2 bra BB0_3; bra.uni BB0_2; BB0_3: cvta.to.global.u64 %rd9, %rd3; add.s64 %rd11, %rd9, %rd6; ld.global.nc.f32 %f2, [%rd11]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd1], %f3; bra.uni BB0_4; BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd1], %r9; BB0_4: ret; } ` pointwise_div_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl pointwise_div .visible .entry pointwise_div( .param .u64 pointwise_div_param_0, .param .u64 pointwise_div_param_1, .param .u64 pointwise_div_param_2, .param .u32 pointwise_div_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<12>; ld.param.u64 %rd2, [pointwise_div_param_0]; ld.param.u64 %rd3, [pointwise_div_param_1]; ld.param.u64 %rd4, [pointwise_div_param_2]; ld.param.u32 %r2, [pointwise_div_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd5, %rd4; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f1, [%rd7]; setp.neu.f32 %p2, %f1, 0f00000000; cvta.to.global.u64 %rd8, %rd2; add.s64 %rd1, %rd8, %rd6; @%p2 bra BB0_3; bra.uni BB0_2; BB0_3: cvta.to.global.u64 %rd9, %rd3; add.s64 %rd11, %rd9, %rd6; ld.global.nc.f32 %f2, [%rd11]; div.rn.f32 %f3, %f2, %f1; st.global.f32 [%rd1], %f3; bra.uni BB0_4; BB0_2: mov.u32 %r9, 0; st.global.u32 [%rd1], %r9; BB0_4: ret; } ` ) mumax3-3.10/cuda/dmi.cu000066400000000000000000000170671371432437400147100ustar00rootroot00000000000000#include #include "exchange.h" #include "float3.h" #include "stencil.h" #include "amul.h" // Exchange + Dzyaloshinskii-Moriya interaction according to // Bagdanov and Röβler, PRL 87, 3, 2001. eq.8 (out-of-plane symmetry breaking). // Taking into account proper boundary conditions. 
// m: normalized magnetization
// H: effective field in Tesla
// D: dmi strength / Msat, in Tesla*m
// A: Aex/Msat
//
// Arguments, as they are used in the kernel body:
//   Hx, Hy, Hz     : field components; the computed term is accumulated in place (+=)
//   mx, my, mz     : magnetization components, read at the central cell and its neighbors
//   Ms_, Ms_mul    : forwarded to inv_Msat() to obtain the factor applied on write-back
//   aLUT2d, dLUT2d : look-up tables for A and D, indexed with symidx(r0, r1) for a region pair
//   regions        : per-cell region index
//   cx, cy, cz     : cell size along x, y, z
//   Nx, Ny, Nz     : grid size; threads that fall outside the grid return immediately
//   PBC            : not referenced by name below; presumably consumed by the PBCx/PBCy
//                    and clamp macros from stencil.h -- TODO confirm
//   OpenBC         : when non-zero, a missing in-plane neighbor contributes nothing
//
// NOTE(review): the boundary extrapolation evaluates 0.5f*D/A with A taken from aLUT2d;
// a zero entry for the region pair would give inf/NaN -- confirm callers guarantee A != 0.
extern "C" __global__ void
adddmi(float* __restrict__ Hx, float* __restrict__ Hy, float* __restrict__ Hz,
       float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz,
       float* __restrict__ Ms_, float Ms_mul,
       float* __restrict__ aLUT2d, float* __restrict__ dLUT2d, uint8_t* __restrict__ regions,
       float cx, float cy, float cz, int Nx, int Ny, int Nz, uint8_t PBC, uint8_t OpenBC) {

    // one thread per cell: 3D cell coordinates from the 3D launch configuration
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    int iz = blockIdx.z * blockDim.z + threadIdx.z;

    // threads beyond the grid edge have nothing to do
    if (ix >= Nx || iy >= Ny || iz >= Nz) {
        return;
    }

    int I = idx(ix, iy, iz);                      // central cell index
    float3 h = make_float3(0.0,0.0,0.0);          // add to H
    float3 m0 = make_float3(mx[I], my[I], mz[I]); // central m
    uint8_t r0 = regions[I];                      // region of the central cell
    int i_;                                       // neighbor index

    // cells with zero magnetization get no field contribution
    if(is0(m0)) {
        return;
    }

    // x derivatives (along length)
    {
        float3 m1 = make_float3(0.0f, 0.0f, 0.0f);     // left neighbor
        i_ = idx(lclampx(ix-1), iy, iz);               // load neighbor m if inside grid, keep 0 otherwise
        if (ix-1 >= 0 || PBCx) {
            m1 = make_float3(mx[i_], my[i_], mz[i_]);
        }
        int r1 = is0(m1)? r0 : regions[i_];            // don't use inter region params if m1=0
        float A1 = aLUT2d[symidx(r0, r1)];             // inter-region Aex
        float D1 = dLUT2d[symidx(r0, r1)];             // inter-region Dex
        if (!is0(m1) || !OpenBC){                      // do nothing at an open boundary
            if (is0(m1)) {                             // neighbor missing
                m1.x = m0.x - (-cx * (0.5f*D1/A1) * m0.z);   // extrapolate missing m from Neumann BC's
                m1.y = m0.y;
                m1.z = m0.z + (-cx * (0.5f*D1/A1) * m0.x);
            }
            h   += (2.0f*A1/(cx*cx)) * (m1 - m0);      // exchange
            h.x += (D1/cx)*(- m1.z);                   // DMI term, left neighbor enters with a minus sign
            h.z -= (D1/cx)*(- m1.x);
        }
    }

    {
        float3 m2 = make_float3(0.0f, 0.0f, 0.0f);     // right neighbor
        i_ = idx(hclampx(ix+1), iy, iz);
        if (ix+1 < Nx || PBCx) {
            m2 = make_float3(mx[i_], my[i_], mz[i_]);
        }
        int r2 = is0(m2)? r0 : regions[i_];            // fall back to own region if neighbor is empty
        float A2 = aLUT2d[symidx(r0, r2)];
        float D2 = dLUT2d[symidx(r0, r2)];
        if (!is0(m2) || !OpenBC){
            if (is0(m2)) {                             // neighbor missing: same extrapolation, +cx side
                m2.x = m0.x - (cx * (0.5f*D2/A2) * m0.z);
                m2.y = m0.y;
                m2.z = m0.z + (cx * (0.5f*D2/A2) * m0.x);
            }
            h   += (2.0f*A2/(cx*cx)) * (m2 - m0);      // exchange
            h.x += (D2/cx)*(m2.z);                     // DMI term, right neighbor
            h.z -= (D2/cx)*(m2.x);
        }
    }

    // y derivatives (along height)
    {
        float3 m1 = make_float3(0.0f, 0.0f, 0.0f);     // neighbor at iy-1
        i_ = idx(ix, lclampy(iy-1), iz);
        if (iy-1 >= 0 || PBCy) {
            m1 = make_float3(mx[i_], my[i_], mz[i_]);
        }
        int r1 = is0(m1)? r0 : regions[i_];
        float A1 = aLUT2d[symidx(r0, r1)];
        float D1 = dLUT2d[symidx(r0, r1)];
        if (!is0(m1) || !OpenBC){
            if (is0(m1)) {                             // neighbor missing: extrapolate, -cy side
                m1.x = m0.x;
                m1.y = m0.y - (-cy * (0.5f*D1/A1) * m0.z);
                m1.z = m0.z + (-cy * (0.5f*D1/A1) * m0.y);
            }
            h   += (2.0f*A1/(cy*cy)) * (m1 - m0);      // exchange
            h.y += (D1/cy)*(- m1.z);                   // DMI term couples y and z here
            h.z -= (D1/cy)*(- m1.y);
        }
    }

    {
        float3 m2 = make_float3(0.0f, 0.0f, 0.0f);     // neighbor at iy+1
        i_ = idx(ix, hclampy(iy+1), iz);
        if (iy+1 < Ny || PBCy) {
            m2 = make_float3(mx[i_], my[i_], mz[i_]);
        }
        int r2 = is0(m2)? r0 : regions[i_];
        float A2 = aLUT2d[symidx(r0, r2)];
        float D2 = dLUT2d[symidx(r0, r2)];
        if (!is0(m2) || !OpenBC){
            if (is0(m2)) {                             // neighbor missing: extrapolate, +cy side
                m2.x = m0.x;
                m2.y = m0.y - (cy * (0.5f*D2/A2) * m0.z);
                m2.z = m0.z + (cy * (0.5f*D2/A2) * m0.y);
            }
            h   += (2.0f*A2/(cy*cy)) * (m2 - m0);      // exchange
            h.y += (D2/cy)*(m2.z);
            h.z -= (D2/cy)*(m2.y);
        }
    }

    // only take vertical derivative for 3D sim
    if (Nz != 1) {
        // bottom neighbor
        {
            i_  = idx(ix, iy, lclampz(iz-1));
            float3 m1  = make_float3(mx[i_], my[i_], mz[i_]);
            m1  = ( is0(m1)? m0: m1 );                         // Neumann BC
            float A1 = aLUT2d[symidx(r0, regions[i_])];
            h += (2.0f*A1/(cz*cz)) * (m1 - m0);                // Exchange only
        }

        // top neighbor
        {
            i_  = idx(ix, iy, hclampz(iz+1));
            float3 m2  = make_float3(mx[i_], my[i_], mz[i_]);
            m2  = ( is0(m2)? m0: m2 );                         // empty neighbor: use m0, so the difference vanishes
            float A2 = aLUT2d[symidx(r0, regions[i_])];
            h += (2.0f*A2/(cz*cz)) * (m2 - m0);                // exchange only, no D term along z
        }
    }

    // write back, result is H + Hdmi + Hex
    float invMs = inv_Msat(Ms_, Ms_mul, I);    // scale factor from the Ms_/Ms_mul pair (helper defined elsewhere)
    Hx[I] += h.x*invMs;
    Hy[I] += h.y*invMs;
    Hz[I] += h.z*invMs;
}

// Note on boundary conditions.
// // We need the derivative and laplacian of m in point A, but e.g. C lies out of the boundaries. // We use the boundary condition in B (derivative of the magnetization) to extrapolate m to point C: // m_C = m_A + (dm/dx)|_B * cellsize // // When point C is inside the boundary, we just use its actual value. // // Then we can take the central derivative in A: // (dm/dx)|_A = (m_C - m_D) / (2*cellsize) // And the laplacian: // lapl(m)|_A = (m_C + m_D - 2*m_A) / (cellsize^2) // // All these operations should be second order as they involve only central derivatives. // // ------------------------------------------------------------------ * // | | C | // | | ** | // | | *** | // | | *** | // | | *** | // | | *** | // | B | // | *** | | // | *** | | // | **** | | // | **** | | // | **** | | // | ** A | | // | ***** | | // | ****** | | // | ********* | | // |D ******** | | // | | | // +----------------+----------------+-----------------+---------------+ // -1 -0.5 0 0.5 1 // x mumax3-3.10/cuda/dmi.go000066400000000000000000000015521371432437400146760ustar00rootroot00000000000000package cuda import ( "unsafe" "github.com/mumax/3/data" "github.com/mumax/3/util" ) // Add effective field of Dzyaloshinskii-Moriya interaction to Beff (Tesla). // According to Bagdanov and Röβler, PRL 87, 3, 2001. eq.8 (out-of-plane symmetry breaking). 
// See dmi.cu func AddDMI(Beff *data.Slice, m *data.Slice, Aex_red, Dex_red SymmLUT, Msat MSlice, regions *Bytes, mesh *data.Mesh, OpenBC bool) { cellsize := mesh.CellSize() N := Beff.Size() util.Argument(m.Size() == N) cfg := make3DConf(N) var openBC byte if OpenBC { openBC = 1 } k_adddmi_async(Beff.DevPtr(X), Beff.DevPtr(Y), Beff.DevPtr(Z), m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z), Msat.DevPtr(0), Msat.Mul(0), unsafe.Pointer(Aex_red), unsafe.Pointer(Dex_red), regions.Ptr, float32(cellsize[X]), float32(cellsize[Y]), float32(cellsize[Z]), N[X], N[Y], N[Z], mesh.PBC_code(), openBC, cfg) } mumax3-3.10/cuda/dmi_wrapper.go000066400000000000000000006267161371432437400164550ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for adddmi kernel var adddmi_code cu.Function // Stores the arguments for adddmi kernel invocation type adddmi_args_t struct { arg_Hx unsafe.Pointer arg_Hy unsafe.Pointer arg_Hz unsafe.Pointer arg_mx unsafe.Pointer arg_my unsafe.Pointer arg_mz unsafe.Pointer arg_Ms_ unsafe.Pointer arg_Ms_mul float32 arg_aLUT2d unsafe.Pointer arg_dLUT2d unsafe.Pointer arg_regions unsafe.Pointer arg_cx float32 arg_cy float32 arg_cz float32 arg_Nx int arg_Ny int arg_Nz int arg_PBC byte arg_OpenBC byte argptr [19]unsafe.Pointer sync.Mutex } // Stores the arguments for adddmi kernel invocation var adddmi_args adddmi_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
adddmi_args.argptr[0] = unsafe.Pointer(&adddmi_args.arg_Hx) adddmi_args.argptr[1] = unsafe.Pointer(&adddmi_args.arg_Hy) adddmi_args.argptr[2] = unsafe.Pointer(&adddmi_args.arg_Hz) adddmi_args.argptr[3] = unsafe.Pointer(&adddmi_args.arg_mx) adddmi_args.argptr[4] = unsafe.Pointer(&adddmi_args.arg_my) adddmi_args.argptr[5] = unsafe.Pointer(&adddmi_args.arg_mz) adddmi_args.argptr[6] = unsafe.Pointer(&adddmi_args.arg_Ms_) adddmi_args.argptr[7] = unsafe.Pointer(&adddmi_args.arg_Ms_mul) adddmi_args.argptr[8] = unsafe.Pointer(&adddmi_args.arg_aLUT2d) adddmi_args.argptr[9] = unsafe.Pointer(&adddmi_args.arg_dLUT2d) adddmi_args.argptr[10] = unsafe.Pointer(&adddmi_args.arg_regions) adddmi_args.argptr[11] = unsafe.Pointer(&adddmi_args.arg_cx) adddmi_args.argptr[12] = unsafe.Pointer(&adddmi_args.arg_cy) adddmi_args.argptr[13] = unsafe.Pointer(&adddmi_args.arg_cz) adddmi_args.argptr[14] = unsafe.Pointer(&adddmi_args.arg_Nx) adddmi_args.argptr[15] = unsafe.Pointer(&adddmi_args.arg_Ny) adddmi_args.argptr[16] = unsafe.Pointer(&adddmi_args.arg_Nz) adddmi_args.argptr[17] = unsafe.Pointer(&adddmi_args.arg_PBC) adddmi_args.argptr[18] = unsafe.Pointer(&adddmi_args.arg_OpenBC) } // Wrapper for adddmi CUDA kernel, asynchronous. 
func k_adddmi_async(Hx unsafe.Pointer, Hy unsafe.Pointer, Hz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, aLUT2d unsafe.Pointer, dLUT2d unsafe.Pointer, regions unsafe.Pointer, cx float32, cy float32, cz float32, Nx int, Ny int, Nz int, PBC byte, OpenBC byte, cfg *config) { if Synchronous { // debug Sync() timer.Start("adddmi") } adddmi_args.Lock() defer adddmi_args.Unlock() if adddmi_code == 0 { adddmi_code = fatbinLoad(adddmi_map, "adddmi") } adddmi_args.arg_Hx = Hx adddmi_args.arg_Hy = Hy adddmi_args.arg_Hz = Hz adddmi_args.arg_mx = mx adddmi_args.arg_my = my adddmi_args.arg_mz = mz adddmi_args.arg_Ms_ = Ms_ adddmi_args.arg_Ms_mul = Ms_mul adddmi_args.arg_aLUT2d = aLUT2d adddmi_args.arg_dLUT2d = dLUT2d adddmi_args.arg_regions = regions adddmi_args.arg_cx = cx adddmi_args.arg_cy = cy adddmi_args.arg_cz = cz adddmi_args.arg_Nx = Nx adddmi_args.arg_Ny = Ny adddmi_args.arg_Nz = Nz adddmi_args.arg_PBC = PBC adddmi_args.arg_OpenBC = OpenBC args := adddmi_args.argptr[:] cu.LaunchKernel(adddmi_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("adddmi") } } // maps compute capability on PTX code for adddmi kernel. var adddmi_map = map[int]string{0: "", 30: adddmi_ptx_30, 32: adddmi_ptx_32, 35: adddmi_ptx_35, 37: adddmi_ptx_37, 50: adddmi_ptx_50, 52: adddmi_ptx_52, 53: adddmi_ptx_53, 60: adddmi_ptx_60, 61: adddmi_ptx_61, 62: adddmi_ptx_62, 70: adddmi_ptx_70, 72: adddmi_ptx_72, 75: adddmi_ptx_75} // adddmi PTX code for various compute capabilities. 
const ( adddmi_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<59>; .reg .b16 %rs<39>; .reg .f32 %f<263>; .reg .b32 %r<219>; .reg .b64 %rd<117>; ld.param.u64 %rd1, [adddmi_param_0]; ld.param.u64 %rd2, [adddmi_param_1]; ld.param.u64 %rd3, [adddmi_param_2]; ld.param.u64 %rd4, [adddmi_param_3]; ld.param.u64 %rd5, [adddmi_param_4]; ld.param.u64 %rd6, [adddmi_param_5]; ld.param.u64 %rd7, [adddmi_param_6]; ld.param.f32 %f261, [adddmi_param_7]; ld.param.u64 %rd8, [adddmi_param_8]; ld.param.u64 %rd9, [adddmi_param_9]; ld.param.u64 %rd10, [adddmi_param_10]; ld.param.f32 %f99, [adddmi_param_11]; ld.param.f32 %f100, [adddmi_param_12]; ld.param.f32 %f101, [adddmi_param_13]; ld.param.u32 %r33, [adddmi_param_14]; ld.param.u32 %r34, [adddmi_param_15]; ld.param.u32 %r35, [adddmi_param_16]; ld.param.u8 %rs13, [adddmi_param_18]; ld.param.u8 %rs12, [adddmi_param_17]; mov.u32 %r36, %ntid.x; mov.u32 %r37, %ctaid.x; mov.u32 %r38, %tid.x; mad.lo.s32 %r1, %r36, %r37, %r38; mov.u32 %r39, %ntid.y; mov.u32 %r40, %ctaid.y; mov.u32 %r41, %tid.y; mad.lo.s32 %r2, %r39, %r40, %r41; mov.u32 %r42, %ntid.z; mov.u32 %r43, %ctaid.z; mov.u32 %r44, %tid.z; mad.lo.s32 %r3, %r42, %r43, %r44; setp.ge.s32 %p1, %r2, %r34; setp.ge.s32 %p2, %r1, %r33; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r35; or.pred %p5, %p3, %p4; @%p5 bra BB0_59; cvta.to.global.u64 %rd11, %rd10; cvta.to.global.u64 %rd12, %rd6; 
cvta.to.global.u64 %rd13, %rd5; cvta.to.global.u64 %rd14, %rd4; mad.lo.s32 %r45, %r3, %r34, %r2; mul.lo.s32 %r4, %r45, %r33; add.s32 %r46, %r4, %r1; mul.wide.s32 %rd15, %r46, 4; add.s64 %rd16, %rd14, %rd15; cvt.s64.s32 %rd17, %r46; add.s64 %rd18, %rd13, %rd15; add.s64 %rd19, %rd12, %rd15; add.s64 %rd20, %rd11, %rd17; ld.global.u8 %rs1, [%rd20]; ld.global.f32 %f1, [%rd16]; ld.global.f32 %f2, [%rd18]; mul.f32 %f102, %f2, %f2; fma.rn.f32 %f103, %f1, %f1, %f102; ld.global.f32 %f3, [%rd19]; fma.rn.f32 %f104, %f3, %f3, %f103; setp.eq.f32 %p6, %f104, 0f00000000; @%p6 bra BB0_59; and.b16 %rs14, %rs12, 1; setp.eq.b16 %p7, %rs14, 1; add.s32 %r5, %r1, -1; @!%p7 bra BB0_4; bra.uni BB0_3; BB0_3: rem.s32 %r51, %r5, %r33; add.s32 %r52, %r51, %r33; rem.s32 %r213, %r52, %r33; bra.uni BB0_5; BB0_4: mov.u32 %r53, 0; max.s32 %r213, %r5, %r53; BB0_5: add.s32 %r9, %r213, %r4; setp.eq.b16 %p8, %rs14, 1; not.pred %p9, %p8; setp.lt.s32 %p10, %r5, 0; mov.f32 %f225, 0f00000000; and.pred %p11, %p10, %p9; mov.f32 %f226, %f225; mov.f32 %f227, %f225; @%p11 bra BB0_7; mul.wide.s32 %rd22, %r9, 4; add.s64 %rd23, %rd14, %rd22; ld.global.f32 %f225, [%rd23]; add.s64 %rd25, %rd13, %rd22; ld.global.f32 %f226, [%rd25]; add.s64 %rd27, %rd12, %rd22; ld.global.f32 %f227, [%rd27]; BB0_7: mul.f32 %f108, %f226, %f226; fma.rn.f32 %f109, %f225, %f225, %f108; fma.rn.f32 %f10, %f227, %f227, %f109; setp.eq.f32 %p12, %f10, 0f00000000; mov.u16 %rs35, %rs1; @%p12 bra BB0_9; cvt.s64.s32 %rd29, %r9; add.s64 %rd30, %rd11, %rd29; ld.global.u8 %rs35, [%rd30]; BB0_9: cvt.u32.u16 %r54, %rs1; and.b32 %r55, %r54, 255; setp.gt.u16 %p13, %rs35, %rs1; cvt.u32.u16 %r56, %rs35; and.b32 %r57, %r56, 255; selp.b32 %r58, %r55, %r57, %p13; selp.b32 %r59, %r57, %r55, %p13; add.s32 %r60, %r59, 1; mul.lo.s32 %r61, %r60, %r59; shr.u32 %r62, %r61, 1; add.s32 %r63, %r62, %r58; cvta.to.global.u64 %rd31, %rd8; mul.wide.s32 %rd32, %r63, 4; add.s64 %rd33, %rd31, %rd32; ld.global.f32 %f11, [%rd33]; cvta.to.global.u64 %rd34, %rd9; add.s64 %rd35, 
%rd34, %rd32; ld.global.f32 %f12, [%rd35]; setp.ne.s16 %p14, %rs13, 0; mov.f32 %f237, 0f00000000; and.pred %p16, %p12, %p14; mov.f32 %f238, %f237; mov.f32 %f239, %f237; @%p16 bra BB0_13; setp.neu.f32 %p17, %f10, 0f00000000; @%p17 bra BB0_12; mul.f32 %f113, %f12, 0f3F000000; div.rn.f32 %f114, %f113, %f11; mul.f32 %f115, %f114, %f99; fma.rn.f32 %f225, %f3, %f115, %f1; mul.f32 %f116, %f1, %f115; sub.f32 %f227, %f3, %f116; mov.f32 %f226, %f2; BB0_12: mul.f32 %f117, %f99, %f99; add.f32 %f118, %f11, %f11; div.rn.f32 %f119, %f118, %f117; sub.f32 %f120, %f225, %f1; sub.f32 %f121, %f226, %f2; sub.f32 %f122, %f227, %f3; fma.rn.f32 %f123, %f120, %f119, 0f00000000; fma.rn.f32 %f238, %f121, %f119, 0f00000000; fma.rn.f32 %f124, %f119, %f122, 0f00000000; div.rn.f32 %f125, %f12, %f99; mul.f32 %f126, %f227, %f125; sub.f32 %f237, %f123, %f126; fma.rn.f32 %f239, %f225, %f125, %f124; BB0_13: setp.eq.b16 %p18, %rs14, 1; add.s32 %r10, %r1, 1; @!%p18 bra BB0_15; bra.uni BB0_14; BB0_14: rem.s32 %r68, %r10, %r33; add.s32 %r69, %r68, %r33; rem.s32 %r214, %r69, %r33; bra.uni BB0_16; BB0_15: add.s32 %r70, %r33, -1; min.s32 %r214, %r10, %r70; BB0_16: setp.eq.b16 %p19, %rs14, 1; not.pred %p20, %p19; add.s32 %r14, %r214, %r4; setp.ge.s32 %p21, %r10, %r33; mov.f32 %f231, 0f00000000; and.pred %p22, %p21, %p20; mov.f32 %f232, %f231; mov.f32 %f233, %f231; @%p22 bra BB0_18; mul.wide.s32 %rd37, %r14, 4; add.s64 %rd38, %rd14, %rd37; ld.global.f32 %f231, [%rd38]; add.s64 %rd40, %rd13, %rd37; ld.global.f32 %f232, [%rd40]; add.s64 %rd42, %rd12, %rd37; ld.global.f32 %f233, [%rd42]; BB0_18: mul.f32 %f130, %f232, %f232; fma.rn.f32 %f131, %f231, %f231, %f130; fma.rn.f32 %f30, %f233, %f233, %f131; setp.eq.f32 %p23, %f30, 0f00000000; mov.u16 %rs36, %rs1; @%p23 bra BB0_20; cvt.s64.s32 %rd44, %r14; add.s64 %rd45, %rd11, %rd44; ld.global.u8 %rs36, [%rd45]; BB0_20: setp.gt.u16 %p24, %rs36, %rs1; cvt.u32.u16 %r73, %rs36; and.b32 %r74, %r73, 255; selp.b32 %r75, %r55, %r74, %p24; selp.b32 %r76, %r74, %r55, %p24; 
add.s32 %r77, %r76, 1; mul.lo.s32 %r78, %r77, %r76; shr.u32 %r79, %r78, 1; add.s32 %r80, %r79, %r75; mul.wide.s32 %rd47, %r80, 4; add.s64 %rd48, %rd31, %rd47; ld.global.f32 %f31, [%rd48]; add.s64 %rd50, %rd34, %rd47; ld.global.f32 %f32, [%rd50]; and.pred %p27, %p23, %p14; @%p27 bra BB0_24; setp.neu.f32 %p28, %f30, 0f00000000; @%p28 bra BB0_23; mul.f32 %f132, %f32, 0f3F000000; div.rn.f32 %f133, %f132, %f31; mul.f32 %f134, %f133, %f99; mul.f32 %f135, %f3, %f134; sub.f32 %f231, %f1, %f135; fma.rn.f32 %f233, %f1, %f134, %f3; mov.f32 %f232, %f2; BB0_23: mul.f32 %f136, %f99, %f99; add.f32 %f137, %f31, %f31; div.rn.f32 %f138, %f137, %f136; sub.f32 %f139, %f231, %f1; sub.f32 %f140, %f232, %f2; sub.f32 %f141, %f233, %f3; fma.rn.f32 %f142, %f139, %f138, %f237; fma.rn.f32 %f238, %f140, %f138, %f238; fma.rn.f32 %f143, %f138, %f141, %f239; div.rn.f32 %f144, %f32, %f99; fma.rn.f32 %f237, %f233, %f144, %f142; mul.f32 %f145, %f231, %f144; sub.f32 %f239, %f143, %f145; BB0_24: and.b16 %rs6, %rs12, 2; setp.eq.s16 %p29, %rs6, 0; add.s32 %r15, %r2, -1; @%p29 bra BB0_26; rem.s32 %r85, %r15, %r34; add.s32 %r86, %r85, %r34; rem.s32 %r215, %r86, %r34; bra.uni BB0_27; BB0_26: mov.u32 %r87, 0; max.s32 %r215, %r15, %r87; BB0_27: mad.lo.s32 %r92, %r3, %r34, %r215; mad.lo.s32 %r19, %r92, %r33, %r1; setp.lt.s32 %p31, %r15, 0; mov.f32 %f240, 0f00000000; and.pred %p32, %p31, %p29; mov.f32 %f241, %f240; mov.f32 %f242, %f240; @%p32 bra BB0_29; mul.wide.s32 %rd52, %r19, 4; add.s64 %rd53, %rd14, %rd52; ld.global.f32 %f240, [%rd53]; add.s64 %rd55, %rd13, %rd52; ld.global.f32 %f241, [%rd55]; add.s64 %rd57, %rd12, %rd52; ld.global.f32 %f242, [%rd57]; BB0_29: mul.f32 %f149, %f241, %f241; fma.rn.f32 %f150, %f240, %f240, %f149; fma.rn.f32 %f50, %f242, %f242, %f150; setp.eq.f32 %p33, %f50, 0f00000000; mov.u16 %rs37, %rs1; @%p33 bra BB0_31; cvt.s64.s32 %rd59, %r19; add.s64 %rd60, %rd11, %rd59; ld.global.u8 %rs37, [%rd60]; BB0_31: setp.gt.u16 %p34, %rs37, %rs1; cvt.u32.u16 %r99, %rs37; and.b32 %r100, %r99, 
255; selp.b32 %r101, %r55, %r100, %p34; selp.b32 %r102, %r100, %r55, %p34; add.s32 %r103, %r102, 1; mul.lo.s32 %r104, %r103, %r102; shr.u32 %r105, %r104, 1; add.s32 %r106, %r105, %r101; mul.wide.s32 %rd62, %r106, 4; add.s64 %rd63, %rd31, %rd62; ld.global.f32 %f51, [%rd63]; add.s64 %rd65, %rd34, %rd62; ld.global.f32 %f52, [%rd65]; and.pred %p37, %p33, %p14; @%p37 bra BB0_35; setp.neu.f32 %p38, %f50, 0f00000000; @%p38 bra BB0_34; mul.f32 %f151, %f52, 0f3F000000; div.rn.f32 %f152, %f151, %f51; mul.f32 %f153, %f152, %f100; fma.rn.f32 %f241, %f3, %f153, %f2; mul.f32 %f154, %f2, %f153; sub.f32 %f242, %f3, %f154; mov.f32 %f240, %f1; BB0_34: mul.f32 %f155, %f100, %f100; add.f32 %f156, %f51, %f51; div.rn.f32 %f157, %f156, %f155; sub.f32 %f158, %f240, %f1; sub.f32 %f159, %f241, %f2; sub.f32 %f160, %f242, %f3; fma.rn.f32 %f237, %f158, %f157, %f237; fma.rn.f32 %f161, %f159, %f157, %f238; fma.rn.f32 %f162, %f157, %f160, %f239; div.rn.f32 %f163, %f52, %f100; mul.f32 %f164, %f242, %f163; sub.f32 %f238, %f161, %f164; fma.rn.f32 %f239, %f241, %f163, %f162; BB0_35: add.s32 %r20, %r2, 1; @%p29 bra BB0_37; rem.s32 %r111, %r20, %r34; add.s32 %r112, %r111, %r34; rem.s32 %r216, %r112, %r34; bra.uni BB0_38; BB0_37: add.s32 %r113, %r34, -1; min.s32 %r216, %r20, %r113; BB0_38: shr.u16 %rs26, %rs12, 1; and.b16 %rs27, %rs26, 1; setp.eq.b16 %p40, %rs27, 1; not.pred %p41, %p40; mad.lo.s32 %r118, %r3, %r34, %r216; mad.lo.s32 %r24, %r118, %r33, %r1; setp.ge.s32 %p42, %r20, %r34; mov.f32 %f249, 0f00000000; and.pred %p43, %p42, %p41; mov.f32 %f250, %f249; mov.f32 %f251, %f249; @%p43 bra BB0_40; mul.wide.s32 %rd67, %r24, 4; add.s64 %rd68, %rd14, %rd67; ld.global.f32 %f249, [%rd68]; add.s64 %rd70, %rd13, %rd67; ld.global.f32 %f250, [%rd70]; add.s64 %rd72, %rd12, %rd67; ld.global.f32 %f251, [%rd72]; BB0_40: mul.f32 %f168, %f250, %f250; fma.rn.f32 %f169, %f249, %f249, %f168; fma.rn.f32 %f70, %f251, %f251, %f169; setp.eq.f32 %p44, %f70, 0f00000000; mov.u16 %rs38, %rs1; @%p44 bra BB0_42; cvt.s64.s32 
%rd74, %r24; add.s64 %rd75, %rd11, %rd74; ld.global.u8 %rs38, [%rd75]; BB0_42: setp.gt.u16 %p45, %rs38, %rs1; cvt.u32.u16 %r125, %rs38; and.b32 %r126, %r125, 255; selp.b32 %r127, %r55, %r126, %p45; selp.b32 %r128, %r126, %r55, %p45; add.s32 %r129, %r128, 1; mul.lo.s32 %r130, %r129, %r128; shr.u32 %r131, %r130, 1; add.s32 %r132, %r131, %r127; mul.wide.s32 %rd77, %r132, 4; add.s64 %rd78, %rd31, %rd77; ld.global.f32 %f71, [%rd78]; add.s64 %rd80, %rd34, %rd77; ld.global.f32 %f72, [%rd80]; and.pred %p48, %p44, %p14; @%p48 bra BB0_46; setp.neu.f32 %p49, %f70, 0f00000000; @%p49 bra BB0_45; mul.f32 %f170, %f72, 0f3F000000; div.rn.f32 %f171, %f170, %f71; mul.f32 %f172, %f171, %f100; mul.f32 %f173, %f3, %f172; sub.f32 %f250, %f2, %f173; fma.rn.f32 %f251, %f2, %f172, %f3; mov.f32 %f249, %f1; BB0_45: mul.f32 %f174, %f100, %f100; add.f32 %f175, %f71, %f71; div.rn.f32 %f176, %f175, %f174; sub.f32 %f177, %f249, %f1; sub.f32 %f178, %f250, %f2; sub.f32 %f179, %f251, %f3; fma.rn.f32 %f237, %f177, %f176, %f237; fma.rn.f32 %f180, %f178, %f176, %f238; fma.rn.f32 %f181, %f176, %f179, %f239; div.rn.f32 %f182, %f72, %f100; fma.rn.f32 %f238, %f251, %f182, %f180; mul.f32 %f183, %f250, %f182; sub.f32 %f239, %f181, %f183; BB0_46: setp.eq.s32 %p50, %r35, 1; @%p50 bra BB0_54; and.b16 %rs11, %rs12, 4; setp.eq.s16 %p51, %rs11, 0; add.s32 %r25, %r3, -1; @%p51 bra BB0_49; rem.s32 %r137, %r25, %r35; add.s32 %r138, %r137, %r35; rem.s32 %r217, %r138, %r35; bra.uni BB0_50; BB0_49: mov.u32 %r139, 0; max.s32 %r217, %r25, %r139; BB0_50: mad.lo.s32 %r144, %r217, %r34, %r2; mad.lo.s32 %r149, %r144, %r33, %r1; cvt.s64.s32 %rd82, %r149; mul.wide.s32 %rd83, %r149, 4; add.s64 %rd84, %rd14, %rd83; add.s64 %rd86, %rd13, %rd83; add.s64 %rd88, %rd12, %rd83; ld.global.f32 %f184, [%rd84]; ld.global.f32 %f185, [%rd86]; mul.f32 %f186, %f185, %f185; fma.rn.f32 %f187, %f184, %f184, %f186; ld.global.f32 %f188, [%rd88]; fma.rn.f32 %f189, %f188, %f188, %f187; setp.eq.f32 %p52, %f189, 0f00000000; selp.f32 %f190, %f1, %f184, 
%p52; selp.f32 %f191, %f2, %f185, %p52; selp.f32 %f192, %f3, %f188, %p52; add.s64 %rd90, %rd11, %rd82; ld.global.u8 %rs30, [%rd90]; setp.gt.u16 %p53, %rs30, %rs1; cvt.u32.u16 %r150, %rs30; selp.b32 %r153, %r55, %r150, %p53; selp.b32 %r154, %r150, %r55, %p53; add.s32 %r155, %r154, 1; mul.lo.s32 %r156, %r155, %r154; shr.u32 %r157, %r156, 1; add.s32 %r158, %r157, %r153; mul.wide.s32 %rd92, %r158, 4; add.s64 %rd93, %rd31, %rd92; ld.global.f32 %f193, [%rd93]; add.f32 %f194, %f193, %f193; mul.f32 %f84, %f101, %f101; div.rn.f32 %f195, %f194, %f84; sub.f32 %f196, %f190, %f1; sub.f32 %f197, %f191, %f2; sub.f32 %f198, %f192, %f3; fma.rn.f32 %f85, %f195, %f196, %f237; fma.rn.f32 %f86, %f195, %f197, %f238; fma.rn.f32 %f87, %f195, %f198, %f239; add.s32 %r29, %r3, 1; @%p51 bra BB0_52; rem.s32 %r163, %r29, %r35; add.s32 %r164, %r163, %r35; rem.s32 %r218, %r164, %r35; bra.uni BB0_53; BB0_52: add.s32 %r165, %r35, -1; min.s32 %r218, %r29, %r165; BB0_53: mad.lo.s32 %r170, %r218, %r34, %r2; mad.lo.s32 %r175, %r170, %r33, %r1; cvt.s64.s32 %rd95, %r175; mul.wide.s32 %rd96, %r175, 4; add.s64 %rd97, %rd14, %rd96; add.s64 %rd99, %rd13, %rd96; add.s64 %rd101, %rd12, %rd96; ld.global.f32 %f199, [%rd97]; ld.global.f32 %f200, [%rd99]; mul.f32 %f201, %f200, %f200; fma.rn.f32 %f202, %f199, %f199, %f201; ld.global.f32 %f203, [%rd101]; fma.rn.f32 %f204, %f203, %f203, %f202; setp.eq.f32 %p55, %f204, 0f00000000; selp.f32 %f205, %f3, %f203, %p55; selp.f32 %f206, %f2, %f200, %p55; selp.f32 %f207, %f1, %f199, %p55; add.s64 %rd103, %rd11, %rd95; ld.global.u8 %rs33, [%rd103]; setp.gt.u16 %p56, %rs33, %rs1; cvt.u32.u16 %r176, %rs33; selp.b32 %r179, %r55, %r176, %p56; selp.b32 %r180, %r176, %r55, %p56; add.s32 %r181, %r180, 1; mul.lo.s32 %r182, %r181, %r180; shr.u32 %r183, %r182, 1; add.s32 %r184, %r183, %r179; mul.wide.s32 %rd105, %r184, 4; add.s64 %rd106, %rd31, %rd105; ld.global.f32 %f208, [%rd106]; add.f32 %f209, %f208, %f208; div.rn.f32 %f210, %f209, %f84; sub.f32 %f211, %f207, %f1; sub.f32 %f212, 
%f206, %f2; sub.f32 %f213, %f205, %f3; fma.rn.f32 %f237, %f210, %f211, %f85; fma.rn.f32 %f238, %f210, %f212, %f86; fma.rn.f32 %f239, %f210, %f213, %f87; BB0_54: setp.eq.s64 %p57, %rd7, 0; @%p57 bra BB0_56; mad.lo.s32 %r198, %r45, %r33, %r1; cvta.to.global.u64 %rd107, %rd7; mul.wide.s32 %rd108, %r198, 4; add.s64 %rd109, %rd107, %rd108; ld.global.f32 %f214, [%rd109]; mul.f32 %f261, %f214, %f261; BB0_56: setp.eq.f32 %p58, %f261, 0f00000000; mov.f32 %f262, 0f00000000; @%p58 bra BB0_58; rcp.rn.f32 %f262, %f261; BB0_58: mad.lo.s32 %r212, %r45, %r33, %r1; cvta.to.global.u64 %rd110, %rd1; mul.wide.s32 %rd111, %r212, 4; add.s64 %rd112, %rd110, %rd111; ld.global.f32 %f216, [%rd112]; fma.rn.f32 %f217, %f237, %f262, %f216; st.global.f32 [%rd112], %f217; cvta.to.global.u64 %rd113, %rd2; add.s64 %rd114, %rd113, %rd111; ld.global.f32 %f218, [%rd114]; fma.rn.f32 %f219, %f238, %f262, %f218; st.global.f32 [%rd114], %f219; cvta.to.global.u64 %rd115, %rd3; add.s64 %rd116, %rd115, %rd111; ld.global.f32 %f220, [%rd116]; fma.rn.f32 %f221, %f239, %f262, %f220; st.global.f32 [%rd116], %f221; BB0_59: ret; } ` adddmi_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<56>; .reg .b16 %rs<37>; .reg .f32 %f<263>; .reg .b32 %r<128>; .reg .b64 %rd<85>; ld.param.u64 %rd7, [adddmi_param_0]; ld.param.u64 %rd8, [adddmi_param_1]; ld.param.u64 %rd9, [adddmi_param_2]; ld.param.u64 %rd11, [adddmi_param_3]; 
ld.param.u64 %rd12, [adddmi_param_4]; ld.param.u64 %rd13, [adddmi_param_5]; ld.param.u64 %rd10, [adddmi_param_6]; ld.param.f32 %f261, [adddmi_param_7]; ld.param.u64 %rd14, [adddmi_param_8]; ld.param.u64 %rd15, [adddmi_param_9]; ld.param.u64 %rd16, [adddmi_param_10]; ld.param.f32 %f99, [adddmi_param_11]; ld.param.f32 %f100, [adddmi_param_12]; ld.param.f32 %f101, [adddmi_param_13]; ld.param.u32 %r36, [adddmi_param_14]; ld.param.u32 %r37, [adddmi_param_15]; ld.param.u32 %r38, [adddmi_param_16]; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 %rs13, [adddmi_param_17]; cvta.to.global.u64 %rd1, %rd15; cvta.to.global.u64 %rd2, %rd14; cvta.to.global.u64 %rd3, %rd16; cvta.to.global.u64 %rd4, %rd13; cvta.to.global.u64 %rd5, %rd12; cvta.to.global.u64 %rd6, %rd11; mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, %r39, %r40, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, %tid.y; mad.lo.s32 %r2, %r42, %r43, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r45, %r46, %r47; setp.ge.s32 %p1, %r2, %r37; setp.ge.s32 %p2, %r1, %r36; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra BB0_59; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; mul.lo.s32 %r5, %r48, %r36; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd17, %r6, 4; add.s64 %rd18, %rd6, %rd17; cvt.s64.s32 %rd19, %r6; add.s64 %rd20, %rd5, %rd17; add.s64 %rd21, %rd4, %rd17; add.s64 %rd22, %rd3, %rd19; ld.global.nc.u8 %rs1, [%rd22]; cvt.u32.u16 %r49, %rs1; and.b32 %r7, %r49, 255; ld.global.nc.f32 %f1, [%rd18]; ld.global.nc.f32 %f2, [%rd20]; mul.f32 %f102, %f2, %f2; fma.rn.f32 %f103, %f1, %f1, %f102; ld.global.nc.f32 %f3, [%rd21]; fma.rn.f32 %f104, %f3, %f3, %f103; setp.eq.f32 %p6, %f104, 0f00000000; @%p6 bra BB0_59; and.b16 %rs2, %rs13, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r50, %r8, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r122, %r51, %r36; bra.uni BB0_5; BB0_4: 
mov.u32 %r52, 0; max.s32 %r122, %r8, %r52; BB0_5: add.s32 %r12, %r122, %r5; setp.lt.s32 %p9, %r8, 0; mov.f32 %f225, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f226, %f225; mov.f32 %f227, %f225; @%p10 bra BB0_7; mul.wide.s32 %rd23, %r12, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f225, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f226, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f227, [%rd26]; BB0_7: mul.f32 %f108, %f226, %f226; fma.rn.f32 %f109, %f225, %f225, %f108; fma.rn.f32 %f10, %f227, %f227, %f109; setp.eq.f32 %p11, %f10, 0f00000000; mov.u16 %rs33, %rs1; @%p11 bra BB0_9; cvt.s64.s32 %rd27, %r12; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs33, [%rd28]; BB0_9: setp.gt.u16 %p12, %rs33, %rs1; cvt.u32.u16 %r53, %rs33; and.b32 %r54, %r53, 255; selp.b32 %r55, %r7, %r54, %p12; selp.b32 %r56, %r54, %r7, %p12; add.s32 %r57, %r56, 1; mul.lo.s32 %r58, %r57, %r56; shr.u32 %r59, %r58, 1; add.s32 %r60, %r59, %r55; mul.wide.s32 %rd29, %r60, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f12, [%rd31]; setp.ne.s16 %p13, %rs14, 0; mov.f32 %f237, 0f00000000; and.pred %p15, %p11, %p13; mov.f32 %f238, %f237; mov.f32 %f239, %f237; @%p15 bra BB0_13; setp.neu.f32 %p16, %f10, 0f00000000; @%p16 bra BB0_12; mul.f32 %f113, %f12, 0f3F000000; div.rn.f32 %f114, %f113, %f11; mul.f32 %f115, %f114, %f99; fma.rn.f32 %f225, %f3, %f115, %f1; mul.f32 %f116, %f1, %f115; sub.f32 %f227, %f3, %f116; mov.f32 %f226, %f2; BB0_12: mul.f32 %f117, %f99, %f99; add.f32 %f118, %f11, %f11; div.rn.f32 %f119, %f118, %f117; sub.f32 %f120, %f225, %f1; sub.f32 %f121, %f226, %f2; sub.f32 %f122, %f227, %f3; fma.rn.f32 %f123, %f120, %f119, 0f00000000; fma.rn.f32 %f238, %f121, %f119, 0f00000000; fma.rn.f32 %f124, %f119, %f122, 0f00000000; div.rn.f32 %f125, %f12, %f99; mul.f32 %f126, %f227, %f125; sub.f32 %f237, %f123, %f126; fma.rn.f32 %f239, %f225, %f125, %f124; BB0_13: add.s32 %r13, %r1, 1; @%p7 bra BB0_15; rem.s32 %r61, 
%r13, %r36; add.s32 %r62, %r61, %r36; rem.s32 %r123, %r62, %r36; bra.uni BB0_16; BB0_15: add.s32 %r63, %r36, -1; min.s32 %r123, %r13, %r63; BB0_16: add.s32 %r17, %r123, %r5; setp.ge.s32 %p18, %r13, %r36; mov.f32 %f231, 0f00000000; and.pred %p20, %p18, %p7; mov.f32 %f232, %f231; mov.f32 %f233, %f231; @%p20 bra BB0_18; mul.wide.s32 %rd32, %r17, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f231, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f232, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f233, [%rd35]; BB0_18: mul.f32 %f130, %f232, %f232; fma.rn.f32 %f131, %f231, %f231, %f130; fma.rn.f32 %f30, %f233, %f233, %f131; setp.eq.f32 %p21, %f30, 0f00000000; mov.u16 %rs34, %rs1; @%p21 bra BB0_20; cvt.s64.s32 %rd36, %r17; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs34, [%rd37]; BB0_20: setp.gt.u16 %p22, %rs34, %rs1; cvt.u32.u16 %r64, %rs34; and.b32 %r65, %r64, 255; selp.b32 %r66, %r7, %r65, %p22; selp.b32 %r67, %r65, %r7, %p22; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; add.s32 %r71, %r70, %r66; mul.wide.s32 %rd38, %r71, 4; add.s64 %rd39, %rd2, %rd38; ld.global.nc.f32 %f31, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f32, [%rd40]; and.pred %p25, %p21, %p13; @%p25 bra BB0_24; setp.neu.f32 %p26, %f30, 0f00000000; @%p26 bra BB0_23; mul.f32 %f132, %f32, 0f3F000000; div.rn.f32 %f133, %f132, %f31; mul.f32 %f134, %f133, %f99; mul.f32 %f135, %f3, %f134; sub.f32 %f231, %f1, %f135; fma.rn.f32 %f233, %f1, %f134, %f3; mov.f32 %f232, %f2; BB0_23: mul.f32 %f136, %f99, %f99; add.f32 %f137, %f31, %f31; div.rn.f32 %f138, %f137, %f136; sub.f32 %f139, %f231, %f1; sub.f32 %f140, %f232, %f2; sub.f32 %f141, %f233, %f3; fma.rn.f32 %f142, %f139, %f138, %f237; fma.rn.f32 %f238, %f140, %f138, %f238; fma.rn.f32 %f143, %f138, %f141, %f239; div.rn.f32 %f144, %f32, %f99; fma.rn.f32 %f237, %f233, %f144, %f142; mul.f32 %f145, %f231, %f144; sub.f32 %f239, %f143, %f145; BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p27, %rs7, 0; add.s32 
%r18, %r2, -1; @%p27 bra BB0_26; rem.s32 %r72, %r18, %r37; add.s32 %r73, %r72, %r37; rem.s32 %r124, %r73, %r37; bra.uni BB0_27; BB0_26: mov.u32 %r74, 0; max.s32 %r124, %r18, %r74; BB0_27: add.s32 %r75, %r124, %r4; mad.lo.s32 %r22, %r75, %r36, %r1; setp.lt.s32 %p29, %r18, 0; mov.f32 %f240, 0f00000000; and.pred %p30, %p29, %p27; mov.f32 %f241, %f240; mov.f32 %f242, %f240; @%p30 bra BB0_29; mul.wide.s32 %rd41, %r22, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f240, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f241, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f242, [%rd44]; BB0_29: mul.f32 %f149, %f241, %f241; fma.rn.f32 %f150, %f240, %f240, %f149; fma.rn.f32 %f50, %f242, %f242, %f150; setp.eq.f32 %p31, %f50, 0f00000000; mov.u16 %rs35, %rs1; @%p31 bra BB0_31; cvt.s64.s32 %rd45, %r22; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs35, [%rd46]; BB0_31: setp.gt.u16 %p32, %rs35, %rs1; cvt.u32.u16 %r76, %rs35; and.b32 %r77, %r76, 255; selp.b32 %r78, %r7, %r77, %p32; selp.b32 %r79, %r77, %r7, %p32; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd47, %r83, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f51, [%rd48]; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f52, [%rd49]; and.pred %p35, %p31, %p13; @%p35 bra BB0_35; setp.neu.f32 %p36, %f50, 0f00000000; @%p36 bra BB0_34; mul.f32 %f151, %f52, 0f3F000000; div.rn.f32 %f152, %f151, %f51; mul.f32 %f153, %f152, %f100; fma.rn.f32 %f241, %f3, %f153, %f2; mul.f32 %f154, %f2, %f153; sub.f32 %f242, %f3, %f154; mov.f32 %f240, %f1; BB0_34: mul.f32 %f155, %f100, %f100; add.f32 %f156, %f51, %f51; div.rn.f32 %f157, %f156, %f155; sub.f32 %f158, %f240, %f1; sub.f32 %f159, %f241, %f2; sub.f32 %f160, %f242, %f3; fma.rn.f32 %f237, %f158, %f157, %f237; fma.rn.f32 %f161, %f159, %f157, %f238; fma.rn.f32 %f162, %f157, %f160, %f239; div.rn.f32 %f163, %f52, %f100; mul.f32 %f164, %f242, %f163; sub.f32 %f238, %f161, %f164; fma.rn.f32 %f239, %f241, %f163, 
%f162; BB0_35: add.s32 %r23, %r2, 1; @%p27 bra BB0_37; rem.s32 %r84, %r23, %r37; add.s32 %r85, %r84, %r37; rem.s32 %r125, %r85, %r37; bra.uni BB0_38; BB0_37: add.s32 %r86, %r37, -1; min.s32 %r125, %r23, %r86; BB0_38: add.s32 %r87, %r125, %r4; mad.lo.s32 %r27, %r87, %r36, %r1; setp.ge.s32 %p38, %r23, %r37; mov.f32 %f249, 0f00000000; and.pred %p40, %p38, %p27; mov.f32 %f250, %f249; mov.f32 %f251, %f249; @%p40 bra BB0_40; mul.wide.s32 %rd50, %r27, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f249, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f250, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f251, [%rd53]; BB0_40: mul.f32 %f168, %f250, %f250; fma.rn.f32 %f169, %f249, %f249, %f168; fma.rn.f32 %f70, %f251, %f251, %f169; setp.eq.f32 %p41, %f70, 0f00000000; mov.u16 %rs36, %rs1; @%p41 bra BB0_42; cvt.s64.s32 %rd54, %r27; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs36, [%rd55]; BB0_42: setp.gt.u16 %p42, %rs36, %rs1; cvt.u32.u16 %r88, %rs36; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p42; selp.b32 %r91, %r89, %r7, %p42; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd56, %r95, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f71, [%rd57]; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f72, [%rd58]; and.pred %p45, %p41, %p13; @%p45 bra BB0_46; setp.neu.f32 %p46, %f70, 0f00000000; @%p46 bra BB0_45; mul.f32 %f170, %f72, 0f3F000000; div.rn.f32 %f171, %f170, %f71; mul.f32 %f172, %f171, %f100; mul.f32 %f173, %f3, %f172; sub.f32 %f250, %f2, %f173; fma.rn.f32 %f251, %f2, %f172, %f3; mov.f32 %f249, %f1; BB0_45: mul.f32 %f174, %f100, %f100; add.f32 %f175, %f71, %f71; div.rn.f32 %f176, %f175, %f174; sub.f32 %f177, %f249, %f1; sub.f32 %f178, %f250, %f2; sub.f32 %f179, %f251, %f3; fma.rn.f32 %f237, %f177, %f176, %f237; fma.rn.f32 %f180, %f178, %f176, %f238; fma.rn.f32 %f181, %f176, %f179, %f239; div.rn.f32 %f182, %f72, %f100; fma.rn.f32 %f238, %f251, %f182, %f180; mul.f32 %f183, %f250, 
%f182; sub.f32 %f239, %f181, %f183; BB0_46: setp.eq.s32 %p47, %r38, 1; @%p47 bra BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 %p48, %rs12, 0; add.s32 %r28, %r3, -1; @%p48 bra BB0_49; rem.s32 %r96, %r28, %r38; add.s32 %r97, %r96, %r38; rem.s32 %r126, %r97, %r38; bra.uni BB0_50; BB0_49: mov.u32 %r98, 0; max.s32 %r126, %r28, %r98; BB0_50: mad.lo.s32 %r99, %r126, %r37, %r2; mad.lo.s32 %r100, %r99, %r36, %r1; cvt.s64.s32 %rd59, %r100; mul.wide.s32 %rd60, %r100, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, %rd4, %rd60; ld.global.nc.f32 %f184, [%rd61]; ld.global.nc.f32 %f185, [%rd62]; mul.f32 %f186, %f185, %f185; fma.rn.f32 %f187, %f184, %f184, %f186; ld.global.nc.f32 %f188, [%rd63]; fma.rn.f32 %f189, %f188, %f188, %f187; setp.eq.f32 %p49, %f189, 0f00000000; selp.f32 %f190, %f1, %f184, %p49; selp.f32 %f191, %f2, %f185, %p49; selp.f32 %f192, %f3, %f188, %p49; add.s64 %rd64, %rd3, %rd59; ld.global.nc.u8 %rs26, [%rd64]; setp.gt.u16 %p50, %rs26, %rs1; cvt.u32.u16 %r101, %rs26; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p50; selp.b32 %r104, %r102, %r7, %p50; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd65, %r108, 4; add.s64 %rd66, %rd2, %rd65; ld.global.nc.f32 %f193, [%rd66]; add.f32 %f194, %f193, %f193; mul.f32 %f84, %f101, %f101; div.rn.f32 %f195, %f194, %f84; sub.f32 %f196, %f190, %f1; sub.f32 %f197, %f191, %f2; sub.f32 %f198, %f192, %f3; fma.rn.f32 %f85, %f195, %f196, %f237; fma.rn.f32 %f86, %f195, %f197, %f238; fma.rn.f32 %f87, %f195, %f198, %f239; add.s32 %r32, %r3, 1; @%p48 bra BB0_52; rem.s32 %r109, %r32, %r38; add.s32 %r110, %r109, %r38; rem.s32 %r127, %r110, %r38; bra.uni BB0_53; BB0_52: add.s32 %r111, %r38, -1; min.s32 %r127, %r32, %r111; BB0_53: mad.lo.s32 %r112, %r127, %r37, %r2; mad.lo.s32 %r113, %r112, %r36, %r1; cvt.s64.s32 %rd67, %r113; mul.wide.s32 %rd68, %r113, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, 
%rd4, %rd68; ld.global.nc.f32 %f199, [%rd69]; ld.global.nc.f32 %f200, [%rd70]; mul.f32 %f201, %f200, %f200; fma.rn.f32 %f202, %f199, %f199, %f201; ld.global.nc.f32 %f203, [%rd71]; fma.rn.f32 %f204, %f203, %f203, %f202; setp.eq.f32 %p52, %f204, 0f00000000; selp.f32 %f205, %f3, %f203, %p52; selp.f32 %f206, %f2, %f200, %p52; selp.f32 %f207, %f1, %f199, %p52; add.s64 %rd72, %rd3, %rd67; ld.global.nc.u8 %rs30, [%rd72]; setp.gt.u16 %p53, %rs30, %rs1; cvt.u32.u16 %r114, %rs30; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p53; selp.b32 %r117, %r115, %r7, %p53; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd73, %r121, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f208, [%rd74]; add.f32 %f209, %f208, %f208; div.rn.f32 %f210, %f209, %f84; sub.f32 %f211, %f207, %f1; sub.f32 %f212, %f206, %f2; sub.f32 %f213, %f205, %f3; fma.rn.f32 %f237, %f210, %f211, %f85; fma.rn.f32 %f238, %f210, %f212, %f86; fma.rn.f32 %f239, %f210, %f213, %f87; BB0_54: setp.eq.s64 %p54, %rd10, 0; @%p54 bra BB0_56; cvta.to.global.u64 %rd75, %rd10; add.s64 %rd77, %rd75, %rd17; ld.global.nc.f32 %f214, [%rd77]; mul.f32 %f261, %f214, %f261; BB0_56: setp.eq.f32 %p55, %f261, 0f00000000; mov.f32 %f262, 0f00000000; @%p55 bra BB0_58; rcp.rn.f32 %f262, %f261; BB0_58: cvta.to.global.u64 %rd78, %rd9; cvta.to.global.u64 %rd79, %rd8; cvta.to.global.u64 %rd80, %rd7; add.s64 %rd82, %rd80, %rd17; ld.global.f32 %f216, [%rd82]; fma.rn.f32 %f217, %f237, %f262, %f216; st.global.f32 [%rd82], %f217; add.s64 %rd83, %rd79, %rd17; ld.global.f32 %f218, [%rd83]; fma.rn.f32 %f219, %f238, %f262, %f218; st.global.f32 [%rd83], %f219; add.s64 %rd84, %rd78, %rd17; ld.global.f32 %f220, [%rd84]; fma.rn.f32 %f221, %f239, %f262, %f220; st.global.f32 [%rd84], %f221; BB0_59: ret; } ` adddmi_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 
adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<56>; .reg .b16 %rs<37>; .reg .f32 %f<263>; .reg .b32 %r<128>; .reg .b64 %rd<85>; ld.param.u64 %rd7, [adddmi_param_0]; ld.param.u64 %rd8, [adddmi_param_1]; ld.param.u64 %rd9, [adddmi_param_2]; ld.param.u64 %rd11, [adddmi_param_3]; ld.param.u64 %rd12, [adddmi_param_4]; ld.param.u64 %rd13, [adddmi_param_5]; ld.param.u64 %rd10, [adddmi_param_6]; ld.param.f32 %f261, [adddmi_param_7]; ld.param.u64 %rd14, [adddmi_param_8]; ld.param.u64 %rd15, [adddmi_param_9]; ld.param.u64 %rd16, [adddmi_param_10]; ld.param.f32 %f99, [adddmi_param_11]; ld.param.f32 %f100, [adddmi_param_12]; ld.param.f32 %f101, [adddmi_param_13]; ld.param.u32 %r36, [adddmi_param_14]; ld.param.u32 %r37, [adddmi_param_15]; ld.param.u32 %r38, [adddmi_param_16]; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 %rs13, [adddmi_param_17]; cvta.to.global.u64 %rd1, %rd15; cvta.to.global.u64 %rd2, %rd14; cvta.to.global.u64 %rd3, %rd16; cvta.to.global.u64 %rd4, %rd13; cvta.to.global.u64 %rd5, %rd12; cvta.to.global.u64 %rd6, %rd11; mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, %r39, %r40, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, %tid.y; mad.lo.s32 %r2, %r42, %r43, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r45, %r46, %r47; setp.ge.s32 %p1, %r2, %r37; setp.ge.s32 %p2, %r1, %r36; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra BB0_59; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; 
mul.lo.s32 %r5, %r48, %r36; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd17, %r6, 4; add.s64 %rd18, %rd6, %rd17; cvt.s64.s32 %rd19, %r6; add.s64 %rd20, %rd5, %rd17; add.s64 %rd21, %rd4, %rd17; add.s64 %rd22, %rd3, %rd19; ld.global.nc.u8 %rs1, [%rd22]; cvt.u32.u16 %r49, %rs1; and.b32 %r7, %r49, 255; ld.global.nc.f32 %f1, [%rd18]; ld.global.nc.f32 %f2, [%rd20]; mul.f32 %f102, %f2, %f2; fma.rn.f32 %f103, %f1, %f1, %f102; ld.global.nc.f32 %f3, [%rd21]; fma.rn.f32 %f104, %f3, %f3, %f103; setp.eq.f32 %p6, %f104, 0f00000000; @%p6 bra BB0_59; and.b16 %rs2, %rs13, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r50, %r8, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r122, %r51, %r36; bra.uni BB0_5; BB0_4: mov.u32 %r52, 0; max.s32 %r122, %r8, %r52; BB0_5: add.s32 %r12, %r122, %r5; setp.lt.s32 %p9, %r8, 0; mov.f32 %f225, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f226, %f225; mov.f32 %f227, %f225; @%p10 bra BB0_7; mul.wide.s32 %rd23, %r12, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f225, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f226, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f227, [%rd26]; BB0_7: mul.f32 %f108, %f226, %f226; fma.rn.f32 %f109, %f225, %f225, %f108; fma.rn.f32 %f10, %f227, %f227, %f109; setp.eq.f32 %p11, %f10, 0f00000000; mov.u16 %rs33, %rs1; @%p11 bra BB0_9; cvt.s64.s32 %rd27, %r12; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs33, [%rd28]; BB0_9: setp.gt.u16 %p12, %rs33, %rs1; cvt.u32.u16 %r53, %rs33; and.b32 %r54, %r53, 255; selp.b32 %r55, %r7, %r54, %p12; selp.b32 %r56, %r54, %r7, %p12; add.s32 %r57, %r56, 1; mul.lo.s32 %r58, %r57, %r56; shr.u32 %r59, %r58, 1; add.s32 %r60, %r59, %r55; mul.wide.s32 %rd29, %r60, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f12, [%rd31]; setp.ne.s16 %p13, %rs14, 0; mov.f32 %f237, 0f00000000; and.pred %p15, %p11, %p13; mov.f32 %f238, %f237; mov.f32 %f239, %f237; @%p15 bra BB0_13; setp.neu.f32 %p16, %f10, 
0f00000000; @%p16 bra BB0_12; mul.f32 %f113, %f12, 0f3F000000; div.rn.f32 %f114, %f113, %f11; mul.f32 %f115, %f114, %f99; fma.rn.f32 %f225, %f3, %f115, %f1; mul.f32 %f116, %f1, %f115; sub.f32 %f227, %f3, %f116; mov.f32 %f226, %f2; BB0_12: mul.f32 %f117, %f99, %f99; add.f32 %f118, %f11, %f11; div.rn.f32 %f119, %f118, %f117; sub.f32 %f120, %f225, %f1; sub.f32 %f121, %f226, %f2; sub.f32 %f122, %f227, %f3; fma.rn.f32 %f123, %f120, %f119, 0f00000000; fma.rn.f32 %f238, %f121, %f119, 0f00000000; fma.rn.f32 %f124, %f119, %f122, 0f00000000; div.rn.f32 %f125, %f12, %f99; mul.f32 %f126, %f227, %f125; sub.f32 %f237, %f123, %f126; fma.rn.f32 %f239, %f225, %f125, %f124; BB0_13: add.s32 %r13, %r1, 1; @%p7 bra BB0_15; rem.s32 %r61, %r13, %r36; add.s32 %r62, %r61, %r36; rem.s32 %r123, %r62, %r36; bra.uni BB0_16; BB0_15: add.s32 %r63, %r36, -1; min.s32 %r123, %r13, %r63; BB0_16: add.s32 %r17, %r123, %r5; setp.ge.s32 %p18, %r13, %r36; mov.f32 %f231, 0f00000000; and.pred %p20, %p18, %p7; mov.f32 %f232, %f231; mov.f32 %f233, %f231; @%p20 bra BB0_18; mul.wide.s32 %rd32, %r17, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f231, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f232, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f233, [%rd35]; BB0_18: mul.f32 %f130, %f232, %f232; fma.rn.f32 %f131, %f231, %f231, %f130; fma.rn.f32 %f30, %f233, %f233, %f131; setp.eq.f32 %p21, %f30, 0f00000000; mov.u16 %rs34, %rs1; @%p21 bra BB0_20; cvt.s64.s32 %rd36, %r17; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs34, [%rd37]; BB0_20: setp.gt.u16 %p22, %rs34, %rs1; cvt.u32.u16 %r64, %rs34; and.b32 %r65, %r64, 255; selp.b32 %r66, %r7, %r65, %p22; selp.b32 %r67, %r65, %r7, %p22; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; add.s32 %r71, %r70, %r66; mul.wide.s32 %rd38, %r71, 4; add.s64 %rd39, %rd2, %rd38; ld.global.nc.f32 %f31, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f32, [%rd40]; and.pred %p25, %p21, %p13; @%p25 bra BB0_24; setp.neu.f32 %p26, 
%f30, 0f00000000; @%p26 bra BB0_23; mul.f32 %f132, %f32, 0f3F000000; div.rn.f32 %f133, %f132, %f31; mul.f32 %f134, %f133, %f99; mul.f32 %f135, %f3, %f134; sub.f32 %f231, %f1, %f135; fma.rn.f32 %f233, %f1, %f134, %f3; mov.f32 %f232, %f2; BB0_23: mul.f32 %f136, %f99, %f99; add.f32 %f137, %f31, %f31; div.rn.f32 %f138, %f137, %f136; sub.f32 %f139, %f231, %f1; sub.f32 %f140, %f232, %f2; sub.f32 %f141, %f233, %f3; fma.rn.f32 %f142, %f139, %f138, %f237; fma.rn.f32 %f238, %f140, %f138, %f238; fma.rn.f32 %f143, %f138, %f141, %f239; div.rn.f32 %f144, %f32, %f99; fma.rn.f32 %f237, %f233, %f144, %f142; mul.f32 %f145, %f231, %f144; sub.f32 %f239, %f143, %f145; BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p27, %rs7, 0; add.s32 %r18, %r2, -1; @%p27 bra BB0_26; rem.s32 %r72, %r18, %r37; add.s32 %r73, %r72, %r37; rem.s32 %r124, %r73, %r37; bra.uni BB0_27; BB0_26: mov.u32 %r74, 0; max.s32 %r124, %r18, %r74; BB0_27: add.s32 %r75, %r124, %r4; mad.lo.s32 %r22, %r75, %r36, %r1; setp.lt.s32 %p29, %r18, 0; mov.f32 %f240, 0f00000000; and.pred %p30, %p29, %p27; mov.f32 %f241, %f240; mov.f32 %f242, %f240; @%p30 bra BB0_29; mul.wide.s32 %rd41, %r22, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f240, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f241, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f242, [%rd44]; BB0_29: mul.f32 %f149, %f241, %f241; fma.rn.f32 %f150, %f240, %f240, %f149; fma.rn.f32 %f50, %f242, %f242, %f150; setp.eq.f32 %p31, %f50, 0f00000000; mov.u16 %rs35, %rs1; @%p31 bra BB0_31; cvt.s64.s32 %rd45, %r22; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs35, [%rd46]; BB0_31: setp.gt.u16 %p32, %rs35, %rs1; cvt.u32.u16 %r76, %rs35; and.b32 %r77, %r76, 255; selp.b32 %r78, %r7, %r77, %p32; selp.b32 %r79, %r77, %r7, %p32; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd47, %r83, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f51, [%rd48]; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f52, 
[%rd49]; and.pred %p35, %p31, %p13; @%p35 bra BB0_35; setp.neu.f32 %p36, %f50, 0f00000000; @%p36 bra BB0_34; mul.f32 %f151, %f52, 0f3F000000; div.rn.f32 %f152, %f151, %f51; mul.f32 %f153, %f152, %f100; fma.rn.f32 %f241, %f3, %f153, %f2; mul.f32 %f154, %f2, %f153; sub.f32 %f242, %f3, %f154; mov.f32 %f240, %f1; BB0_34: mul.f32 %f155, %f100, %f100; add.f32 %f156, %f51, %f51; div.rn.f32 %f157, %f156, %f155; sub.f32 %f158, %f240, %f1; sub.f32 %f159, %f241, %f2; sub.f32 %f160, %f242, %f3; fma.rn.f32 %f237, %f158, %f157, %f237; fma.rn.f32 %f161, %f159, %f157, %f238; fma.rn.f32 %f162, %f157, %f160, %f239; div.rn.f32 %f163, %f52, %f100; mul.f32 %f164, %f242, %f163; sub.f32 %f238, %f161, %f164; fma.rn.f32 %f239, %f241, %f163, %f162; BB0_35: add.s32 %r23, %r2, 1; @%p27 bra BB0_37; rem.s32 %r84, %r23, %r37; add.s32 %r85, %r84, %r37; rem.s32 %r125, %r85, %r37; bra.uni BB0_38; BB0_37: add.s32 %r86, %r37, -1; min.s32 %r125, %r23, %r86; BB0_38: add.s32 %r87, %r125, %r4; mad.lo.s32 %r27, %r87, %r36, %r1; setp.ge.s32 %p38, %r23, %r37; mov.f32 %f249, 0f00000000; and.pred %p40, %p38, %p27; mov.f32 %f250, %f249; mov.f32 %f251, %f249; @%p40 bra BB0_40; mul.wide.s32 %rd50, %r27, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f249, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f250, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f251, [%rd53]; BB0_40: mul.f32 %f168, %f250, %f250; fma.rn.f32 %f169, %f249, %f249, %f168; fma.rn.f32 %f70, %f251, %f251, %f169; setp.eq.f32 %p41, %f70, 0f00000000; mov.u16 %rs36, %rs1; @%p41 bra BB0_42; cvt.s64.s32 %rd54, %r27; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs36, [%rd55]; BB0_42: setp.gt.u16 %p42, %rs36, %rs1; cvt.u32.u16 %r88, %rs36; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p42; selp.b32 %r91, %r89, %r7, %p42; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd56, %r95, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f71, [%rd57]; add.s64 %rd58, %rd1, 
%rd56; ld.global.nc.f32 %f72, [%rd58]; and.pred %p45, %p41, %p13; @%p45 bra BB0_46; setp.neu.f32 %p46, %f70, 0f00000000; @%p46 bra BB0_45; mul.f32 %f170, %f72, 0f3F000000; div.rn.f32 %f171, %f170, %f71; mul.f32 %f172, %f171, %f100; mul.f32 %f173, %f3, %f172; sub.f32 %f250, %f2, %f173; fma.rn.f32 %f251, %f2, %f172, %f3; mov.f32 %f249, %f1; BB0_45: mul.f32 %f174, %f100, %f100; add.f32 %f175, %f71, %f71; div.rn.f32 %f176, %f175, %f174; sub.f32 %f177, %f249, %f1; sub.f32 %f178, %f250, %f2; sub.f32 %f179, %f251, %f3; fma.rn.f32 %f237, %f177, %f176, %f237; fma.rn.f32 %f180, %f178, %f176, %f238; fma.rn.f32 %f181, %f176, %f179, %f239; div.rn.f32 %f182, %f72, %f100; fma.rn.f32 %f238, %f251, %f182, %f180; mul.f32 %f183, %f250, %f182; sub.f32 %f239, %f181, %f183; BB0_46: setp.eq.s32 %p47, %r38, 1; @%p47 bra BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 %p48, %rs12, 0; add.s32 %r28, %r3, -1; @%p48 bra BB0_49; rem.s32 %r96, %r28, %r38; add.s32 %r97, %r96, %r38; rem.s32 %r126, %r97, %r38; bra.uni BB0_50; BB0_49: mov.u32 %r98, 0; max.s32 %r126, %r28, %r98; BB0_50: mad.lo.s32 %r99, %r126, %r37, %r2; mad.lo.s32 %r100, %r99, %r36, %r1; cvt.s64.s32 %rd59, %r100; mul.wide.s32 %rd60, %r100, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, %rd4, %rd60; ld.global.nc.f32 %f184, [%rd61]; ld.global.nc.f32 %f185, [%rd62]; mul.f32 %f186, %f185, %f185; fma.rn.f32 %f187, %f184, %f184, %f186; ld.global.nc.f32 %f188, [%rd63]; fma.rn.f32 %f189, %f188, %f188, %f187; setp.eq.f32 %p49, %f189, 0f00000000; selp.f32 %f190, %f1, %f184, %p49; selp.f32 %f191, %f2, %f185, %p49; selp.f32 %f192, %f3, %f188, %p49; add.s64 %rd64, %rd3, %rd59; ld.global.nc.u8 %rs26, [%rd64]; setp.gt.u16 %p50, %rs26, %rs1; cvt.u32.u16 %r101, %rs26; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p50; selp.b32 %r104, %r102, %r7, %p50; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd65, %r108, 4; add.s64 %rd66, %rd2, 
%rd65; ld.global.nc.f32 %f193, [%rd66]; add.f32 %f194, %f193, %f193; mul.f32 %f84, %f101, %f101; div.rn.f32 %f195, %f194, %f84; sub.f32 %f196, %f190, %f1; sub.f32 %f197, %f191, %f2; sub.f32 %f198, %f192, %f3; fma.rn.f32 %f85, %f195, %f196, %f237; fma.rn.f32 %f86, %f195, %f197, %f238; fma.rn.f32 %f87, %f195, %f198, %f239; add.s32 %r32, %r3, 1; @%p48 bra BB0_52; rem.s32 %r109, %r32, %r38; add.s32 %r110, %r109, %r38; rem.s32 %r127, %r110, %r38; bra.uni BB0_53; BB0_52: add.s32 %r111, %r38, -1; min.s32 %r127, %r32, %r111; BB0_53: mad.lo.s32 %r112, %r127, %r37, %r2; mad.lo.s32 %r113, %r112, %r36, %r1; cvt.s64.s32 %rd67, %r113; mul.wide.s32 %rd68, %r113, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f199, [%rd69]; ld.global.nc.f32 %f200, [%rd70]; mul.f32 %f201, %f200, %f200; fma.rn.f32 %f202, %f199, %f199, %f201; ld.global.nc.f32 %f203, [%rd71]; fma.rn.f32 %f204, %f203, %f203, %f202; setp.eq.f32 %p52, %f204, 0f00000000; selp.f32 %f205, %f3, %f203, %p52; selp.f32 %f206, %f2, %f200, %p52; selp.f32 %f207, %f1, %f199, %p52; add.s64 %rd72, %rd3, %rd67; ld.global.nc.u8 %rs30, [%rd72]; setp.gt.u16 %p53, %rs30, %rs1; cvt.u32.u16 %r114, %rs30; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p53; selp.b32 %r117, %r115, %r7, %p53; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd73, %r121, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f208, [%rd74]; add.f32 %f209, %f208, %f208; div.rn.f32 %f210, %f209, %f84; sub.f32 %f211, %f207, %f1; sub.f32 %f212, %f206, %f2; sub.f32 %f213, %f205, %f3; fma.rn.f32 %f237, %f210, %f211, %f85; fma.rn.f32 %f238, %f210, %f212, %f86; fma.rn.f32 %f239, %f210, %f213, %f87; BB0_54: setp.eq.s64 %p54, %rd10, 0; @%p54 bra BB0_56; cvta.to.global.u64 %rd75, %rd10; add.s64 %rd77, %rd75, %rd17; ld.global.nc.f32 %f214, [%rd77]; mul.f32 %f261, %f214, %f261; BB0_56: setp.eq.f32 %p55, %f261, 0f00000000; mov.f32 %f262, 
0f00000000; @%p55 bra BB0_58; rcp.rn.f32 %f262, %f261; BB0_58: cvta.to.global.u64 %rd78, %rd9; cvta.to.global.u64 %rd79, %rd8; cvta.to.global.u64 %rd80, %rd7; add.s64 %rd82, %rd80, %rd17; ld.global.f32 %f216, [%rd82]; fma.rn.f32 %f217, %f237, %f262, %f216; st.global.f32 [%rd82], %f217; add.s64 %rd83, %rd79, %rd17; ld.global.f32 %f218, [%rd83]; fma.rn.f32 %f219, %f238, %f262, %f218; st.global.f32 [%rd83], %f219; add.s64 %rd84, %rd78, %rd17; ld.global.f32 %f220, [%rd84]; fma.rn.f32 %f221, %f239, %f262, %f220; st.global.f32 [%rd84], %f221; BB0_59: ret; } ` adddmi_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<56>; .reg .b16 %rs<37>; .reg .f32 %f<263>; .reg .b32 %r<128>; .reg .b64 %rd<85>; ld.param.u64 %rd7, [adddmi_param_0]; ld.param.u64 %rd8, [adddmi_param_1]; ld.param.u64 %rd9, [adddmi_param_2]; ld.param.u64 %rd11, [adddmi_param_3]; ld.param.u64 %rd12, [adddmi_param_4]; ld.param.u64 %rd13, [adddmi_param_5]; ld.param.u64 %rd10, [adddmi_param_6]; ld.param.f32 %f261, [adddmi_param_7]; ld.param.u64 %rd14, [adddmi_param_8]; ld.param.u64 %rd15, [adddmi_param_9]; ld.param.u64 %rd16, [adddmi_param_10]; ld.param.f32 %f99, [adddmi_param_11]; ld.param.f32 %f100, [adddmi_param_12]; ld.param.f32 %f101, [adddmi_param_13]; ld.param.u32 %r36, [adddmi_param_14]; ld.param.u32 %r37, [adddmi_param_15]; ld.param.u32 %r38, [adddmi_param_16]; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 
%rs13, [adddmi_param_17]; cvta.to.global.u64 %rd1, %rd15; cvta.to.global.u64 %rd2, %rd14; cvta.to.global.u64 %rd3, %rd16; cvta.to.global.u64 %rd4, %rd13; cvta.to.global.u64 %rd5, %rd12; cvta.to.global.u64 %rd6, %rd11; mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, %r39, %r40, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, %tid.y; mad.lo.s32 %r2, %r42, %r43, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r45, %r46, %r47; setp.ge.s32 %p1, %r2, %r37; setp.ge.s32 %p2, %r1, %r36; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra BB0_59; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; mul.lo.s32 %r5, %r48, %r36; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd17, %r6, 4; add.s64 %rd18, %rd6, %rd17; cvt.s64.s32 %rd19, %r6; add.s64 %rd20, %rd5, %rd17; add.s64 %rd21, %rd4, %rd17; add.s64 %rd22, %rd3, %rd19; ld.global.nc.u8 %rs1, [%rd22]; cvt.u32.u16 %r49, %rs1; and.b32 %r7, %r49, 255; ld.global.nc.f32 %f1, [%rd18]; ld.global.nc.f32 %f2, [%rd20]; mul.f32 %f102, %f2, %f2; fma.rn.f32 %f103, %f1, %f1, %f102; ld.global.nc.f32 %f3, [%rd21]; fma.rn.f32 %f104, %f3, %f3, %f103; setp.eq.f32 %p6, %f104, 0f00000000; @%p6 bra BB0_59; and.b16 %rs2, %rs13, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r50, %r8, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r122, %r51, %r36; bra.uni BB0_5; BB0_4: mov.u32 %r52, 0; max.s32 %r122, %r8, %r52; BB0_5: add.s32 %r12, %r122, %r5; setp.lt.s32 %p9, %r8, 0; mov.f32 %f225, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f226, %f225; mov.f32 %f227, %f225; @%p10 bra BB0_7; mul.wide.s32 %rd23, %r12, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f225, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f226, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f227, [%rd26]; BB0_7: mul.f32 %f108, %f226, %f226; fma.rn.f32 %f109, %f225, %f225, %f108; fma.rn.f32 %f10, %f227, %f227, %f109; setp.eq.f32 
%p11, %f10, 0f00000000; mov.u16 %rs33, %rs1; @%p11 bra BB0_9; cvt.s64.s32 %rd27, %r12; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs33, [%rd28]; BB0_9: setp.gt.u16 %p12, %rs33, %rs1; cvt.u32.u16 %r53, %rs33; and.b32 %r54, %r53, 255; selp.b32 %r55, %r7, %r54, %p12; selp.b32 %r56, %r54, %r7, %p12; add.s32 %r57, %r56, 1; mul.lo.s32 %r58, %r57, %r56; shr.u32 %r59, %r58, 1; add.s32 %r60, %r59, %r55; mul.wide.s32 %rd29, %r60, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f12, [%rd31]; setp.ne.s16 %p13, %rs14, 0; mov.f32 %f237, 0f00000000; and.pred %p15, %p11, %p13; mov.f32 %f238, %f237; mov.f32 %f239, %f237; @%p15 bra BB0_13; setp.neu.f32 %p16, %f10, 0f00000000; @%p16 bra BB0_12; mul.f32 %f113, %f12, 0f3F000000; div.rn.f32 %f114, %f113, %f11; mul.f32 %f115, %f114, %f99; fma.rn.f32 %f225, %f3, %f115, %f1; mul.f32 %f116, %f1, %f115; sub.f32 %f227, %f3, %f116; mov.f32 %f226, %f2; BB0_12: mul.f32 %f117, %f99, %f99; add.f32 %f118, %f11, %f11; div.rn.f32 %f119, %f118, %f117; sub.f32 %f120, %f225, %f1; sub.f32 %f121, %f226, %f2; sub.f32 %f122, %f227, %f3; fma.rn.f32 %f123, %f120, %f119, 0f00000000; fma.rn.f32 %f238, %f121, %f119, 0f00000000; fma.rn.f32 %f124, %f119, %f122, 0f00000000; div.rn.f32 %f125, %f12, %f99; mul.f32 %f126, %f227, %f125; sub.f32 %f237, %f123, %f126; fma.rn.f32 %f239, %f225, %f125, %f124; BB0_13: add.s32 %r13, %r1, 1; @%p7 bra BB0_15; rem.s32 %r61, %r13, %r36; add.s32 %r62, %r61, %r36; rem.s32 %r123, %r62, %r36; bra.uni BB0_16; BB0_15: add.s32 %r63, %r36, -1; min.s32 %r123, %r13, %r63; BB0_16: add.s32 %r17, %r123, %r5; setp.ge.s32 %p18, %r13, %r36; mov.f32 %f231, 0f00000000; and.pred %p20, %p18, %p7; mov.f32 %f232, %f231; mov.f32 %f233, %f231; @%p20 bra BB0_18; mul.wide.s32 %rd32, %r17, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f231, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f232, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f233, [%rd35]; BB0_18: mul.f32 %f130, 
%f232, %f232; fma.rn.f32 %f131, %f231, %f231, %f130; fma.rn.f32 %f30, %f233, %f233, %f131; setp.eq.f32 %p21, %f30, 0f00000000; mov.u16 %rs34, %rs1; @%p21 bra BB0_20; cvt.s64.s32 %rd36, %r17; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs34, [%rd37]; BB0_20: setp.gt.u16 %p22, %rs34, %rs1; cvt.u32.u16 %r64, %rs34; and.b32 %r65, %r64, 255; selp.b32 %r66, %r7, %r65, %p22; selp.b32 %r67, %r65, %r7, %p22; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; add.s32 %r71, %r70, %r66; mul.wide.s32 %rd38, %r71, 4; add.s64 %rd39, %rd2, %rd38; ld.global.nc.f32 %f31, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f32, [%rd40]; and.pred %p25, %p21, %p13; @%p25 bra BB0_24; setp.neu.f32 %p26, %f30, 0f00000000; @%p26 bra BB0_23; mul.f32 %f132, %f32, 0f3F000000; div.rn.f32 %f133, %f132, %f31; mul.f32 %f134, %f133, %f99; mul.f32 %f135, %f3, %f134; sub.f32 %f231, %f1, %f135; fma.rn.f32 %f233, %f1, %f134, %f3; mov.f32 %f232, %f2; BB0_23: mul.f32 %f136, %f99, %f99; add.f32 %f137, %f31, %f31; div.rn.f32 %f138, %f137, %f136; sub.f32 %f139, %f231, %f1; sub.f32 %f140, %f232, %f2; sub.f32 %f141, %f233, %f3; fma.rn.f32 %f142, %f139, %f138, %f237; fma.rn.f32 %f238, %f140, %f138, %f238; fma.rn.f32 %f143, %f138, %f141, %f239; div.rn.f32 %f144, %f32, %f99; fma.rn.f32 %f237, %f233, %f144, %f142; mul.f32 %f145, %f231, %f144; sub.f32 %f239, %f143, %f145; BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p27, %rs7, 0; add.s32 %r18, %r2, -1; @%p27 bra BB0_26; rem.s32 %r72, %r18, %r37; add.s32 %r73, %r72, %r37; rem.s32 %r124, %r73, %r37; bra.uni BB0_27; BB0_26: mov.u32 %r74, 0; max.s32 %r124, %r18, %r74; BB0_27: add.s32 %r75, %r124, %r4; mad.lo.s32 %r22, %r75, %r36, %r1; setp.lt.s32 %p29, %r18, 0; mov.f32 %f240, 0f00000000; and.pred %p30, %p29, %p27; mov.f32 %f241, %f240; mov.f32 %f242, %f240; @%p30 bra BB0_29; mul.wide.s32 %rd41, %r22, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f240, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f241, [%rd43]; add.s64 %rd44, 
%rd4, %rd41; ld.global.nc.f32 %f242, [%rd44]; BB0_29: mul.f32 %f149, %f241, %f241; fma.rn.f32 %f150, %f240, %f240, %f149; fma.rn.f32 %f50, %f242, %f242, %f150; setp.eq.f32 %p31, %f50, 0f00000000; mov.u16 %rs35, %rs1; @%p31 bra BB0_31; cvt.s64.s32 %rd45, %r22; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs35, [%rd46]; BB0_31: setp.gt.u16 %p32, %rs35, %rs1; cvt.u32.u16 %r76, %rs35; and.b32 %r77, %r76, 255; selp.b32 %r78, %r7, %r77, %p32; selp.b32 %r79, %r77, %r7, %p32; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd47, %r83, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f51, [%rd48]; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f52, [%rd49]; and.pred %p35, %p31, %p13; @%p35 bra BB0_35; setp.neu.f32 %p36, %f50, 0f00000000; @%p36 bra BB0_34; mul.f32 %f151, %f52, 0f3F000000; div.rn.f32 %f152, %f151, %f51; mul.f32 %f153, %f152, %f100; fma.rn.f32 %f241, %f3, %f153, %f2; mul.f32 %f154, %f2, %f153; sub.f32 %f242, %f3, %f154; mov.f32 %f240, %f1; BB0_34: mul.f32 %f155, %f100, %f100; add.f32 %f156, %f51, %f51; div.rn.f32 %f157, %f156, %f155; sub.f32 %f158, %f240, %f1; sub.f32 %f159, %f241, %f2; sub.f32 %f160, %f242, %f3; fma.rn.f32 %f237, %f158, %f157, %f237; fma.rn.f32 %f161, %f159, %f157, %f238; fma.rn.f32 %f162, %f157, %f160, %f239; div.rn.f32 %f163, %f52, %f100; mul.f32 %f164, %f242, %f163; sub.f32 %f238, %f161, %f164; fma.rn.f32 %f239, %f241, %f163, %f162; BB0_35: add.s32 %r23, %r2, 1; @%p27 bra BB0_37; rem.s32 %r84, %r23, %r37; add.s32 %r85, %r84, %r37; rem.s32 %r125, %r85, %r37; bra.uni BB0_38; BB0_37: add.s32 %r86, %r37, -1; min.s32 %r125, %r23, %r86; BB0_38: add.s32 %r87, %r125, %r4; mad.lo.s32 %r27, %r87, %r36, %r1; setp.ge.s32 %p38, %r23, %r37; mov.f32 %f249, 0f00000000; and.pred %p40, %p38, %p27; mov.f32 %f250, %f249; mov.f32 %f251, %f249; @%p40 bra BB0_40; mul.wide.s32 %rd50, %r27, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f249, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 
%f250, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f251, [%rd53]; BB0_40: mul.f32 %f168, %f250, %f250; fma.rn.f32 %f169, %f249, %f249, %f168; fma.rn.f32 %f70, %f251, %f251, %f169; setp.eq.f32 %p41, %f70, 0f00000000; mov.u16 %rs36, %rs1; @%p41 bra BB0_42; cvt.s64.s32 %rd54, %r27; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs36, [%rd55]; BB0_42: setp.gt.u16 %p42, %rs36, %rs1; cvt.u32.u16 %r88, %rs36; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p42; selp.b32 %r91, %r89, %r7, %p42; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd56, %r95, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f71, [%rd57]; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f72, [%rd58]; and.pred %p45, %p41, %p13; @%p45 bra BB0_46; setp.neu.f32 %p46, %f70, 0f00000000; @%p46 bra BB0_45; mul.f32 %f170, %f72, 0f3F000000; div.rn.f32 %f171, %f170, %f71; mul.f32 %f172, %f171, %f100; mul.f32 %f173, %f3, %f172; sub.f32 %f250, %f2, %f173; fma.rn.f32 %f251, %f2, %f172, %f3; mov.f32 %f249, %f1; BB0_45: mul.f32 %f174, %f100, %f100; add.f32 %f175, %f71, %f71; div.rn.f32 %f176, %f175, %f174; sub.f32 %f177, %f249, %f1; sub.f32 %f178, %f250, %f2; sub.f32 %f179, %f251, %f3; fma.rn.f32 %f237, %f177, %f176, %f237; fma.rn.f32 %f180, %f178, %f176, %f238; fma.rn.f32 %f181, %f176, %f179, %f239; div.rn.f32 %f182, %f72, %f100; fma.rn.f32 %f238, %f251, %f182, %f180; mul.f32 %f183, %f250, %f182; sub.f32 %f239, %f181, %f183; BB0_46: setp.eq.s32 %p47, %r38, 1; @%p47 bra BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 %p48, %rs12, 0; add.s32 %r28, %r3, -1; @%p48 bra BB0_49; rem.s32 %r96, %r28, %r38; add.s32 %r97, %r96, %r38; rem.s32 %r126, %r97, %r38; bra.uni BB0_50; BB0_49: mov.u32 %r98, 0; max.s32 %r126, %r28, %r98; BB0_50: mad.lo.s32 %r99, %r126, %r37, %r2; mad.lo.s32 %r100, %r99, %r36, %r1; cvt.s64.s32 %rd59, %r100; mul.wide.s32 %rd60, %r100, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, %rd4, %rd60; 
ld.global.nc.f32 %f184, [%rd61]; ld.global.nc.f32 %f185, [%rd62]; mul.f32 %f186, %f185, %f185; fma.rn.f32 %f187, %f184, %f184, %f186; ld.global.nc.f32 %f188, [%rd63]; fma.rn.f32 %f189, %f188, %f188, %f187; setp.eq.f32 %p49, %f189, 0f00000000; selp.f32 %f190, %f1, %f184, %p49; selp.f32 %f191, %f2, %f185, %p49; selp.f32 %f192, %f3, %f188, %p49; add.s64 %rd64, %rd3, %rd59; ld.global.nc.u8 %rs26, [%rd64]; setp.gt.u16 %p50, %rs26, %rs1; cvt.u32.u16 %r101, %rs26; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p50; selp.b32 %r104, %r102, %r7, %p50; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd65, %r108, 4; add.s64 %rd66, %rd2, %rd65; ld.global.nc.f32 %f193, [%rd66]; add.f32 %f194, %f193, %f193; mul.f32 %f84, %f101, %f101; div.rn.f32 %f195, %f194, %f84; sub.f32 %f196, %f190, %f1; sub.f32 %f197, %f191, %f2; sub.f32 %f198, %f192, %f3; fma.rn.f32 %f85, %f195, %f196, %f237; fma.rn.f32 %f86, %f195, %f197, %f238; fma.rn.f32 %f87, %f195, %f198, %f239; add.s32 %r32, %r3, 1; @%p48 bra BB0_52; rem.s32 %r109, %r32, %r38; add.s32 %r110, %r109, %r38; rem.s32 %r127, %r110, %r38; bra.uni BB0_53; BB0_52: add.s32 %r111, %r38, -1; min.s32 %r127, %r32, %r111; BB0_53: mad.lo.s32 %r112, %r127, %r37, %r2; mad.lo.s32 %r113, %r112, %r36, %r1; cvt.s64.s32 %rd67, %r113; mul.wide.s32 %rd68, %r113, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f199, [%rd69]; ld.global.nc.f32 %f200, [%rd70]; mul.f32 %f201, %f200, %f200; fma.rn.f32 %f202, %f199, %f199, %f201; ld.global.nc.f32 %f203, [%rd71]; fma.rn.f32 %f204, %f203, %f203, %f202; setp.eq.f32 %p52, %f204, 0f00000000; selp.f32 %f205, %f3, %f203, %p52; selp.f32 %f206, %f2, %f200, %p52; selp.f32 %f207, %f1, %f199, %p52; add.s64 %rd72, %rd3, %rd67; ld.global.nc.u8 %rs30, [%rd72]; setp.gt.u16 %p53, %rs30, %rs1; cvt.u32.u16 %r114, %rs30; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p53; selp.b32 
%r117, %r115, %r7, %p53; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd73, %r121, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f208, [%rd74]; add.f32 %f209, %f208, %f208; div.rn.f32 %f210, %f209, %f84; sub.f32 %f211, %f207, %f1; sub.f32 %f212, %f206, %f2; sub.f32 %f213, %f205, %f3; fma.rn.f32 %f237, %f210, %f211, %f85; fma.rn.f32 %f238, %f210, %f212, %f86; fma.rn.f32 %f239, %f210, %f213, %f87; BB0_54: setp.eq.s64 %p54, %rd10, 0; @%p54 bra BB0_56; cvta.to.global.u64 %rd75, %rd10; add.s64 %rd77, %rd75, %rd17; ld.global.nc.f32 %f214, [%rd77]; mul.f32 %f261, %f214, %f261; BB0_56: setp.eq.f32 %p55, %f261, 0f00000000; mov.f32 %f262, 0f00000000; @%p55 bra BB0_58; rcp.rn.f32 %f262, %f261; BB0_58: cvta.to.global.u64 %rd78, %rd9; cvta.to.global.u64 %rd79, %rd8; cvta.to.global.u64 %rd80, %rd7; add.s64 %rd82, %rd80, %rd17; ld.global.f32 %f216, [%rd82]; fma.rn.f32 %f217, %f237, %f262, %f216; st.global.f32 [%rd82], %f217; add.s64 %rd83, %rd79, %rd17; ld.global.f32 %f218, [%rd83]; fma.rn.f32 %f219, %f238, %f262, %f218; st.global.f32 [%rd83], %f219; add.s64 %rd84, %rd78, %rd17; ld.global.f32 %f220, [%rd84]; fma.rn.f32 %f221, %f239, %f262, %f220; st.global.f32 [%rd84], %f221; BB0_59: ret; } ` adddmi_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<56>; .reg .b16 %rs<37>; .reg .f32 %f<263>; .reg .b32 
%r<128>; .reg .b64 %rd<85>; ld.param.u64 %rd7, [adddmi_param_0]; ld.param.u64 %rd8, [adddmi_param_1]; ld.param.u64 %rd9, [adddmi_param_2]; ld.param.u64 %rd11, [adddmi_param_3]; ld.param.u64 %rd12, [adddmi_param_4]; ld.param.u64 %rd13, [adddmi_param_5]; ld.param.u64 %rd10, [adddmi_param_6]; ld.param.f32 %f261, [adddmi_param_7]; ld.param.u64 %rd14, [adddmi_param_8]; ld.param.u64 %rd15, [adddmi_param_9]; ld.param.u64 %rd16, [adddmi_param_10]; ld.param.f32 %f99, [adddmi_param_11]; ld.param.f32 %f100, [adddmi_param_12]; ld.param.f32 %f101, [adddmi_param_13]; ld.param.u32 %r36, [adddmi_param_14]; ld.param.u32 %r37, [adddmi_param_15]; ld.param.u32 %r38, [adddmi_param_16]; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 %rs13, [adddmi_param_17]; cvta.to.global.u64 %rd1, %rd15; cvta.to.global.u64 %rd2, %rd14; cvta.to.global.u64 %rd3, %rd16; cvta.to.global.u64 %rd4, %rd13; cvta.to.global.u64 %rd5, %rd12; cvta.to.global.u64 %rd6, %rd11; mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, %r39, %r40, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, %tid.y; mad.lo.s32 %r2, %r42, %r43, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r45, %r46, %r47; setp.ge.s32 %p1, %r2, %r37; setp.ge.s32 %p2, %r1, %r36; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra BB0_59; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; mul.lo.s32 %r5, %r48, %r36; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd17, %r6, 4; add.s64 %rd18, %rd6, %rd17; cvt.s64.s32 %rd19, %r6; add.s64 %rd20, %rd5, %rd17; add.s64 %rd21, %rd4, %rd17; add.s64 %rd22, %rd3, %rd19; ld.global.nc.u8 %rs1, [%rd22]; cvt.u32.u16 %r49, %rs1; and.b32 %r7, %r49, 255; ld.global.nc.f32 %f1, [%rd18]; ld.global.nc.f32 %f2, [%rd20]; mul.f32 %f102, %f2, %f2; fma.rn.f32 %f103, %f1, %f1, %f102; ld.global.nc.f32 %f3, [%rd21]; fma.rn.f32 %f104, %f3, %f3, %f103; setp.eq.f32 %p6, %f104, 0f00000000; @%p6 bra BB0_59; and.b16 %rs2, 
%rs13, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r50, %r8, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r122, %r51, %r36; bra.uni BB0_5; BB0_4: mov.u32 %r52, 0; max.s32 %r122, %r8, %r52; BB0_5: add.s32 %r12, %r122, %r5; setp.lt.s32 %p9, %r8, 0; mov.f32 %f225, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f226, %f225; mov.f32 %f227, %f225; @%p10 bra BB0_7; mul.wide.s32 %rd23, %r12, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f225, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f226, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f227, [%rd26]; BB0_7: mul.f32 %f108, %f226, %f226; fma.rn.f32 %f109, %f225, %f225, %f108; fma.rn.f32 %f10, %f227, %f227, %f109; setp.eq.f32 %p11, %f10, 0f00000000; mov.u16 %rs33, %rs1; @%p11 bra BB0_9; cvt.s64.s32 %rd27, %r12; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs33, [%rd28]; BB0_9: setp.gt.u16 %p12, %rs33, %rs1; cvt.u32.u16 %r53, %rs33; and.b32 %r54, %r53, 255; selp.b32 %r55, %r7, %r54, %p12; selp.b32 %r56, %r54, %r7, %p12; add.s32 %r57, %r56, 1; mul.lo.s32 %r58, %r57, %r56; shr.u32 %r59, %r58, 1; add.s32 %r60, %r59, %r55; mul.wide.s32 %rd29, %r60, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f12, [%rd31]; setp.ne.s16 %p13, %rs14, 0; mov.f32 %f237, 0f00000000; and.pred %p15, %p11, %p13; mov.f32 %f238, %f237; mov.f32 %f239, %f237; @%p15 bra BB0_13; setp.neu.f32 %p16, %f10, 0f00000000; @%p16 bra BB0_12; mul.f32 %f113, %f12, 0f3F000000; div.rn.f32 %f114, %f113, %f11; mul.f32 %f115, %f114, %f99; fma.rn.f32 %f225, %f3, %f115, %f1; mul.f32 %f116, %f1, %f115; sub.f32 %f227, %f3, %f116; mov.f32 %f226, %f2; BB0_12: mul.f32 %f117, %f99, %f99; add.f32 %f118, %f11, %f11; div.rn.f32 %f119, %f118, %f117; sub.f32 %f120, %f225, %f1; sub.f32 %f121, %f226, %f2; sub.f32 %f122, %f227, %f3; fma.rn.f32 %f123, %f120, %f119, 0f00000000; fma.rn.f32 %f238, %f121, %f119, 0f00000000; fma.rn.f32 %f124, %f119, %f122, 0f00000000; div.rn.f32 %f125, 
%f12, %f99; mul.f32 %f126, %f227, %f125; sub.f32 %f237, %f123, %f126; fma.rn.f32 %f239, %f225, %f125, %f124; BB0_13: add.s32 %r13, %r1, 1; @%p7 bra BB0_15; rem.s32 %r61, %r13, %r36; add.s32 %r62, %r61, %r36; rem.s32 %r123, %r62, %r36; bra.uni BB0_16; BB0_15: add.s32 %r63, %r36, -1; min.s32 %r123, %r13, %r63; BB0_16: add.s32 %r17, %r123, %r5; setp.ge.s32 %p18, %r13, %r36; mov.f32 %f231, 0f00000000; and.pred %p20, %p18, %p7; mov.f32 %f232, %f231; mov.f32 %f233, %f231; @%p20 bra BB0_18; mul.wide.s32 %rd32, %r17, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f231, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f232, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f233, [%rd35]; BB0_18: mul.f32 %f130, %f232, %f232; fma.rn.f32 %f131, %f231, %f231, %f130; fma.rn.f32 %f30, %f233, %f233, %f131; setp.eq.f32 %p21, %f30, 0f00000000; mov.u16 %rs34, %rs1; @%p21 bra BB0_20; cvt.s64.s32 %rd36, %r17; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs34, [%rd37]; BB0_20: setp.gt.u16 %p22, %rs34, %rs1; cvt.u32.u16 %r64, %rs34; and.b32 %r65, %r64, 255; selp.b32 %r66, %r7, %r65, %p22; selp.b32 %r67, %r65, %r7, %p22; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; add.s32 %r71, %r70, %r66; mul.wide.s32 %rd38, %r71, 4; add.s64 %rd39, %rd2, %rd38; ld.global.nc.f32 %f31, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f32, [%rd40]; and.pred %p25, %p21, %p13; @%p25 bra BB0_24; setp.neu.f32 %p26, %f30, 0f00000000; @%p26 bra BB0_23; mul.f32 %f132, %f32, 0f3F000000; div.rn.f32 %f133, %f132, %f31; mul.f32 %f134, %f133, %f99; mul.f32 %f135, %f3, %f134; sub.f32 %f231, %f1, %f135; fma.rn.f32 %f233, %f1, %f134, %f3; mov.f32 %f232, %f2; BB0_23: mul.f32 %f136, %f99, %f99; add.f32 %f137, %f31, %f31; div.rn.f32 %f138, %f137, %f136; sub.f32 %f139, %f231, %f1; sub.f32 %f140, %f232, %f2; sub.f32 %f141, %f233, %f3; fma.rn.f32 %f142, %f139, %f138, %f237; fma.rn.f32 %f238, %f140, %f138, %f238; fma.rn.f32 %f143, %f138, %f141, %f239; div.rn.f32 %f144, %f32, 
%f99; fma.rn.f32 %f237, %f233, %f144, %f142; mul.f32 %f145, %f231, %f144; sub.f32 %f239, %f143, %f145; BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p27, %rs7, 0; add.s32 %r18, %r2, -1; @%p27 bra BB0_26; rem.s32 %r72, %r18, %r37; add.s32 %r73, %r72, %r37; rem.s32 %r124, %r73, %r37; bra.uni BB0_27; BB0_26: mov.u32 %r74, 0; max.s32 %r124, %r18, %r74; BB0_27: add.s32 %r75, %r124, %r4; mad.lo.s32 %r22, %r75, %r36, %r1; setp.lt.s32 %p29, %r18, 0; mov.f32 %f240, 0f00000000; and.pred %p30, %p29, %p27; mov.f32 %f241, %f240; mov.f32 %f242, %f240; @%p30 bra BB0_29; mul.wide.s32 %rd41, %r22, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f240, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f241, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f242, [%rd44]; BB0_29: mul.f32 %f149, %f241, %f241; fma.rn.f32 %f150, %f240, %f240, %f149; fma.rn.f32 %f50, %f242, %f242, %f150; setp.eq.f32 %p31, %f50, 0f00000000; mov.u16 %rs35, %rs1; @%p31 bra BB0_31; cvt.s64.s32 %rd45, %r22; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs35, [%rd46]; BB0_31: setp.gt.u16 %p32, %rs35, %rs1; cvt.u32.u16 %r76, %rs35; and.b32 %r77, %r76, 255; selp.b32 %r78, %r7, %r77, %p32; selp.b32 %r79, %r77, %r7, %p32; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd47, %r83, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f51, [%rd48]; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f52, [%rd49]; and.pred %p35, %p31, %p13; @%p35 bra BB0_35; setp.neu.f32 %p36, %f50, 0f00000000; @%p36 bra BB0_34; mul.f32 %f151, %f52, 0f3F000000; div.rn.f32 %f152, %f151, %f51; mul.f32 %f153, %f152, %f100; fma.rn.f32 %f241, %f3, %f153, %f2; mul.f32 %f154, %f2, %f153; sub.f32 %f242, %f3, %f154; mov.f32 %f240, %f1; BB0_34: mul.f32 %f155, %f100, %f100; add.f32 %f156, %f51, %f51; div.rn.f32 %f157, %f156, %f155; sub.f32 %f158, %f240, %f1; sub.f32 %f159, %f241, %f2; sub.f32 %f160, %f242, %f3; fma.rn.f32 %f237, %f158, %f157, %f237; fma.rn.f32 %f161, %f159, %f157, 
%f238; fma.rn.f32 %f162, %f157, %f160, %f239; div.rn.f32 %f163, %f52, %f100; mul.f32 %f164, %f242, %f163; sub.f32 %f238, %f161, %f164; fma.rn.f32 %f239, %f241, %f163, %f162; BB0_35: add.s32 %r23, %r2, 1; @%p27 bra BB0_37; rem.s32 %r84, %r23, %r37; add.s32 %r85, %r84, %r37; rem.s32 %r125, %r85, %r37; bra.uni BB0_38; BB0_37: add.s32 %r86, %r37, -1; min.s32 %r125, %r23, %r86; BB0_38: add.s32 %r87, %r125, %r4; mad.lo.s32 %r27, %r87, %r36, %r1; setp.ge.s32 %p38, %r23, %r37; mov.f32 %f249, 0f00000000; and.pred %p40, %p38, %p27; mov.f32 %f250, %f249; mov.f32 %f251, %f249; @%p40 bra BB0_40; mul.wide.s32 %rd50, %r27, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f249, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f250, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f251, [%rd53]; BB0_40: mul.f32 %f168, %f250, %f250; fma.rn.f32 %f169, %f249, %f249, %f168; fma.rn.f32 %f70, %f251, %f251, %f169; setp.eq.f32 %p41, %f70, 0f00000000; mov.u16 %rs36, %rs1; @%p41 bra BB0_42; cvt.s64.s32 %rd54, %r27; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs36, [%rd55]; BB0_42: setp.gt.u16 %p42, %rs36, %rs1; cvt.u32.u16 %r88, %rs36; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p42; selp.b32 %r91, %r89, %r7, %p42; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd56, %r95, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f71, [%rd57]; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f72, [%rd58]; and.pred %p45, %p41, %p13; @%p45 bra BB0_46; setp.neu.f32 %p46, %f70, 0f00000000; @%p46 bra BB0_45; mul.f32 %f170, %f72, 0f3F000000; div.rn.f32 %f171, %f170, %f71; mul.f32 %f172, %f171, %f100; mul.f32 %f173, %f3, %f172; sub.f32 %f250, %f2, %f173; fma.rn.f32 %f251, %f2, %f172, %f3; mov.f32 %f249, %f1; BB0_45: mul.f32 %f174, %f100, %f100; add.f32 %f175, %f71, %f71; div.rn.f32 %f176, %f175, %f174; sub.f32 %f177, %f249, %f1; sub.f32 %f178, %f250, %f2; sub.f32 %f179, %f251, %f3; fma.rn.f32 %f237, %f177, %f176, %f237; 
fma.rn.f32 %f180, %f178, %f176, %f238; fma.rn.f32 %f181, %f176, %f179, %f239; div.rn.f32 %f182, %f72, %f100; fma.rn.f32 %f238, %f251, %f182, %f180; mul.f32 %f183, %f250, %f182; sub.f32 %f239, %f181, %f183; BB0_46: setp.eq.s32 %p47, %r38, 1; @%p47 bra BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 %p48, %rs12, 0; add.s32 %r28, %r3, -1; @%p48 bra BB0_49; rem.s32 %r96, %r28, %r38; add.s32 %r97, %r96, %r38; rem.s32 %r126, %r97, %r38; bra.uni BB0_50; BB0_49: mov.u32 %r98, 0; max.s32 %r126, %r28, %r98; BB0_50: mad.lo.s32 %r99, %r126, %r37, %r2; mad.lo.s32 %r100, %r99, %r36, %r1; cvt.s64.s32 %rd59, %r100; mul.wide.s32 %rd60, %r100, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, %rd4, %rd60; ld.global.nc.f32 %f184, [%rd61]; ld.global.nc.f32 %f185, [%rd62]; mul.f32 %f186, %f185, %f185; fma.rn.f32 %f187, %f184, %f184, %f186; ld.global.nc.f32 %f188, [%rd63]; fma.rn.f32 %f189, %f188, %f188, %f187; setp.eq.f32 %p49, %f189, 0f00000000; selp.f32 %f190, %f1, %f184, %p49; selp.f32 %f191, %f2, %f185, %p49; selp.f32 %f192, %f3, %f188, %p49; add.s64 %rd64, %rd3, %rd59; ld.global.nc.u8 %rs26, [%rd64]; setp.gt.u16 %p50, %rs26, %rs1; cvt.u32.u16 %r101, %rs26; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p50; selp.b32 %r104, %r102, %r7, %p50; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd65, %r108, 4; add.s64 %rd66, %rd2, %rd65; ld.global.nc.f32 %f193, [%rd66]; add.f32 %f194, %f193, %f193; mul.f32 %f84, %f101, %f101; div.rn.f32 %f195, %f194, %f84; sub.f32 %f196, %f190, %f1; sub.f32 %f197, %f191, %f2; sub.f32 %f198, %f192, %f3; fma.rn.f32 %f85, %f195, %f196, %f237; fma.rn.f32 %f86, %f195, %f197, %f238; fma.rn.f32 %f87, %f195, %f198, %f239; add.s32 %r32, %r3, 1; @%p48 bra BB0_52; rem.s32 %r109, %r32, %r38; add.s32 %r110, %r109, %r38; rem.s32 %r127, %r110, %r38; bra.uni BB0_53; BB0_52: add.s32 %r111, %r38, -1; min.s32 %r127, %r32, %r111; BB0_53: mad.lo.s32 %r112, %r127, %r37, 
%r2; mad.lo.s32 %r113, %r112, %r36, %r1; cvt.s64.s32 %rd67, %r113; mul.wide.s32 %rd68, %r113, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f199, [%rd69]; ld.global.nc.f32 %f200, [%rd70]; mul.f32 %f201, %f200, %f200; fma.rn.f32 %f202, %f199, %f199, %f201; ld.global.nc.f32 %f203, [%rd71]; fma.rn.f32 %f204, %f203, %f203, %f202; setp.eq.f32 %p52, %f204, 0f00000000; selp.f32 %f205, %f3, %f203, %p52; selp.f32 %f206, %f2, %f200, %p52; selp.f32 %f207, %f1, %f199, %p52; add.s64 %rd72, %rd3, %rd67; ld.global.nc.u8 %rs30, [%rd72]; setp.gt.u16 %p53, %rs30, %rs1; cvt.u32.u16 %r114, %rs30; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p53; selp.b32 %r117, %r115, %r7, %p53; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd73, %r121, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f208, [%rd74]; add.f32 %f209, %f208, %f208; div.rn.f32 %f210, %f209, %f84; sub.f32 %f211, %f207, %f1; sub.f32 %f212, %f206, %f2; sub.f32 %f213, %f205, %f3; fma.rn.f32 %f237, %f210, %f211, %f85; fma.rn.f32 %f238, %f210, %f212, %f86; fma.rn.f32 %f239, %f210, %f213, %f87; BB0_54: setp.eq.s64 %p54, %rd10, 0; @%p54 bra BB0_56; cvta.to.global.u64 %rd75, %rd10; add.s64 %rd77, %rd75, %rd17; ld.global.nc.f32 %f214, [%rd77]; mul.f32 %f261, %f214, %f261; BB0_56: setp.eq.f32 %p55, %f261, 0f00000000; mov.f32 %f262, 0f00000000; @%p55 bra BB0_58; rcp.rn.f32 %f262, %f261; BB0_58: cvta.to.global.u64 %rd78, %rd9; cvta.to.global.u64 %rd79, %rd8; cvta.to.global.u64 %rd80, %rd7; add.s64 %rd82, %rd80, %rd17; ld.global.f32 %f216, [%rd82]; fma.rn.f32 %f217, %f237, %f262, %f216; st.global.f32 [%rd82], %f217; add.s64 %rd83, %rd79, %rd17; ld.global.f32 %f218, [%rd83]; fma.rn.f32 %f219, %f238, %f262, %f218; st.global.f32 [%rd83], %f219; add.s64 %rd84, %rd78, %rd17; ld.global.f32 %f220, [%rd84]; fma.rn.f32 %f221, %f239, %f262, %f220; st.global.f32 [%rd84], %f221; BB0_59: ret; } ` 
adddmi_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<56>; .reg .b16 %rs<37>; .reg .f32 %f<263>; .reg .b32 %r<128>; .reg .b64 %rd<85>; ld.param.u64 %rd7, [adddmi_param_0]; ld.param.u64 %rd8, [adddmi_param_1]; ld.param.u64 %rd9, [adddmi_param_2]; ld.param.u64 %rd11, [adddmi_param_3]; ld.param.u64 %rd12, [adddmi_param_4]; ld.param.u64 %rd13, [adddmi_param_5]; ld.param.u64 %rd10, [adddmi_param_6]; ld.param.f32 %f261, [adddmi_param_7]; ld.param.u64 %rd14, [adddmi_param_8]; ld.param.u64 %rd15, [adddmi_param_9]; ld.param.u64 %rd16, [adddmi_param_10]; ld.param.f32 %f99, [adddmi_param_11]; ld.param.f32 %f100, [adddmi_param_12]; ld.param.f32 %f101, [adddmi_param_13]; ld.param.u32 %r36, [adddmi_param_14]; ld.param.u32 %r37, [adddmi_param_15]; ld.param.u32 %r38, [adddmi_param_16]; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 %rs13, [adddmi_param_17]; cvta.to.global.u64 %rd1, %rd15; cvta.to.global.u64 %rd2, %rd14; cvta.to.global.u64 %rd3, %rd16; cvta.to.global.u64 %rd4, %rd13; cvta.to.global.u64 %rd5, %rd12; cvta.to.global.u64 %rd6, %rd11; mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, %r39, %r40, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, %tid.y; mad.lo.s32 %r2, %r42, %r43, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r45, %r46, %r47; setp.ge.s32 %p1, %r2, %r37; 
setp.ge.s32 %p2, %r1, %r36; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra BB0_59; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; mul.lo.s32 %r5, %r48, %r36; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd17, %r6, 4; add.s64 %rd18, %rd6, %rd17; cvt.s64.s32 %rd19, %r6; add.s64 %rd20, %rd5, %rd17; add.s64 %rd21, %rd4, %rd17; add.s64 %rd22, %rd3, %rd19; ld.global.nc.u8 %rs1, [%rd22]; cvt.u32.u16 %r49, %rs1; and.b32 %r7, %r49, 255; ld.global.nc.f32 %f1, [%rd18]; ld.global.nc.f32 %f2, [%rd20]; mul.f32 %f102, %f2, %f2; fma.rn.f32 %f103, %f1, %f1, %f102; ld.global.nc.f32 %f3, [%rd21]; fma.rn.f32 %f104, %f3, %f3, %f103; setp.eq.f32 %p6, %f104, 0f00000000; @%p6 bra BB0_59; and.b16 %rs2, %rs13, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r50, %r8, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r122, %r51, %r36; bra.uni BB0_5; BB0_4: mov.u32 %r52, 0; max.s32 %r122, %r8, %r52; BB0_5: add.s32 %r12, %r122, %r5; setp.lt.s32 %p9, %r8, 0; mov.f32 %f225, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f226, %f225; mov.f32 %f227, %f225; @%p10 bra BB0_7; mul.wide.s32 %rd23, %r12, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f225, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f226, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f227, [%rd26]; BB0_7: mul.f32 %f108, %f226, %f226; fma.rn.f32 %f109, %f225, %f225, %f108; fma.rn.f32 %f10, %f227, %f227, %f109; setp.eq.f32 %p11, %f10, 0f00000000; mov.u16 %rs33, %rs1; @%p11 bra BB0_9; cvt.s64.s32 %rd27, %r12; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs33, [%rd28]; BB0_9: setp.gt.u16 %p12, %rs33, %rs1; cvt.u32.u16 %r53, %rs33; and.b32 %r54, %r53, 255; selp.b32 %r55, %r7, %r54, %p12; selp.b32 %r56, %r54, %r7, %p12; add.s32 %r57, %r56, 1; mul.lo.s32 %r58, %r57, %r56; shr.u32 %r59, %r58, 1; add.s32 %r60, %r59, %r55; mul.wide.s32 %rd29, %r60, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f12, [%rd31]; 
setp.ne.s16 %p13, %rs14, 0; mov.f32 %f237, 0f00000000; and.pred %p15, %p11, %p13; mov.f32 %f238, %f237; mov.f32 %f239, %f237; @%p15 bra BB0_13; setp.neu.f32 %p16, %f10, 0f00000000; @%p16 bra BB0_12; mul.f32 %f113, %f12, 0f3F000000; div.rn.f32 %f114, %f113, %f11; mul.f32 %f115, %f114, %f99; fma.rn.f32 %f225, %f3, %f115, %f1; mul.f32 %f116, %f1, %f115; sub.f32 %f227, %f3, %f116; mov.f32 %f226, %f2; BB0_12: mul.f32 %f117, %f99, %f99; add.f32 %f118, %f11, %f11; div.rn.f32 %f119, %f118, %f117; sub.f32 %f120, %f225, %f1; sub.f32 %f121, %f226, %f2; sub.f32 %f122, %f227, %f3; fma.rn.f32 %f123, %f120, %f119, 0f00000000; fma.rn.f32 %f238, %f121, %f119, 0f00000000; fma.rn.f32 %f124, %f119, %f122, 0f00000000; div.rn.f32 %f125, %f12, %f99; mul.f32 %f126, %f227, %f125; sub.f32 %f237, %f123, %f126; fma.rn.f32 %f239, %f225, %f125, %f124; BB0_13: add.s32 %r13, %r1, 1; @%p7 bra BB0_15; rem.s32 %r61, %r13, %r36; add.s32 %r62, %r61, %r36; rem.s32 %r123, %r62, %r36; bra.uni BB0_16; BB0_15: add.s32 %r63, %r36, -1; min.s32 %r123, %r13, %r63; BB0_16: add.s32 %r17, %r123, %r5; setp.ge.s32 %p18, %r13, %r36; mov.f32 %f231, 0f00000000; and.pred %p20, %p18, %p7; mov.f32 %f232, %f231; mov.f32 %f233, %f231; @%p20 bra BB0_18; mul.wide.s32 %rd32, %r17, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f231, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f232, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f233, [%rd35]; BB0_18: mul.f32 %f130, %f232, %f232; fma.rn.f32 %f131, %f231, %f231, %f130; fma.rn.f32 %f30, %f233, %f233, %f131; setp.eq.f32 %p21, %f30, 0f00000000; mov.u16 %rs34, %rs1; @%p21 bra BB0_20; cvt.s64.s32 %rd36, %r17; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs34, [%rd37]; BB0_20: setp.gt.u16 %p22, %rs34, %rs1; cvt.u32.u16 %r64, %rs34; and.b32 %r65, %r64, 255; selp.b32 %r66, %r7, %r65, %p22; selp.b32 %r67, %r65, %r7, %p22; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; add.s32 %r71, %r70, %r66; mul.wide.s32 %rd38, %r71, 4; add.s64 %rd39, 
%rd2, %rd38; ld.global.nc.f32 %f31, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f32, [%rd40]; and.pred %p25, %p21, %p13; @%p25 bra BB0_24; setp.neu.f32 %p26, %f30, 0f00000000; @%p26 bra BB0_23; mul.f32 %f132, %f32, 0f3F000000; div.rn.f32 %f133, %f132, %f31; mul.f32 %f134, %f133, %f99; mul.f32 %f135, %f3, %f134; sub.f32 %f231, %f1, %f135; fma.rn.f32 %f233, %f1, %f134, %f3; mov.f32 %f232, %f2; BB0_23: mul.f32 %f136, %f99, %f99; add.f32 %f137, %f31, %f31; div.rn.f32 %f138, %f137, %f136; sub.f32 %f139, %f231, %f1; sub.f32 %f140, %f232, %f2; sub.f32 %f141, %f233, %f3; fma.rn.f32 %f142, %f139, %f138, %f237; fma.rn.f32 %f238, %f140, %f138, %f238; fma.rn.f32 %f143, %f138, %f141, %f239; div.rn.f32 %f144, %f32, %f99; fma.rn.f32 %f237, %f233, %f144, %f142; mul.f32 %f145, %f231, %f144; sub.f32 %f239, %f143, %f145; BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p27, %rs7, 0; add.s32 %r18, %r2, -1; @%p27 bra BB0_26; rem.s32 %r72, %r18, %r37; add.s32 %r73, %r72, %r37; rem.s32 %r124, %r73, %r37; bra.uni BB0_27; BB0_26: mov.u32 %r74, 0; max.s32 %r124, %r18, %r74; BB0_27: add.s32 %r75, %r124, %r4; mad.lo.s32 %r22, %r75, %r36, %r1; setp.lt.s32 %p29, %r18, 0; mov.f32 %f240, 0f00000000; and.pred %p30, %p29, %p27; mov.f32 %f241, %f240; mov.f32 %f242, %f240; @%p30 bra BB0_29; mul.wide.s32 %rd41, %r22, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f240, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f241, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f242, [%rd44]; BB0_29: mul.f32 %f149, %f241, %f241; fma.rn.f32 %f150, %f240, %f240, %f149; fma.rn.f32 %f50, %f242, %f242, %f150; setp.eq.f32 %p31, %f50, 0f00000000; mov.u16 %rs35, %rs1; @%p31 bra BB0_31; cvt.s64.s32 %rd45, %r22; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs35, [%rd46]; BB0_31: setp.gt.u16 %p32, %rs35, %rs1; cvt.u32.u16 %r76, %rs35; and.b32 %r77, %r76, 255; selp.b32 %r78, %r7, %r77, %p32; selp.b32 %r79, %r77, %r7, %p32; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; 
add.s32 %r83, %r82, %r78; mul.wide.s32 %rd47, %r83, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f51, [%rd48]; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f52, [%rd49]; and.pred %p35, %p31, %p13; @%p35 bra BB0_35; setp.neu.f32 %p36, %f50, 0f00000000; @%p36 bra BB0_34; mul.f32 %f151, %f52, 0f3F000000; div.rn.f32 %f152, %f151, %f51; mul.f32 %f153, %f152, %f100; fma.rn.f32 %f241, %f3, %f153, %f2; mul.f32 %f154, %f2, %f153; sub.f32 %f242, %f3, %f154; mov.f32 %f240, %f1; BB0_34: mul.f32 %f155, %f100, %f100; add.f32 %f156, %f51, %f51; div.rn.f32 %f157, %f156, %f155; sub.f32 %f158, %f240, %f1; sub.f32 %f159, %f241, %f2; sub.f32 %f160, %f242, %f3; fma.rn.f32 %f237, %f158, %f157, %f237; fma.rn.f32 %f161, %f159, %f157, %f238; fma.rn.f32 %f162, %f157, %f160, %f239; div.rn.f32 %f163, %f52, %f100; mul.f32 %f164, %f242, %f163; sub.f32 %f238, %f161, %f164; fma.rn.f32 %f239, %f241, %f163, %f162; BB0_35: add.s32 %r23, %r2, 1; @%p27 bra BB0_37; rem.s32 %r84, %r23, %r37; add.s32 %r85, %r84, %r37; rem.s32 %r125, %r85, %r37; bra.uni BB0_38; BB0_37: add.s32 %r86, %r37, -1; min.s32 %r125, %r23, %r86; BB0_38: add.s32 %r87, %r125, %r4; mad.lo.s32 %r27, %r87, %r36, %r1; setp.ge.s32 %p38, %r23, %r37; mov.f32 %f249, 0f00000000; and.pred %p40, %p38, %p27; mov.f32 %f250, %f249; mov.f32 %f251, %f249; @%p40 bra BB0_40; mul.wide.s32 %rd50, %r27, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f249, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f250, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f251, [%rd53]; BB0_40: mul.f32 %f168, %f250, %f250; fma.rn.f32 %f169, %f249, %f249, %f168; fma.rn.f32 %f70, %f251, %f251, %f169; setp.eq.f32 %p41, %f70, 0f00000000; mov.u16 %rs36, %rs1; @%p41 bra BB0_42; cvt.s64.s32 %rd54, %r27; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs36, [%rd55]; BB0_42: setp.gt.u16 %p42, %rs36, %rs1; cvt.u32.u16 %r88, %rs36; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p42; selp.b32 %r91, %r89, %r7, %p42; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, 
%r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd56, %r95, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f71, [%rd57]; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f72, [%rd58]; and.pred %p45, %p41, %p13; @%p45 bra BB0_46; setp.neu.f32 %p46, %f70, 0f00000000; @%p46 bra BB0_45; mul.f32 %f170, %f72, 0f3F000000; div.rn.f32 %f171, %f170, %f71; mul.f32 %f172, %f171, %f100; mul.f32 %f173, %f3, %f172; sub.f32 %f250, %f2, %f173; fma.rn.f32 %f251, %f2, %f172, %f3; mov.f32 %f249, %f1; BB0_45: mul.f32 %f174, %f100, %f100; add.f32 %f175, %f71, %f71; div.rn.f32 %f176, %f175, %f174; sub.f32 %f177, %f249, %f1; sub.f32 %f178, %f250, %f2; sub.f32 %f179, %f251, %f3; fma.rn.f32 %f237, %f177, %f176, %f237; fma.rn.f32 %f180, %f178, %f176, %f238; fma.rn.f32 %f181, %f176, %f179, %f239; div.rn.f32 %f182, %f72, %f100; fma.rn.f32 %f238, %f251, %f182, %f180; mul.f32 %f183, %f250, %f182; sub.f32 %f239, %f181, %f183; BB0_46: setp.eq.s32 %p47, %r38, 1; @%p47 bra BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 %p48, %rs12, 0; add.s32 %r28, %r3, -1; @%p48 bra BB0_49; rem.s32 %r96, %r28, %r38; add.s32 %r97, %r96, %r38; rem.s32 %r126, %r97, %r38; bra.uni BB0_50; BB0_49: mov.u32 %r98, 0; max.s32 %r126, %r28, %r98; BB0_50: mad.lo.s32 %r99, %r126, %r37, %r2; mad.lo.s32 %r100, %r99, %r36, %r1; cvt.s64.s32 %rd59, %r100; mul.wide.s32 %rd60, %r100, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, %rd4, %rd60; ld.global.nc.f32 %f184, [%rd61]; ld.global.nc.f32 %f185, [%rd62]; mul.f32 %f186, %f185, %f185; fma.rn.f32 %f187, %f184, %f184, %f186; ld.global.nc.f32 %f188, [%rd63]; fma.rn.f32 %f189, %f188, %f188, %f187; setp.eq.f32 %p49, %f189, 0f00000000; selp.f32 %f190, %f1, %f184, %p49; selp.f32 %f191, %f2, %f185, %p49; selp.f32 %f192, %f3, %f188, %p49; add.s64 %rd64, %rd3, %rd59; ld.global.nc.u8 %rs26, [%rd64]; setp.gt.u16 %p50, %rs26, %rs1; cvt.u32.u16 %r101, %rs26; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p50; selp.b32 %r104, %r102, %r7, %p50; 
add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd65, %r108, 4; add.s64 %rd66, %rd2, %rd65; ld.global.nc.f32 %f193, [%rd66]; add.f32 %f194, %f193, %f193; mul.f32 %f84, %f101, %f101; div.rn.f32 %f195, %f194, %f84; sub.f32 %f196, %f190, %f1; sub.f32 %f197, %f191, %f2; sub.f32 %f198, %f192, %f3; fma.rn.f32 %f85, %f195, %f196, %f237; fma.rn.f32 %f86, %f195, %f197, %f238; fma.rn.f32 %f87, %f195, %f198, %f239; add.s32 %r32, %r3, 1; @%p48 bra BB0_52; rem.s32 %r109, %r32, %r38; add.s32 %r110, %r109, %r38; rem.s32 %r127, %r110, %r38; bra.uni BB0_53; BB0_52: add.s32 %r111, %r38, -1; min.s32 %r127, %r32, %r111; BB0_53: mad.lo.s32 %r112, %r127, %r37, %r2; mad.lo.s32 %r113, %r112, %r36, %r1; cvt.s64.s32 %rd67, %r113; mul.wide.s32 %rd68, %r113, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f199, [%rd69]; ld.global.nc.f32 %f200, [%rd70]; mul.f32 %f201, %f200, %f200; fma.rn.f32 %f202, %f199, %f199, %f201; ld.global.nc.f32 %f203, [%rd71]; fma.rn.f32 %f204, %f203, %f203, %f202; setp.eq.f32 %p52, %f204, 0f00000000; selp.f32 %f205, %f3, %f203, %p52; selp.f32 %f206, %f2, %f200, %p52; selp.f32 %f207, %f1, %f199, %p52; add.s64 %rd72, %rd3, %rd67; ld.global.nc.u8 %rs30, [%rd72]; setp.gt.u16 %p53, %rs30, %rs1; cvt.u32.u16 %r114, %rs30; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p53; selp.b32 %r117, %r115, %r7, %p53; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd73, %r121, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f208, [%rd74]; add.f32 %f209, %f208, %f208; div.rn.f32 %f210, %f209, %f84; sub.f32 %f211, %f207, %f1; sub.f32 %f212, %f206, %f2; sub.f32 %f213, %f205, %f3; fma.rn.f32 %f237, %f210, %f211, %f85; fma.rn.f32 %f238, %f210, %f212, %f86; fma.rn.f32 %f239, %f210, %f213, %f87; BB0_54: setp.eq.s64 %p54, %rd10, 0; @%p54 bra BB0_56; cvta.to.global.u64 %rd75, 
%rd10; add.s64 %rd77, %rd75, %rd17; ld.global.nc.f32 %f214, [%rd77]; mul.f32 %f261, %f214, %f261; BB0_56: setp.eq.f32 %p55, %f261, 0f00000000; mov.f32 %f262, 0f00000000; @%p55 bra BB0_58; rcp.rn.f32 %f262, %f261; BB0_58: cvta.to.global.u64 %rd78, %rd9; cvta.to.global.u64 %rd79, %rd8; cvta.to.global.u64 %rd80, %rd7; add.s64 %rd82, %rd80, %rd17; ld.global.f32 %f216, [%rd82]; fma.rn.f32 %f217, %f237, %f262, %f216; st.global.f32 [%rd82], %f217; add.s64 %rd83, %rd79, %rd17; ld.global.f32 %f218, [%rd83]; fma.rn.f32 %f219, %f238, %f262, %f218; st.global.f32 [%rd83], %f219; add.s64 %rd84, %rd78, %rd17; ld.global.f32 %f220, [%rd84]; fma.rn.f32 %f221, %f239, %f262, %f220; st.global.f32 [%rd84], %f221; BB0_59: ret; } ` adddmi_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<56>; .reg .b16 %rs<37>; .reg .f32 %f<263>; .reg .b32 %r<128>; .reg .b64 %rd<85>; ld.param.u64 %rd7, [adddmi_param_0]; ld.param.u64 %rd8, [adddmi_param_1]; ld.param.u64 %rd9, [adddmi_param_2]; ld.param.u64 %rd11, [adddmi_param_3]; ld.param.u64 %rd12, [adddmi_param_4]; ld.param.u64 %rd13, [adddmi_param_5]; ld.param.u64 %rd10, [adddmi_param_6]; ld.param.f32 %f261, [adddmi_param_7]; ld.param.u64 %rd14, [adddmi_param_8]; ld.param.u64 %rd15, [adddmi_param_9]; ld.param.u64 %rd16, [adddmi_param_10]; ld.param.f32 %f99, [adddmi_param_11]; ld.param.f32 %f100, [adddmi_param_12]; ld.param.f32 %f101, [adddmi_param_13]; 
ld.param.u32 %r36, [adddmi_param_14]; ld.param.u32 %r37, [adddmi_param_15]; ld.param.u32 %r38, [adddmi_param_16]; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 %rs13, [adddmi_param_17]; cvta.to.global.u64 %rd1, %rd15; cvta.to.global.u64 %rd2, %rd14; cvta.to.global.u64 %rd3, %rd16; cvta.to.global.u64 %rd4, %rd13; cvta.to.global.u64 %rd5, %rd12; cvta.to.global.u64 %rd6, %rd11; mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, %r39, %r40, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, %tid.y; mad.lo.s32 %r2, %r42, %r43, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r45, %r46, %r47; setp.ge.s32 %p1, %r2, %r37; setp.ge.s32 %p2, %r1, %r36; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra BB0_59; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; mul.lo.s32 %r5, %r48, %r36; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd17, %r6, 4; add.s64 %rd18, %rd6, %rd17; cvt.s64.s32 %rd19, %r6; add.s64 %rd20, %rd5, %rd17; add.s64 %rd21, %rd4, %rd17; add.s64 %rd22, %rd3, %rd19; ld.global.nc.u8 %rs1, [%rd22]; cvt.u32.u16 %r49, %rs1; and.b32 %r7, %r49, 255; ld.global.nc.f32 %f1, [%rd18]; ld.global.nc.f32 %f2, [%rd20]; mul.f32 %f102, %f2, %f2; fma.rn.f32 %f103, %f1, %f1, %f102; ld.global.nc.f32 %f3, [%rd21]; fma.rn.f32 %f104, %f3, %f3, %f103; setp.eq.f32 %p6, %f104, 0f00000000; @%p6 bra BB0_59; and.b16 %rs2, %rs13, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r50, %r8, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r122, %r51, %r36; bra.uni BB0_5; BB0_4: mov.u32 %r52, 0; max.s32 %r122, %r8, %r52; BB0_5: add.s32 %r12, %r122, %r5; setp.lt.s32 %p9, %r8, 0; mov.f32 %f225, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f226, %f225; mov.f32 %f227, %f225; @%p10 bra BB0_7; mul.wide.s32 %rd23, %r12, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f225, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f226, [%rd25]; add.s64 %rd26, %rd4, 
%rd23; ld.global.nc.f32 %f227, [%rd26]; BB0_7: mul.f32 %f108, %f226, %f226; fma.rn.f32 %f109, %f225, %f225, %f108; fma.rn.f32 %f10, %f227, %f227, %f109; setp.eq.f32 %p11, %f10, 0f00000000; mov.u16 %rs33, %rs1; @%p11 bra BB0_9; cvt.s64.s32 %rd27, %r12; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs33, [%rd28]; BB0_9: setp.gt.u16 %p12, %rs33, %rs1; cvt.u32.u16 %r53, %rs33; and.b32 %r54, %r53, 255; selp.b32 %r55, %r7, %r54, %p12; selp.b32 %r56, %r54, %r7, %p12; add.s32 %r57, %r56, 1; mul.lo.s32 %r58, %r57, %r56; shr.u32 %r59, %r58, 1; add.s32 %r60, %r59, %r55; mul.wide.s32 %rd29, %r60, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f12, [%rd31]; setp.ne.s16 %p13, %rs14, 0; mov.f32 %f237, 0f00000000; and.pred %p15, %p11, %p13; mov.f32 %f238, %f237; mov.f32 %f239, %f237; @%p15 bra BB0_13; setp.neu.f32 %p16, %f10, 0f00000000; @%p16 bra BB0_12; mul.f32 %f113, %f12, 0f3F000000; div.rn.f32 %f114, %f113, %f11; mul.f32 %f115, %f114, %f99; fma.rn.f32 %f225, %f3, %f115, %f1; mul.f32 %f116, %f1, %f115; sub.f32 %f227, %f3, %f116; mov.f32 %f226, %f2; BB0_12: mul.f32 %f117, %f99, %f99; add.f32 %f118, %f11, %f11; div.rn.f32 %f119, %f118, %f117; sub.f32 %f120, %f225, %f1; sub.f32 %f121, %f226, %f2; sub.f32 %f122, %f227, %f3; fma.rn.f32 %f123, %f120, %f119, 0f00000000; fma.rn.f32 %f238, %f121, %f119, 0f00000000; fma.rn.f32 %f124, %f119, %f122, 0f00000000; div.rn.f32 %f125, %f12, %f99; mul.f32 %f126, %f227, %f125; sub.f32 %f237, %f123, %f126; fma.rn.f32 %f239, %f225, %f125, %f124; BB0_13: add.s32 %r13, %r1, 1; @%p7 bra BB0_15; rem.s32 %r61, %r13, %r36; add.s32 %r62, %r61, %r36; rem.s32 %r123, %r62, %r36; bra.uni BB0_16; BB0_15: add.s32 %r63, %r36, -1; min.s32 %r123, %r13, %r63; BB0_16: add.s32 %r17, %r123, %r5; setp.ge.s32 %p18, %r13, %r36; mov.f32 %f231, 0f00000000; and.pred %p20, %p18, %p7; mov.f32 %f232, %f231; mov.f32 %f233, %f231; @%p20 bra BB0_18; mul.wide.s32 %rd32, %r17, 4; add.s64 %rd33, %rd6, %rd32; 
ld.global.nc.f32 %f231, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f232, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f233, [%rd35]; BB0_18: mul.f32 %f130, %f232, %f232; fma.rn.f32 %f131, %f231, %f231, %f130; fma.rn.f32 %f30, %f233, %f233, %f131; setp.eq.f32 %p21, %f30, 0f00000000; mov.u16 %rs34, %rs1; @%p21 bra BB0_20; cvt.s64.s32 %rd36, %r17; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs34, [%rd37]; BB0_20: setp.gt.u16 %p22, %rs34, %rs1; cvt.u32.u16 %r64, %rs34; and.b32 %r65, %r64, 255; selp.b32 %r66, %r7, %r65, %p22; selp.b32 %r67, %r65, %r7, %p22; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; add.s32 %r71, %r70, %r66; mul.wide.s32 %rd38, %r71, 4; add.s64 %rd39, %rd2, %rd38; ld.global.nc.f32 %f31, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f32, [%rd40]; and.pred %p25, %p21, %p13; @%p25 bra BB0_24; setp.neu.f32 %p26, %f30, 0f00000000; @%p26 bra BB0_23; mul.f32 %f132, %f32, 0f3F000000; div.rn.f32 %f133, %f132, %f31; mul.f32 %f134, %f133, %f99; mul.f32 %f135, %f3, %f134; sub.f32 %f231, %f1, %f135; fma.rn.f32 %f233, %f1, %f134, %f3; mov.f32 %f232, %f2; BB0_23: mul.f32 %f136, %f99, %f99; add.f32 %f137, %f31, %f31; div.rn.f32 %f138, %f137, %f136; sub.f32 %f139, %f231, %f1; sub.f32 %f140, %f232, %f2; sub.f32 %f141, %f233, %f3; fma.rn.f32 %f142, %f139, %f138, %f237; fma.rn.f32 %f238, %f140, %f138, %f238; fma.rn.f32 %f143, %f138, %f141, %f239; div.rn.f32 %f144, %f32, %f99; fma.rn.f32 %f237, %f233, %f144, %f142; mul.f32 %f145, %f231, %f144; sub.f32 %f239, %f143, %f145; BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p27, %rs7, 0; add.s32 %r18, %r2, -1; @%p27 bra BB0_26; rem.s32 %r72, %r18, %r37; add.s32 %r73, %r72, %r37; rem.s32 %r124, %r73, %r37; bra.uni BB0_27; BB0_26: mov.u32 %r74, 0; max.s32 %r124, %r18, %r74; BB0_27: add.s32 %r75, %r124, %r4; mad.lo.s32 %r22, %r75, %r36, %r1; setp.lt.s32 %p29, %r18, 0; mov.f32 %f240, 0f00000000; and.pred %p30, %p29, %p27; mov.f32 %f241, %f240; mov.f32 %f242, %f240; @%p30 
bra BB0_29; mul.wide.s32 %rd41, %r22, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f240, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f241, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f242, [%rd44]; BB0_29: mul.f32 %f149, %f241, %f241; fma.rn.f32 %f150, %f240, %f240, %f149; fma.rn.f32 %f50, %f242, %f242, %f150; setp.eq.f32 %p31, %f50, 0f00000000; mov.u16 %rs35, %rs1; @%p31 bra BB0_31; cvt.s64.s32 %rd45, %r22; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs35, [%rd46]; BB0_31: setp.gt.u16 %p32, %rs35, %rs1; cvt.u32.u16 %r76, %rs35; and.b32 %r77, %r76, 255; selp.b32 %r78, %r7, %r77, %p32; selp.b32 %r79, %r77, %r7, %p32; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd47, %r83, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f51, [%rd48]; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f52, [%rd49]; and.pred %p35, %p31, %p13; @%p35 bra BB0_35; setp.neu.f32 %p36, %f50, 0f00000000; @%p36 bra BB0_34; mul.f32 %f151, %f52, 0f3F000000; div.rn.f32 %f152, %f151, %f51; mul.f32 %f153, %f152, %f100; fma.rn.f32 %f241, %f3, %f153, %f2; mul.f32 %f154, %f2, %f153; sub.f32 %f242, %f3, %f154; mov.f32 %f240, %f1; BB0_34: mul.f32 %f155, %f100, %f100; add.f32 %f156, %f51, %f51; div.rn.f32 %f157, %f156, %f155; sub.f32 %f158, %f240, %f1; sub.f32 %f159, %f241, %f2; sub.f32 %f160, %f242, %f3; fma.rn.f32 %f237, %f158, %f157, %f237; fma.rn.f32 %f161, %f159, %f157, %f238; fma.rn.f32 %f162, %f157, %f160, %f239; div.rn.f32 %f163, %f52, %f100; mul.f32 %f164, %f242, %f163; sub.f32 %f238, %f161, %f164; fma.rn.f32 %f239, %f241, %f163, %f162; BB0_35: add.s32 %r23, %r2, 1; @%p27 bra BB0_37; rem.s32 %r84, %r23, %r37; add.s32 %r85, %r84, %r37; rem.s32 %r125, %r85, %r37; bra.uni BB0_38; BB0_37: add.s32 %r86, %r37, -1; min.s32 %r125, %r23, %r86; BB0_38: add.s32 %r87, %r125, %r4; mad.lo.s32 %r27, %r87, %r36, %r1; setp.ge.s32 %p38, %r23, %r37; mov.f32 %f249, 0f00000000; and.pred %p40, %p38, %p27; mov.f32 %f250, 
%f249; mov.f32 %f251, %f249; @%p40 bra BB0_40; mul.wide.s32 %rd50, %r27, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f249, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f250, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f251, [%rd53]; BB0_40: mul.f32 %f168, %f250, %f250; fma.rn.f32 %f169, %f249, %f249, %f168; fma.rn.f32 %f70, %f251, %f251, %f169; setp.eq.f32 %p41, %f70, 0f00000000; mov.u16 %rs36, %rs1; @%p41 bra BB0_42; cvt.s64.s32 %rd54, %r27; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs36, [%rd55]; BB0_42: setp.gt.u16 %p42, %rs36, %rs1; cvt.u32.u16 %r88, %rs36; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p42; selp.b32 %r91, %r89, %r7, %p42; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd56, %r95, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f71, [%rd57]; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f72, [%rd58]; and.pred %p45, %p41, %p13; @%p45 bra BB0_46; setp.neu.f32 %p46, %f70, 0f00000000; @%p46 bra BB0_45; mul.f32 %f170, %f72, 0f3F000000; div.rn.f32 %f171, %f170, %f71; mul.f32 %f172, %f171, %f100; mul.f32 %f173, %f3, %f172; sub.f32 %f250, %f2, %f173; fma.rn.f32 %f251, %f2, %f172, %f3; mov.f32 %f249, %f1; BB0_45: mul.f32 %f174, %f100, %f100; add.f32 %f175, %f71, %f71; div.rn.f32 %f176, %f175, %f174; sub.f32 %f177, %f249, %f1; sub.f32 %f178, %f250, %f2; sub.f32 %f179, %f251, %f3; fma.rn.f32 %f237, %f177, %f176, %f237; fma.rn.f32 %f180, %f178, %f176, %f238; fma.rn.f32 %f181, %f176, %f179, %f239; div.rn.f32 %f182, %f72, %f100; fma.rn.f32 %f238, %f251, %f182, %f180; mul.f32 %f183, %f250, %f182; sub.f32 %f239, %f181, %f183; BB0_46: setp.eq.s32 %p47, %r38, 1; @%p47 bra BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 %p48, %rs12, 0; add.s32 %r28, %r3, -1; @%p48 bra BB0_49; rem.s32 %r96, %r28, %r38; add.s32 %r97, %r96, %r38; rem.s32 %r126, %r97, %r38; bra.uni BB0_50; BB0_49: mov.u32 %r98, 0; max.s32 %r126, %r28, %r98; BB0_50: mad.lo.s32 %r99, %r126, %r37, %r2; 
mad.lo.s32 %r100, %r99, %r36, %r1; cvt.s64.s32 %rd59, %r100; mul.wide.s32 %rd60, %r100, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, %rd4, %rd60; ld.global.nc.f32 %f184, [%rd61]; ld.global.nc.f32 %f185, [%rd62]; mul.f32 %f186, %f185, %f185; fma.rn.f32 %f187, %f184, %f184, %f186; ld.global.nc.f32 %f188, [%rd63]; fma.rn.f32 %f189, %f188, %f188, %f187; setp.eq.f32 %p49, %f189, 0f00000000; selp.f32 %f190, %f1, %f184, %p49; selp.f32 %f191, %f2, %f185, %p49; selp.f32 %f192, %f3, %f188, %p49; add.s64 %rd64, %rd3, %rd59; ld.global.nc.u8 %rs26, [%rd64]; setp.gt.u16 %p50, %rs26, %rs1; cvt.u32.u16 %r101, %rs26; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p50; selp.b32 %r104, %r102, %r7, %p50; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd65, %r108, 4; add.s64 %rd66, %rd2, %rd65; ld.global.nc.f32 %f193, [%rd66]; add.f32 %f194, %f193, %f193; mul.f32 %f84, %f101, %f101; div.rn.f32 %f195, %f194, %f84; sub.f32 %f196, %f190, %f1; sub.f32 %f197, %f191, %f2; sub.f32 %f198, %f192, %f3; fma.rn.f32 %f85, %f195, %f196, %f237; fma.rn.f32 %f86, %f195, %f197, %f238; fma.rn.f32 %f87, %f195, %f198, %f239; add.s32 %r32, %r3, 1; @%p48 bra BB0_52; rem.s32 %r109, %r32, %r38; add.s32 %r110, %r109, %r38; rem.s32 %r127, %r110, %r38; bra.uni BB0_53; BB0_52: add.s32 %r111, %r38, -1; min.s32 %r127, %r32, %r111; BB0_53: mad.lo.s32 %r112, %r127, %r37, %r2; mad.lo.s32 %r113, %r112, %r36, %r1; cvt.s64.s32 %rd67, %r113; mul.wide.s32 %rd68, %r113, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f199, [%rd69]; ld.global.nc.f32 %f200, [%rd70]; mul.f32 %f201, %f200, %f200; fma.rn.f32 %f202, %f199, %f199, %f201; ld.global.nc.f32 %f203, [%rd71]; fma.rn.f32 %f204, %f203, %f203, %f202; setp.eq.f32 %p52, %f204, 0f00000000; selp.f32 %f205, %f3, %f203, %p52; selp.f32 %f206, %f2, %f200, %p52; selp.f32 %f207, %f1, %f199, %p52; add.s64 %rd72, 
%rd3, %rd67; ld.global.nc.u8 %rs30, [%rd72]; setp.gt.u16 %p53, %rs30, %rs1; cvt.u32.u16 %r114, %rs30; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p53; selp.b32 %r117, %r115, %r7, %p53; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd73, %r121, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f208, [%rd74]; add.f32 %f209, %f208, %f208; div.rn.f32 %f210, %f209, %f84; sub.f32 %f211, %f207, %f1; sub.f32 %f212, %f206, %f2; sub.f32 %f213, %f205, %f3; fma.rn.f32 %f237, %f210, %f211, %f85; fma.rn.f32 %f238, %f210, %f212, %f86; fma.rn.f32 %f239, %f210, %f213, %f87; BB0_54: setp.eq.s64 %p54, %rd10, 0; @%p54 bra BB0_56; cvta.to.global.u64 %rd75, %rd10; add.s64 %rd77, %rd75, %rd17; ld.global.nc.f32 %f214, [%rd77]; mul.f32 %f261, %f214, %f261; BB0_56: setp.eq.f32 %p55, %f261, 0f00000000; mov.f32 %f262, 0f00000000; @%p55 bra BB0_58; rcp.rn.f32 %f262, %f261; BB0_58: cvta.to.global.u64 %rd78, %rd9; cvta.to.global.u64 %rd79, %rd8; cvta.to.global.u64 %rd80, %rd7; add.s64 %rd82, %rd80, %rd17; ld.global.f32 %f216, [%rd82]; fma.rn.f32 %f217, %f237, %f262, %f216; st.global.f32 [%rd82], %f217; add.s64 %rd83, %rd79, %rd17; ld.global.f32 %f218, [%rd83]; fma.rn.f32 %f219, %f238, %f262, %f218; st.global.f32 [%rd83], %f219; add.s64 %rd84, %rd78, %rd17; ld.global.f32 %f220, [%rd84]; fma.rn.f32 %f221, %f239, %f262, %f220; st.global.f32 [%rd84], %f221; BB0_59: ret; } ` adddmi_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 
adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<56>; .reg .b16 %rs<37>; .reg .f32 %f<263>; .reg .b32 %r<128>; .reg .b64 %rd<85>; ld.param.u64 %rd7, [adddmi_param_0]; ld.param.u64 %rd8, [adddmi_param_1]; ld.param.u64 %rd9, [adddmi_param_2]; ld.param.u64 %rd11, [adddmi_param_3]; ld.param.u64 %rd12, [adddmi_param_4]; ld.param.u64 %rd13, [adddmi_param_5]; ld.param.u64 %rd10, [adddmi_param_6]; ld.param.f32 %f261, [adddmi_param_7]; ld.param.u64 %rd14, [adddmi_param_8]; ld.param.u64 %rd15, [adddmi_param_9]; ld.param.u64 %rd16, [adddmi_param_10]; ld.param.f32 %f99, [adddmi_param_11]; ld.param.f32 %f100, [adddmi_param_12]; ld.param.f32 %f101, [adddmi_param_13]; ld.param.u32 %r36, [adddmi_param_14]; ld.param.u32 %r37, [adddmi_param_15]; ld.param.u32 %r38, [adddmi_param_16]; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 %rs13, [adddmi_param_17]; cvta.to.global.u64 %rd1, %rd15; cvta.to.global.u64 %rd2, %rd14; cvta.to.global.u64 %rd3, %rd16; cvta.to.global.u64 %rd4, %rd13; cvta.to.global.u64 %rd5, %rd12; cvta.to.global.u64 %rd6, %rd11; mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, %r39, %r40, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, %tid.y; mad.lo.s32 %r2, %r42, %r43, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r45, %r46, %r47; setp.ge.s32 %p1, %r2, %r37; setp.ge.s32 %p2, %r1, %r36; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra BB0_59; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; mul.lo.s32 %r5, %r48, %r36; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd17, %r6, 4; add.s64 %rd18, %rd6, %rd17; cvt.s64.s32 %rd19, %r6; add.s64 %rd20, %rd5, %rd17; add.s64 %rd21, %rd4, %rd17; add.s64 %rd22, %rd3, %rd19; ld.global.nc.u8 %rs1, [%rd22]; cvt.u32.u16 %r49, %rs1; and.b32 %r7, %r49, 255; ld.global.nc.f32 %f1, [%rd18]; ld.global.nc.f32 %f2, [%rd20]; mul.f32 %f102, %f2, 
%f2; fma.rn.f32 %f103, %f1, %f1, %f102; ld.global.nc.f32 %f3, [%rd21]; fma.rn.f32 %f104, %f3, %f3, %f103; setp.eq.f32 %p6, %f104, 0f00000000; @%p6 bra BB0_59; and.b16 %rs2, %rs13, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r50, %r8, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r122, %r51, %r36; bra.uni BB0_5; BB0_4: mov.u32 %r52, 0; max.s32 %r122, %r8, %r52; BB0_5: add.s32 %r12, %r122, %r5; setp.lt.s32 %p9, %r8, 0; mov.f32 %f225, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f226, %f225; mov.f32 %f227, %f225; @%p10 bra BB0_7; mul.wide.s32 %rd23, %r12, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f225, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f226, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f227, [%rd26]; BB0_7: mul.f32 %f108, %f226, %f226; fma.rn.f32 %f109, %f225, %f225, %f108; fma.rn.f32 %f10, %f227, %f227, %f109; setp.eq.f32 %p11, %f10, 0f00000000; mov.u16 %rs33, %rs1; @%p11 bra BB0_9; cvt.s64.s32 %rd27, %r12; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs33, [%rd28]; BB0_9: setp.gt.u16 %p12, %rs33, %rs1; cvt.u32.u16 %r53, %rs33; and.b32 %r54, %r53, 255; selp.b32 %r55, %r7, %r54, %p12; selp.b32 %r56, %r54, %r7, %p12; add.s32 %r57, %r56, 1; mul.lo.s32 %r58, %r57, %r56; shr.u32 %r59, %r58, 1; add.s32 %r60, %r59, %r55; mul.wide.s32 %rd29, %r60, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f12, [%rd31]; setp.ne.s16 %p13, %rs14, 0; mov.f32 %f237, 0f00000000; and.pred %p15, %p11, %p13; mov.f32 %f238, %f237; mov.f32 %f239, %f237; @%p15 bra BB0_13; setp.neu.f32 %p16, %f10, 0f00000000; @%p16 bra BB0_12; mul.f32 %f113, %f12, 0f3F000000; div.rn.f32 %f114, %f113, %f11; mul.f32 %f115, %f114, %f99; fma.rn.f32 %f225, %f3, %f115, %f1; mul.f32 %f116, %f1, %f115; sub.f32 %f227, %f3, %f116; mov.f32 %f226, %f2; BB0_12: mul.f32 %f117, %f99, %f99; add.f32 %f118, %f11, %f11; div.rn.f32 %f119, %f118, %f117; sub.f32 %f120, %f225, %f1; sub.f32 %f121, %f226, %f2; 
sub.f32 %f122, %f227, %f3; fma.rn.f32 %f123, %f120, %f119, 0f00000000; fma.rn.f32 %f238, %f121, %f119, 0f00000000; fma.rn.f32 %f124, %f119, %f122, 0f00000000; div.rn.f32 %f125, %f12, %f99; mul.f32 %f126, %f227, %f125; sub.f32 %f237, %f123, %f126; fma.rn.f32 %f239, %f225, %f125, %f124; BB0_13: add.s32 %r13, %r1, 1; @%p7 bra BB0_15; rem.s32 %r61, %r13, %r36; add.s32 %r62, %r61, %r36; rem.s32 %r123, %r62, %r36; bra.uni BB0_16; BB0_15: add.s32 %r63, %r36, -1; min.s32 %r123, %r13, %r63; BB0_16: add.s32 %r17, %r123, %r5; setp.ge.s32 %p18, %r13, %r36; mov.f32 %f231, 0f00000000; and.pred %p20, %p18, %p7; mov.f32 %f232, %f231; mov.f32 %f233, %f231; @%p20 bra BB0_18; mul.wide.s32 %rd32, %r17, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f231, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f232, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f233, [%rd35]; BB0_18: mul.f32 %f130, %f232, %f232; fma.rn.f32 %f131, %f231, %f231, %f130; fma.rn.f32 %f30, %f233, %f233, %f131; setp.eq.f32 %p21, %f30, 0f00000000; mov.u16 %rs34, %rs1; @%p21 bra BB0_20; cvt.s64.s32 %rd36, %r17; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs34, [%rd37]; BB0_20: setp.gt.u16 %p22, %rs34, %rs1; cvt.u32.u16 %r64, %rs34; and.b32 %r65, %r64, 255; selp.b32 %r66, %r7, %r65, %p22; selp.b32 %r67, %r65, %r7, %p22; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; add.s32 %r71, %r70, %r66; mul.wide.s32 %rd38, %r71, 4; add.s64 %rd39, %rd2, %rd38; ld.global.nc.f32 %f31, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f32, [%rd40]; and.pred %p25, %p21, %p13; @%p25 bra BB0_24; setp.neu.f32 %p26, %f30, 0f00000000; @%p26 bra BB0_23; mul.f32 %f132, %f32, 0f3F000000; div.rn.f32 %f133, %f132, %f31; mul.f32 %f134, %f133, %f99; mul.f32 %f135, %f3, %f134; sub.f32 %f231, %f1, %f135; fma.rn.f32 %f233, %f1, %f134, %f3; mov.f32 %f232, %f2; BB0_23: mul.f32 %f136, %f99, %f99; add.f32 %f137, %f31, %f31; div.rn.f32 %f138, %f137, %f136; sub.f32 %f139, %f231, %f1; sub.f32 %f140, %f232, 
%f2; sub.f32 %f141, %f233, %f3; fma.rn.f32 %f142, %f139, %f138, %f237; fma.rn.f32 %f238, %f140, %f138, %f238; fma.rn.f32 %f143, %f138, %f141, %f239; div.rn.f32 %f144, %f32, %f99; fma.rn.f32 %f237, %f233, %f144, %f142; mul.f32 %f145, %f231, %f144; sub.f32 %f239, %f143, %f145; BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p27, %rs7, 0; add.s32 %r18, %r2, -1; @%p27 bra BB0_26; rem.s32 %r72, %r18, %r37; add.s32 %r73, %r72, %r37; rem.s32 %r124, %r73, %r37; bra.uni BB0_27; BB0_26: mov.u32 %r74, 0; max.s32 %r124, %r18, %r74; BB0_27: add.s32 %r75, %r124, %r4; mad.lo.s32 %r22, %r75, %r36, %r1; setp.lt.s32 %p29, %r18, 0; mov.f32 %f240, 0f00000000; and.pred %p30, %p29, %p27; mov.f32 %f241, %f240; mov.f32 %f242, %f240; @%p30 bra BB0_29; mul.wide.s32 %rd41, %r22, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f240, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f241, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f242, [%rd44]; BB0_29: mul.f32 %f149, %f241, %f241; fma.rn.f32 %f150, %f240, %f240, %f149; fma.rn.f32 %f50, %f242, %f242, %f150; setp.eq.f32 %p31, %f50, 0f00000000; mov.u16 %rs35, %rs1; @%p31 bra BB0_31; cvt.s64.s32 %rd45, %r22; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs35, [%rd46]; BB0_31: setp.gt.u16 %p32, %rs35, %rs1; cvt.u32.u16 %r76, %rs35; and.b32 %r77, %r76, 255; selp.b32 %r78, %r7, %r77, %p32; selp.b32 %r79, %r77, %r7, %p32; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd47, %r83, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f51, [%rd48]; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f52, [%rd49]; and.pred %p35, %p31, %p13; @%p35 bra BB0_35; setp.neu.f32 %p36, %f50, 0f00000000; @%p36 bra BB0_34; mul.f32 %f151, %f52, 0f3F000000; div.rn.f32 %f152, %f151, %f51; mul.f32 %f153, %f152, %f100; fma.rn.f32 %f241, %f3, %f153, %f2; mul.f32 %f154, %f2, %f153; sub.f32 %f242, %f3, %f154; mov.f32 %f240, %f1; BB0_34: mul.f32 %f155, %f100, %f100; add.f32 %f156, %f51, %f51; div.rn.f32 
%f157, %f156, %f155; sub.f32 %f158, %f240, %f1; sub.f32 %f159, %f241, %f2; sub.f32 %f160, %f242, %f3; fma.rn.f32 %f237, %f158, %f157, %f237; fma.rn.f32 %f161, %f159, %f157, %f238; fma.rn.f32 %f162, %f157, %f160, %f239; div.rn.f32 %f163, %f52, %f100; mul.f32 %f164, %f242, %f163; sub.f32 %f238, %f161, %f164; fma.rn.f32 %f239, %f241, %f163, %f162; BB0_35: add.s32 %r23, %r2, 1; @%p27 bra BB0_37; rem.s32 %r84, %r23, %r37; add.s32 %r85, %r84, %r37; rem.s32 %r125, %r85, %r37; bra.uni BB0_38; BB0_37: add.s32 %r86, %r37, -1; min.s32 %r125, %r23, %r86; BB0_38: add.s32 %r87, %r125, %r4; mad.lo.s32 %r27, %r87, %r36, %r1; setp.ge.s32 %p38, %r23, %r37; mov.f32 %f249, 0f00000000; and.pred %p40, %p38, %p27; mov.f32 %f250, %f249; mov.f32 %f251, %f249; @%p40 bra BB0_40; mul.wide.s32 %rd50, %r27, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f249, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f250, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f251, [%rd53]; BB0_40: mul.f32 %f168, %f250, %f250; fma.rn.f32 %f169, %f249, %f249, %f168; fma.rn.f32 %f70, %f251, %f251, %f169; setp.eq.f32 %p41, %f70, 0f00000000; mov.u16 %rs36, %rs1; @%p41 bra BB0_42; cvt.s64.s32 %rd54, %r27; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs36, [%rd55]; BB0_42: setp.gt.u16 %p42, %rs36, %rs1; cvt.u32.u16 %r88, %rs36; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p42; selp.b32 %r91, %r89, %r7, %p42; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd56, %r95, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f71, [%rd57]; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f72, [%rd58]; and.pred %p45, %p41, %p13; @%p45 bra BB0_46; setp.neu.f32 %p46, %f70, 0f00000000; @%p46 bra BB0_45; mul.f32 %f170, %f72, 0f3F000000; div.rn.f32 %f171, %f170, %f71; mul.f32 %f172, %f171, %f100; mul.f32 %f173, %f3, %f172; sub.f32 %f250, %f2, %f173; fma.rn.f32 %f251, %f2, %f172, %f3; mov.f32 %f249, %f1; BB0_45: mul.f32 %f174, %f100, %f100; add.f32 
%f175, %f71, %f71; div.rn.f32 %f176, %f175, %f174; sub.f32 %f177, %f249, %f1; sub.f32 %f178, %f250, %f2; sub.f32 %f179, %f251, %f3; fma.rn.f32 %f237, %f177, %f176, %f237; fma.rn.f32 %f180, %f178, %f176, %f238; fma.rn.f32 %f181, %f176, %f179, %f239; div.rn.f32 %f182, %f72, %f100; fma.rn.f32 %f238, %f251, %f182, %f180; mul.f32 %f183, %f250, %f182; sub.f32 %f239, %f181, %f183; BB0_46: setp.eq.s32 %p47, %r38, 1; @%p47 bra BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 %p48, %rs12, 0; add.s32 %r28, %r3, -1; @%p48 bra BB0_49; rem.s32 %r96, %r28, %r38; add.s32 %r97, %r96, %r38; rem.s32 %r126, %r97, %r38; bra.uni BB0_50; BB0_49: mov.u32 %r98, 0; max.s32 %r126, %r28, %r98; BB0_50: mad.lo.s32 %r99, %r126, %r37, %r2; mad.lo.s32 %r100, %r99, %r36, %r1; cvt.s64.s32 %rd59, %r100; mul.wide.s32 %rd60, %r100, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, %rd4, %rd60; ld.global.nc.f32 %f184, [%rd61]; ld.global.nc.f32 %f185, [%rd62]; mul.f32 %f186, %f185, %f185; fma.rn.f32 %f187, %f184, %f184, %f186; ld.global.nc.f32 %f188, [%rd63]; fma.rn.f32 %f189, %f188, %f188, %f187; setp.eq.f32 %p49, %f189, 0f00000000; selp.f32 %f190, %f1, %f184, %p49; selp.f32 %f191, %f2, %f185, %p49; selp.f32 %f192, %f3, %f188, %p49; add.s64 %rd64, %rd3, %rd59; ld.global.nc.u8 %rs26, [%rd64]; setp.gt.u16 %p50, %rs26, %rs1; cvt.u32.u16 %r101, %rs26; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p50; selp.b32 %r104, %r102, %r7, %p50; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd65, %r108, 4; add.s64 %rd66, %rd2, %rd65; ld.global.nc.f32 %f193, [%rd66]; add.f32 %f194, %f193, %f193; mul.f32 %f84, %f101, %f101; div.rn.f32 %f195, %f194, %f84; sub.f32 %f196, %f190, %f1; sub.f32 %f197, %f191, %f2; sub.f32 %f198, %f192, %f3; fma.rn.f32 %f85, %f195, %f196, %f237; fma.rn.f32 %f86, %f195, %f197, %f238; fma.rn.f32 %f87, %f195, %f198, %f239; add.s32 %r32, %r3, 1; @%p48 bra BB0_52; rem.s32 %r109, %r32, %r38; 
add.s32 %r110, %r109, %r38; rem.s32 %r127, %r110, %r38; bra.uni BB0_53; BB0_52: add.s32 %r111, %r38, -1; min.s32 %r127, %r32, %r111; BB0_53: mad.lo.s32 %r112, %r127, %r37, %r2; mad.lo.s32 %r113, %r112, %r36, %r1; cvt.s64.s32 %rd67, %r113; mul.wide.s32 %rd68, %r113, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f199, [%rd69]; ld.global.nc.f32 %f200, [%rd70]; mul.f32 %f201, %f200, %f200; fma.rn.f32 %f202, %f199, %f199, %f201; ld.global.nc.f32 %f203, [%rd71]; fma.rn.f32 %f204, %f203, %f203, %f202; setp.eq.f32 %p52, %f204, 0f00000000; selp.f32 %f205, %f3, %f203, %p52; selp.f32 %f206, %f2, %f200, %p52; selp.f32 %f207, %f1, %f199, %p52; add.s64 %rd72, %rd3, %rd67; ld.global.nc.u8 %rs30, [%rd72]; setp.gt.u16 %p53, %rs30, %rs1; cvt.u32.u16 %r114, %rs30; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p53; selp.b32 %r117, %r115, %r7, %p53; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd73, %r121, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f208, [%rd74]; add.f32 %f209, %f208, %f208; div.rn.f32 %f210, %f209, %f84; sub.f32 %f211, %f207, %f1; sub.f32 %f212, %f206, %f2; sub.f32 %f213, %f205, %f3; fma.rn.f32 %f237, %f210, %f211, %f85; fma.rn.f32 %f238, %f210, %f212, %f86; fma.rn.f32 %f239, %f210, %f213, %f87; BB0_54: setp.eq.s64 %p54, %rd10, 0; @%p54 bra BB0_56; cvta.to.global.u64 %rd75, %rd10; add.s64 %rd77, %rd75, %rd17; ld.global.nc.f32 %f214, [%rd77]; mul.f32 %f261, %f214, %f261; BB0_56: setp.eq.f32 %p55, %f261, 0f00000000; mov.f32 %f262, 0f00000000; @%p55 bra BB0_58; rcp.rn.f32 %f262, %f261; BB0_58: cvta.to.global.u64 %rd78, %rd9; cvta.to.global.u64 %rd79, %rd8; cvta.to.global.u64 %rd80, %rd7; add.s64 %rd82, %rd80, %rd17; ld.global.f32 %f216, [%rd82]; fma.rn.f32 %f217, %f237, %f262, %f216; st.global.f32 [%rd82], %f217; add.s64 %rd83, %rd79, %rd17; ld.global.f32 %f218, [%rd83]; fma.rn.f32 %f219, %f238, %f262, %f218; 
st.global.f32 [%rd83], %f219; add.s64 %rd84, %rd78, %rd17; ld.global.f32 %f220, [%rd84]; fma.rn.f32 %f221, %f239, %f262, %f220; st.global.f32 [%rd84], %f221; BB0_59: ret; } ` adddmi_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<56>; .reg .b16 %rs<37>; .reg .f32 %f<263>; .reg .b32 %r<128>; .reg .b64 %rd<85>; ld.param.u64 %rd7, [adddmi_param_0]; ld.param.u64 %rd8, [adddmi_param_1]; ld.param.u64 %rd9, [adddmi_param_2]; ld.param.u64 %rd11, [adddmi_param_3]; ld.param.u64 %rd12, [adddmi_param_4]; ld.param.u64 %rd13, [adddmi_param_5]; ld.param.u64 %rd10, [adddmi_param_6]; ld.param.f32 %f261, [adddmi_param_7]; ld.param.u64 %rd14, [adddmi_param_8]; ld.param.u64 %rd15, [adddmi_param_9]; ld.param.u64 %rd16, [adddmi_param_10]; ld.param.f32 %f99, [adddmi_param_11]; ld.param.f32 %f100, [adddmi_param_12]; ld.param.f32 %f101, [adddmi_param_13]; ld.param.u32 %r36, [adddmi_param_14]; ld.param.u32 %r37, [adddmi_param_15]; ld.param.u32 %r38, [adddmi_param_16]; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 %rs13, [adddmi_param_17]; cvta.to.global.u64 %rd1, %rd15; cvta.to.global.u64 %rd2, %rd14; cvta.to.global.u64 %rd3, %rd16; cvta.to.global.u64 %rd4, %rd13; cvta.to.global.u64 %rd5, %rd12; cvta.to.global.u64 %rd6, %rd11; mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, %r39, %r40, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, 
%tid.y; mad.lo.s32 %r2, %r42, %r43, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r45, %r46, %r47; setp.ge.s32 %p1, %r2, %r37; setp.ge.s32 %p2, %r1, %r36; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra BB0_59; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; mul.lo.s32 %r5, %r48, %r36; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd17, %r6, 4; add.s64 %rd18, %rd6, %rd17; cvt.s64.s32 %rd19, %r6; add.s64 %rd20, %rd5, %rd17; add.s64 %rd21, %rd4, %rd17; add.s64 %rd22, %rd3, %rd19; ld.global.nc.u8 %rs1, [%rd22]; cvt.u32.u16 %r49, %rs1; and.b32 %r7, %r49, 255; ld.global.nc.f32 %f1, [%rd18]; ld.global.nc.f32 %f2, [%rd20]; mul.f32 %f102, %f2, %f2; fma.rn.f32 %f103, %f1, %f1, %f102; ld.global.nc.f32 %f3, [%rd21]; fma.rn.f32 %f104, %f3, %f3, %f103; setp.eq.f32 %p6, %f104, 0f00000000; @%p6 bra BB0_59; and.b16 %rs2, %rs13, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r50, %r8, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r122, %r51, %r36; bra.uni BB0_5; BB0_4: mov.u32 %r52, 0; max.s32 %r122, %r8, %r52; BB0_5: add.s32 %r12, %r122, %r5; setp.lt.s32 %p9, %r8, 0; mov.f32 %f225, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f226, %f225; mov.f32 %f227, %f225; @%p10 bra BB0_7; mul.wide.s32 %rd23, %r12, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f225, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f226, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f227, [%rd26]; BB0_7: mul.f32 %f108, %f226, %f226; fma.rn.f32 %f109, %f225, %f225, %f108; fma.rn.f32 %f10, %f227, %f227, %f109; setp.eq.f32 %p11, %f10, 0f00000000; mov.u16 %rs33, %rs1; @%p11 bra BB0_9; cvt.s64.s32 %rd27, %r12; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs33, [%rd28]; BB0_9: setp.gt.u16 %p12, %rs33, %rs1; cvt.u32.u16 %r53, %rs33; and.b32 %r54, %r53, 255; selp.b32 %r55, %r7, %r54, %p12; selp.b32 %r56, %r54, %r7, %p12; add.s32 %r57, %r56, 1; mul.lo.s32 %r58, %r57, %r56; shr.u32 %r59, %r58, 1; add.s32 
%r60, %r59, %r55; mul.wide.s32 %rd29, %r60, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f12, [%rd31]; setp.ne.s16 %p13, %rs14, 0; mov.f32 %f237, 0f00000000; and.pred %p15, %p11, %p13; mov.f32 %f238, %f237; mov.f32 %f239, %f237; @%p15 bra BB0_13; setp.neu.f32 %p16, %f10, 0f00000000; @%p16 bra BB0_12; mul.f32 %f113, %f12, 0f3F000000; div.rn.f32 %f114, %f113, %f11; mul.f32 %f115, %f114, %f99; fma.rn.f32 %f225, %f3, %f115, %f1; mul.f32 %f116, %f1, %f115; sub.f32 %f227, %f3, %f116; mov.f32 %f226, %f2; BB0_12: mul.f32 %f117, %f99, %f99; add.f32 %f118, %f11, %f11; div.rn.f32 %f119, %f118, %f117; sub.f32 %f120, %f225, %f1; sub.f32 %f121, %f226, %f2; sub.f32 %f122, %f227, %f3; fma.rn.f32 %f123, %f120, %f119, 0f00000000; fma.rn.f32 %f238, %f121, %f119, 0f00000000; fma.rn.f32 %f124, %f119, %f122, 0f00000000; div.rn.f32 %f125, %f12, %f99; mul.f32 %f126, %f227, %f125; sub.f32 %f237, %f123, %f126; fma.rn.f32 %f239, %f225, %f125, %f124; BB0_13: add.s32 %r13, %r1, 1; @%p7 bra BB0_15; rem.s32 %r61, %r13, %r36; add.s32 %r62, %r61, %r36; rem.s32 %r123, %r62, %r36; bra.uni BB0_16; BB0_15: add.s32 %r63, %r36, -1; min.s32 %r123, %r13, %r63; BB0_16: add.s32 %r17, %r123, %r5; setp.ge.s32 %p18, %r13, %r36; mov.f32 %f231, 0f00000000; and.pred %p20, %p18, %p7; mov.f32 %f232, %f231; mov.f32 %f233, %f231; @%p20 bra BB0_18; mul.wide.s32 %rd32, %r17, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f231, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f232, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f233, [%rd35]; BB0_18: mul.f32 %f130, %f232, %f232; fma.rn.f32 %f131, %f231, %f231, %f130; fma.rn.f32 %f30, %f233, %f233, %f131; setp.eq.f32 %p21, %f30, 0f00000000; mov.u16 %rs34, %rs1; @%p21 bra BB0_20; cvt.s64.s32 %rd36, %r17; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs34, [%rd37]; BB0_20: setp.gt.u16 %p22, %rs34, %rs1; cvt.u32.u16 %r64, %rs34; and.b32 %r65, %r64, 255; selp.b32 %r66, %r7, %r65, %p22; selp.b32 
%r67, %r65, %r7, %p22; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; add.s32 %r71, %r70, %r66; mul.wide.s32 %rd38, %r71, 4; add.s64 %rd39, %rd2, %rd38; ld.global.nc.f32 %f31, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f32, [%rd40]; and.pred %p25, %p21, %p13; @%p25 bra BB0_24; setp.neu.f32 %p26, %f30, 0f00000000; @%p26 bra BB0_23; mul.f32 %f132, %f32, 0f3F000000; div.rn.f32 %f133, %f132, %f31; mul.f32 %f134, %f133, %f99; mul.f32 %f135, %f3, %f134; sub.f32 %f231, %f1, %f135; fma.rn.f32 %f233, %f1, %f134, %f3; mov.f32 %f232, %f2; BB0_23: mul.f32 %f136, %f99, %f99; add.f32 %f137, %f31, %f31; div.rn.f32 %f138, %f137, %f136; sub.f32 %f139, %f231, %f1; sub.f32 %f140, %f232, %f2; sub.f32 %f141, %f233, %f3; fma.rn.f32 %f142, %f139, %f138, %f237; fma.rn.f32 %f238, %f140, %f138, %f238; fma.rn.f32 %f143, %f138, %f141, %f239; div.rn.f32 %f144, %f32, %f99; fma.rn.f32 %f237, %f233, %f144, %f142; mul.f32 %f145, %f231, %f144; sub.f32 %f239, %f143, %f145; BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p27, %rs7, 0; add.s32 %r18, %r2, -1; @%p27 bra BB0_26; rem.s32 %r72, %r18, %r37; add.s32 %r73, %r72, %r37; rem.s32 %r124, %r73, %r37; bra.uni BB0_27; BB0_26: mov.u32 %r74, 0; max.s32 %r124, %r18, %r74; BB0_27: add.s32 %r75, %r124, %r4; mad.lo.s32 %r22, %r75, %r36, %r1; setp.lt.s32 %p29, %r18, 0; mov.f32 %f240, 0f00000000; and.pred %p30, %p29, %p27; mov.f32 %f241, %f240; mov.f32 %f242, %f240; @%p30 bra BB0_29; mul.wide.s32 %rd41, %r22, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f240, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f241, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f242, [%rd44]; BB0_29: mul.f32 %f149, %f241, %f241; fma.rn.f32 %f150, %f240, %f240, %f149; fma.rn.f32 %f50, %f242, %f242, %f150; setp.eq.f32 %p31, %f50, 0f00000000; mov.u16 %rs35, %rs1; @%p31 bra BB0_31; cvt.s64.s32 %rd45, %r22; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs35, [%rd46]; BB0_31: setp.gt.u16 %p32, %rs35, %rs1; cvt.u32.u16 %r76, 
%rs35; and.b32 %r77, %r76, 255; selp.b32 %r78, %r7, %r77, %p32; selp.b32 %r79, %r77, %r7, %p32; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd47, %r83, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f51, [%rd48]; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f52, [%rd49]; and.pred %p35, %p31, %p13; @%p35 bra BB0_35; setp.neu.f32 %p36, %f50, 0f00000000; @%p36 bra BB0_34; mul.f32 %f151, %f52, 0f3F000000; div.rn.f32 %f152, %f151, %f51; mul.f32 %f153, %f152, %f100; fma.rn.f32 %f241, %f3, %f153, %f2; mul.f32 %f154, %f2, %f153; sub.f32 %f242, %f3, %f154; mov.f32 %f240, %f1; BB0_34: mul.f32 %f155, %f100, %f100; add.f32 %f156, %f51, %f51; div.rn.f32 %f157, %f156, %f155; sub.f32 %f158, %f240, %f1; sub.f32 %f159, %f241, %f2; sub.f32 %f160, %f242, %f3; fma.rn.f32 %f237, %f158, %f157, %f237; fma.rn.f32 %f161, %f159, %f157, %f238; fma.rn.f32 %f162, %f157, %f160, %f239; div.rn.f32 %f163, %f52, %f100; mul.f32 %f164, %f242, %f163; sub.f32 %f238, %f161, %f164; fma.rn.f32 %f239, %f241, %f163, %f162; BB0_35: add.s32 %r23, %r2, 1; @%p27 bra BB0_37; rem.s32 %r84, %r23, %r37; add.s32 %r85, %r84, %r37; rem.s32 %r125, %r85, %r37; bra.uni BB0_38; BB0_37: add.s32 %r86, %r37, -1; min.s32 %r125, %r23, %r86; BB0_38: add.s32 %r87, %r125, %r4; mad.lo.s32 %r27, %r87, %r36, %r1; setp.ge.s32 %p38, %r23, %r37; mov.f32 %f249, 0f00000000; and.pred %p40, %p38, %p27; mov.f32 %f250, %f249; mov.f32 %f251, %f249; @%p40 bra BB0_40; mul.wide.s32 %rd50, %r27, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f249, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f250, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f251, [%rd53]; BB0_40: mul.f32 %f168, %f250, %f250; fma.rn.f32 %f169, %f249, %f249, %f168; fma.rn.f32 %f70, %f251, %f251, %f169; setp.eq.f32 %p41, %f70, 0f00000000; mov.u16 %rs36, %rs1; @%p41 bra BB0_42; cvt.s64.s32 %rd54, %r27; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs36, [%rd55]; BB0_42: setp.gt.u16 %p42, 
%rs36, %rs1; cvt.u32.u16 %r88, %rs36; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p42; selp.b32 %r91, %r89, %r7, %p42; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd56, %r95, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f71, [%rd57]; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f72, [%rd58]; and.pred %p45, %p41, %p13; @%p45 bra BB0_46; setp.neu.f32 %p46, %f70, 0f00000000; @%p46 bra BB0_45; mul.f32 %f170, %f72, 0f3F000000; div.rn.f32 %f171, %f170, %f71; mul.f32 %f172, %f171, %f100; mul.f32 %f173, %f3, %f172; sub.f32 %f250, %f2, %f173; fma.rn.f32 %f251, %f2, %f172, %f3; mov.f32 %f249, %f1; BB0_45: mul.f32 %f174, %f100, %f100; add.f32 %f175, %f71, %f71; div.rn.f32 %f176, %f175, %f174; sub.f32 %f177, %f249, %f1; sub.f32 %f178, %f250, %f2; sub.f32 %f179, %f251, %f3; fma.rn.f32 %f237, %f177, %f176, %f237; fma.rn.f32 %f180, %f178, %f176, %f238; fma.rn.f32 %f181, %f176, %f179, %f239; div.rn.f32 %f182, %f72, %f100; fma.rn.f32 %f238, %f251, %f182, %f180; mul.f32 %f183, %f250, %f182; sub.f32 %f239, %f181, %f183; BB0_46: setp.eq.s32 %p47, %r38, 1; @%p47 bra BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 %p48, %rs12, 0; add.s32 %r28, %r3, -1; @%p48 bra BB0_49; rem.s32 %r96, %r28, %r38; add.s32 %r97, %r96, %r38; rem.s32 %r126, %r97, %r38; bra.uni BB0_50; BB0_49: mov.u32 %r98, 0; max.s32 %r126, %r28, %r98; BB0_50: mad.lo.s32 %r99, %r126, %r37, %r2; mad.lo.s32 %r100, %r99, %r36, %r1; cvt.s64.s32 %rd59, %r100; mul.wide.s32 %rd60, %r100, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, %rd4, %rd60; ld.global.nc.f32 %f184, [%rd61]; ld.global.nc.f32 %f185, [%rd62]; mul.f32 %f186, %f185, %f185; fma.rn.f32 %f187, %f184, %f184, %f186; ld.global.nc.f32 %f188, [%rd63]; fma.rn.f32 %f189, %f188, %f188, %f187; setp.eq.f32 %p49, %f189, 0f00000000; selp.f32 %f190, %f1, %f184, %p49; selp.f32 %f191, %f2, %f185, %p49; selp.f32 %f192, %f3, %f188, %p49; add.s64 %rd64, %rd3, %rd59; 
ld.global.nc.u8 %rs26, [%rd64]; setp.gt.u16 %p50, %rs26, %rs1; cvt.u32.u16 %r101, %rs26; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p50; selp.b32 %r104, %r102, %r7, %p50; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd65, %r108, 4; add.s64 %rd66, %rd2, %rd65; ld.global.nc.f32 %f193, [%rd66]; add.f32 %f194, %f193, %f193; mul.f32 %f84, %f101, %f101; div.rn.f32 %f195, %f194, %f84; sub.f32 %f196, %f190, %f1; sub.f32 %f197, %f191, %f2; sub.f32 %f198, %f192, %f3; fma.rn.f32 %f85, %f195, %f196, %f237; fma.rn.f32 %f86, %f195, %f197, %f238; fma.rn.f32 %f87, %f195, %f198, %f239; add.s32 %r32, %r3, 1; @%p48 bra BB0_52; rem.s32 %r109, %r32, %r38; add.s32 %r110, %r109, %r38; rem.s32 %r127, %r110, %r38; bra.uni BB0_53; BB0_52: add.s32 %r111, %r38, -1; min.s32 %r127, %r32, %r111; BB0_53: mad.lo.s32 %r112, %r127, %r37, %r2; mad.lo.s32 %r113, %r112, %r36, %r1; cvt.s64.s32 %rd67, %r113; mul.wide.s32 %rd68, %r113, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f199, [%rd69]; ld.global.nc.f32 %f200, [%rd70]; mul.f32 %f201, %f200, %f200; fma.rn.f32 %f202, %f199, %f199, %f201; ld.global.nc.f32 %f203, [%rd71]; fma.rn.f32 %f204, %f203, %f203, %f202; setp.eq.f32 %p52, %f204, 0f00000000; selp.f32 %f205, %f3, %f203, %p52; selp.f32 %f206, %f2, %f200, %p52; selp.f32 %f207, %f1, %f199, %p52; add.s64 %rd72, %rd3, %rd67; ld.global.nc.u8 %rs30, [%rd72]; setp.gt.u16 %p53, %rs30, %rs1; cvt.u32.u16 %r114, %rs30; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p53; selp.b32 %r117, %r115, %r7, %p53; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd73, %r121, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f208, [%rd74]; add.f32 %f209, %f208, %f208; div.rn.f32 %f210, %f209, %f84; sub.f32 %f211, %f207, %f1; sub.f32 %f212, %f206, %f2; sub.f32 %f213, %f205, %f3; fma.rn.f32 
%f237, %f210, %f211, %f85; fma.rn.f32 %f238, %f210, %f212, %f86; fma.rn.f32 %f239, %f210, %f213, %f87; BB0_54: setp.eq.s64 %p54, %rd10, 0; @%p54 bra BB0_56; cvta.to.global.u64 %rd75, %rd10; add.s64 %rd77, %rd75, %rd17; ld.global.nc.f32 %f214, [%rd77]; mul.f32 %f261, %f214, %f261; BB0_56: setp.eq.f32 %p55, %f261, 0f00000000; mov.f32 %f262, 0f00000000; @%p55 bra BB0_58; rcp.rn.f32 %f262, %f261; BB0_58: cvta.to.global.u64 %rd78, %rd9; cvta.to.global.u64 %rd79, %rd8; cvta.to.global.u64 %rd80, %rd7; add.s64 %rd82, %rd80, %rd17; ld.global.f32 %f216, [%rd82]; fma.rn.f32 %f217, %f237, %f262, %f216; st.global.f32 [%rd82], %f217; add.s64 %rd83, %rd79, %rd17; ld.global.f32 %f218, [%rd83]; fma.rn.f32 %f219, %f238, %f262, %f218; st.global.f32 [%rd83], %f219; add.s64 %rd84, %rd78, %rd17; ld.global.f32 %f220, [%rd84]; fma.rn.f32 %f221, %f239, %f262, %f220; st.global.f32 [%rd84], %f221; BB0_59: ret; } ` adddmi_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<56>; .reg .b16 %rs<37>; .reg .f32 %f<263>; .reg .b32 %r<128>; .reg .b64 %rd<85>; ld.param.u64 %rd7, [adddmi_param_0]; ld.param.u64 %rd8, [adddmi_param_1]; ld.param.u64 %rd9, [adddmi_param_2]; ld.param.u64 %rd11, [adddmi_param_3]; ld.param.u64 %rd12, [adddmi_param_4]; ld.param.u64 %rd13, [adddmi_param_5]; ld.param.u64 %rd10, [adddmi_param_6]; ld.param.f32 %f261, [adddmi_param_7]; ld.param.u64 %rd14, [adddmi_param_8]; ld.param.u64 %rd15, 
[adddmi_param_9]; ld.param.u64 %rd16, [adddmi_param_10]; ld.param.f32 %f99, [adddmi_param_11]; ld.param.f32 %f100, [adddmi_param_12]; ld.param.f32 %f101, [adddmi_param_13]; ld.param.u32 %r36, [adddmi_param_14]; ld.param.u32 %r37, [adddmi_param_15]; ld.param.u32 %r38, [adddmi_param_16]; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 %rs13, [adddmi_param_17]; cvta.to.global.u64 %rd1, %rd15; cvta.to.global.u64 %rd2, %rd14; cvta.to.global.u64 %rd3, %rd16; cvta.to.global.u64 %rd4, %rd13; cvta.to.global.u64 %rd5, %rd12; cvta.to.global.u64 %rd6, %rd11; mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, %r39, %r40, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, %tid.y; mad.lo.s32 %r2, %r42, %r43, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r45, %r46, %r47; setp.ge.s32 %p1, %r2, %r37; setp.ge.s32 %p2, %r1, %r36; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra BB0_59; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; mul.lo.s32 %r5, %r48, %r36; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd17, %r6, 4; add.s64 %rd18, %rd6, %rd17; cvt.s64.s32 %rd19, %r6; add.s64 %rd20, %rd5, %rd17; add.s64 %rd21, %rd4, %rd17; add.s64 %rd22, %rd3, %rd19; ld.global.nc.u8 %rs1, [%rd22]; cvt.u32.u16 %r49, %rs1; and.b32 %r7, %r49, 255; ld.global.nc.f32 %f1, [%rd18]; ld.global.nc.f32 %f2, [%rd20]; mul.f32 %f102, %f2, %f2; fma.rn.f32 %f103, %f1, %f1, %f102; ld.global.nc.f32 %f3, [%rd21]; fma.rn.f32 %f104, %f3, %f3, %f103; setp.eq.f32 %p6, %f104, 0f00000000; @%p6 bra BB0_59; and.b16 %rs2, %rs13, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r50, %r8, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r122, %r51, %r36; bra.uni BB0_5; BB0_4: mov.u32 %r52, 0; max.s32 %r122, %r8, %r52; BB0_5: add.s32 %r12, %r122, %r5; setp.lt.s32 %p9, %r8, 0; mov.f32 %f225, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f226, %f225; mov.f32 %f227, %f225; @%p10 bra BB0_7; 
mul.wide.s32 %rd23, %r12, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f225, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f226, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f227, [%rd26]; BB0_7: mul.f32 %f108, %f226, %f226; fma.rn.f32 %f109, %f225, %f225, %f108; fma.rn.f32 %f10, %f227, %f227, %f109; setp.eq.f32 %p11, %f10, 0f00000000; mov.u16 %rs33, %rs1; @%p11 bra BB0_9; cvt.s64.s32 %rd27, %r12; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs33, [%rd28]; BB0_9: setp.gt.u16 %p12, %rs33, %rs1; cvt.u32.u16 %r53, %rs33; and.b32 %r54, %r53, 255; selp.b32 %r55, %r7, %r54, %p12; selp.b32 %r56, %r54, %r7, %p12; add.s32 %r57, %r56, 1; mul.lo.s32 %r58, %r57, %r56; shr.u32 %r59, %r58, 1; add.s32 %r60, %r59, %r55; mul.wide.s32 %rd29, %r60, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f12, [%rd31]; setp.ne.s16 %p13, %rs14, 0; mov.f32 %f237, 0f00000000; and.pred %p15, %p11, %p13; mov.f32 %f238, %f237; mov.f32 %f239, %f237; @%p15 bra BB0_13; setp.neu.f32 %p16, %f10, 0f00000000; @%p16 bra BB0_12; mul.f32 %f113, %f12, 0f3F000000; div.rn.f32 %f114, %f113, %f11; mul.f32 %f115, %f114, %f99; fma.rn.f32 %f225, %f3, %f115, %f1; mul.f32 %f116, %f1, %f115; sub.f32 %f227, %f3, %f116; mov.f32 %f226, %f2; BB0_12: mul.f32 %f117, %f99, %f99; add.f32 %f118, %f11, %f11; div.rn.f32 %f119, %f118, %f117; sub.f32 %f120, %f225, %f1; sub.f32 %f121, %f226, %f2; sub.f32 %f122, %f227, %f3; fma.rn.f32 %f123, %f120, %f119, 0f00000000; fma.rn.f32 %f238, %f121, %f119, 0f00000000; fma.rn.f32 %f124, %f119, %f122, 0f00000000; div.rn.f32 %f125, %f12, %f99; mul.f32 %f126, %f227, %f125; sub.f32 %f237, %f123, %f126; fma.rn.f32 %f239, %f225, %f125, %f124; BB0_13: add.s32 %r13, %r1, 1; @%p7 bra BB0_15; rem.s32 %r61, %r13, %r36; add.s32 %r62, %r61, %r36; rem.s32 %r123, %r62, %r36; bra.uni BB0_16; BB0_15: add.s32 %r63, %r36, -1; min.s32 %r123, %r13, %r63; BB0_16: add.s32 %r17, %r123, %r5; setp.ge.s32 %p18, %r13, %r36; mov.f32 %f231, 
0f00000000; and.pred %p20, %p18, %p7; mov.f32 %f232, %f231; mov.f32 %f233, %f231; @%p20 bra BB0_18; mul.wide.s32 %rd32, %r17, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f231, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f232, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f233, [%rd35]; BB0_18: mul.f32 %f130, %f232, %f232; fma.rn.f32 %f131, %f231, %f231, %f130; fma.rn.f32 %f30, %f233, %f233, %f131; setp.eq.f32 %p21, %f30, 0f00000000; mov.u16 %rs34, %rs1; @%p21 bra BB0_20; cvt.s64.s32 %rd36, %r17; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs34, [%rd37]; BB0_20: setp.gt.u16 %p22, %rs34, %rs1; cvt.u32.u16 %r64, %rs34; and.b32 %r65, %r64, 255; selp.b32 %r66, %r7, %r65, %p22; selp.b32 %r67, %r65, %r7, %p22; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; add.s32 %r71, %r70, %r66; mul.wide.s32 %rd38, %r71, 4; add.s64 %rd39, %rd2, %rd38; ld.global.nc.f32 %f31, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f32, [%rd40]; and.pred %p25, %p21, %p13; @%p25 bra BB0_24; setp.neu.f32 %p26, %f30, 0f00000000; @%p26 bra BB0_23; mul.f32 %f132, %f32, 0f3F000000; div.rn.f32 %f133, %f132, %f31; mul.f32 %f134, %f133, %f99; mul.f32 %f135, %f3, %f134; sub.f32 %f231, %f1, %f135; fma.rn.f32 %f233, %f1, %f134, %f3; mov.f32 %f232, %f2; BB0_23: mul.f32 %f136, %f99, %f99; add.f32 %f137, %f31, %f31; div.rn.f32 %f138, %f137, %f136; sub.f32 %f139, %f231, %f1; sub.f32 %f140, %f232, %f2; sub.f32 %f141, %f233, %f3; fma.rn.f32 %f142, %f139, %f138, %f237; fma.rn.f32 %f238, %f140, %f138, %f238; fma.rn.f32 %f143, %f138, %f141, %f239; div.rn.f32 %f144, %f32, %f99; fma.rn.f32 %f237, %f233, %f144, %f142; mul.f32 %f145, %f231, %f144; sub.f32 %f239, %f143, %f145; BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p27, %rs7, 0; add.s32 %r18, %r2, -1; @%p27 bra BB0_26; rem.s32 %r72, %r18, %r37; add.s32 %r73, %r72, %r37; rem.s32 %r124, %r73, %r37; bra.uni BB0_27; BB0_26: mov.u32 %r74, 0; max.s32 %r124, %r18, %r74; BB0_27: add.s32 %r75, %r124, %r4; 
mad.lo.s32 %r22, %r75, %r36, %r1; setp.lt.s32 %p29, %r18, 0; mov.f32 %f240, 0f00000000; and.pred %p30, %p29, %p27; mov.f32 %f241, %f240; mov.f32 %f242, %f240; @%p30 bra BB0_29; mul.wide.s32 %rd41, %r22, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f240, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f241, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f242, [%rd44]; BB0_29: mul.f32 %f149, %f241, %f241; fma.rn.f32 %f150, %f240, %f240, %f149; fma.rn.f32 %f50, %f242, %f242, %f150; setp.eq.f32 %p31, %f50, 0f00000000; mov.u16 %rs35, %rs1; @%p31 bra BB0_31; cvt.s64.s32 %rd45, %r22; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs35, [%rd46]; BB0_31: setp.gt.u16 %p32, %rs35, %rs1; cvt.u32.u16 %r76, %rs35; and.b32 %r77, %r76, 255; selp.b32 %r78, %r7, %r77, %p32; selp.b32 %r79, %r77, %r7, %p32; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd47, %r83, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f51, [%rd48]; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f52, [%rd49]; and.pred %p35, %p31, %p13; @%p35 bra BB0_35; setp.neu.f32 %p36, %f50, 0f00000000; @%p36 bra BB0_34; mul.f32 %f151, %f52, 0f3F000000; div.rn.f32 %f152, %f151, %f51; mul.f32 %f153, %f152, %f100; fma.rn.f32 %f241, %f3, %f153, %f2; mul.f32 %f154, %f2, %f153; sub.f32 %f242, %f3, %f154; mov.f32 %f240, %f1; BB0_34: mul.f32 %f155, %f100, %f100; add.f32 %f156, %f51, %f51; div.rn.f32 %f157, %f156, %f155; sub.f32 %f158, %f240, %f1; sub.f32 %f159, %f241, %f2; sub.f32 %f160, %f242, %f3; fma.rn.f32 %f237, %f158, %f157, %f237; fma.rn.f32 %f161, %f159, %f157, %f238; fma.rn.f32 %f162, %f157, %f160, %f239; div.rn.f32 %f163, %f52, %f100; mul.f32 %f164, %f242, %f163; sub.f32 %f238, %f161, %f164; fma.rn.f32 %f239, %f241, %f163, %f162; BB0_35: add.s32 %r23, %r2, 1; @%p27 bra BB0_37; rem.s32 %r84, %r23, %r37; add.s32 %r85, %r84, %r37; rem.s32 %r125, %r85, %r37; bra.uni BB0_38; BB0_37: add.s32 %r86, %r37, -1; min.s32 %r125, %r23, %r86; BB0_38: 
add.s32 %r87, %r125, %r4; mad.lo.s32 %r27, %r87, %r36, %r1; setp.ge.s32 %p38, %r23, %r37; mov.f32 %f249, 0f00000000; and.pred %p40, %p38, %p27; mov.f32 %f250, %f249; mov.f32 %f251, %f249; @%p40 bra BB0_40; mul.wide.s32 %rd50, %r27, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f249, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f250, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f251, [%rd53]; BB0_40: mul.f32 %f168, %f250, %f250; fma.rn.f32 %f169, %f249, %f249, %f168; fma.rn.f32 %f70, %f251, %f251, %f169; setp.eq.f32 %p41, %f70, 0f00000000; mov.u16 %rs36, %rs1; @%p41 bra BB0_42; cvt.s64.s32 %rd54, %r27; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs36, [%rd55]; BB0_42: setp.gt.u16 %p42, %rs36, %rs1; cvt.u32.u16 %r88, %rs36; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p42; selp.b32 %r91, %r89, %r7, %p42; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd56, %r95, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f71, [%rd57]; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f72, [%rd58]; and.pred %p45, %p41, %p13; @%p45 bra BB0_46; setp.neu.f32 %p46, %f70, 0f00000000; @%p46 bra BB0_45; mul.f32 %f170, %f72, 0f3F000000; div.rn.f32 %f171, %f170, %f71; mul.f32 %f172, %f171, %f100; mul.f32 %f173, %f3, %f172; sub.f32 %f250, %f2, %f173; fma.rn.f32 %f251, %f2, %f172, %f3; mov.f32 %f249, %f1; BB0_45: mul.f32 %f174, %f100, %f100; add.f32 %f175, %f71, %f71; div.rn.f32 %f176, %f175, %f174; sub.f32 %f177, %f249, %f1; sub.f32 %f178, %f250, %f2; sub.f32 %f179, %f251, %f3; fma.rn.f32 %f237, %f177, %f176, %f237; fma.rn.f32 %f180, %f178, %f176, %f238; fma.rn.f32 %f181, %f176, %f179, %f239; div.rn.f32 %f182, %f72, %f100; fma.rn.f32 %f238, %f251, %f182, %f180; mul.f32 %f183, %f250, %f182; sub.f32 %f239, %f181, %f183; BB0_46: setp.eq.s32 %p47, %r38, 1; @%p47 bra BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 %p48, %rs12, 0; add.s32 %r28, %r3, -1; @%p48 bra BB0_49; rem.s32 %r96, %r28, %r38; add.s32 
%r97, %r96, %r38; rem.s32 %r126, %r97, %r38; bra.uni BB0_50; BB0_49: mov.u32 %r98, 0; max.s32 %r126, %r28, %r98; BB0_50: mad.lo.s32 %r99, %r126, %r37, %r2; mad.lo.s32 %r100, %r99, %r36, %r1; cvt.s64.s32 %rd59, %r100; mul.wide.s32 %rd60, %r100, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, %rd4, %rd60; ld.global.nc.f32 %f184, [%rd61]; ld.global.nc.f32 %f185, [%rd62]; mul.f32 %f186, %f185, %f185; fma.rn.f32 %f187, %f184, %f184, %f186; ld.global.nc.f32 %f188, [%rd63]; fma.rn.f32 %f189, %f188, %f188, %f187; setp.eq.f32 %p49, %f189, 0f00000000; selp.f32 %f190, %f1, %f184, %p49; selp.f32 %f191, %f2, %f185, %p49; selp.f32 %f192, %f3, %f188, %p49; add.s64 %rd64, %rd3, %rd59; ld.global.nc.u8 %rs26, [%rd64]; setp.gt.u16 %p50, %rs26, %rs1; cvt.u32.u16 %r101, %rs26; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p50; selp.b32 %r104, %r102, %r7, %p50; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd65, %r108, 4; add.s64 %rd66, %rd2, %rd65; ld.global.nc.f32 %f193, [%rd66]; add.f32 %f194, %f193, %f193; mul.f32 %f84, %f101, %f101; div.rn.f32 %f195, %f194, %f84; sub.f32 %f196, %f190, %f1; sub.f32 %f197, %f191, %f2; sub.f32 %f198, %f192, %f3; fma.rn.f32 %f85, %f195, %f196, %f237; fma.rn.f32 %f86, %f195, %f197, %f238; fma.rn.f32 %f87, %f195, %f198, %f239; add.s32 %r32, %r3, 1; @%p48 bra BB0_52; rem.s32 %r109, %r32, %r38; add.s32 %r110, %r109, %r38; rem.s32 %r127, %r110, %r38; bra.uni BB0_53; BB0_52: add.s32 %r111, %r38, -1; min.s32 %r127, %r32, %r111; BB0_53: mad.lo.s32 %r112, %r127, %r37, %r2; mad.lo.s32 %r113, %r112, %r36, %r1; cvt.s64.s32 %rd67, %r113; mul.wide.s32 %rd68, %r113, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f199, [%rd69]; ld.global.nc.f32 %f200, [%rd70]; mul.f32 %f201, %f200, %f200; fma.rn.f32 %f202, %f199, %f199, %f201; ld.global.nc.f32 %f203, [%rd71]; fma.rn.f32 %f204, %f203, %f203, %f202; 
setp.eq.f32 %p52, %f204, 0f00000000; selp.f32 %f205, %f3, %f203, %p52; selp.f32 %f206, %f2, %f200, %p52; selp.f32 %f207, %f1, %f199, %p52; add.s64 %rd72, %rd3, %rd67; ld.global.nc.u8 %rs30, [%rd72]; setp.gt.u16 %p53, %rs30, %rs1; cvt.u32.u16 %r114, %rs30; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p53; selp.b32 %r117, %r115, %r7, %p53; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd73, %r121, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f208, [%rd74]; add.f32 %f209, %f208, %f208; div.rn.f32 %f210, %f209, %f84; sub.f32 %f211, %f207, %f1; sub.f32 %f212, %f206, %f2; sub.f32 %f213, %f205, %f3; fma.rn.f32 %f237, %f210, %f211, %f85; fma.rn.f32 %f238, %f210, %f212, %f86; fma.rn.f32 %f239, %f210, %f213, %f87; BB0_54: setp.eq.s64 %p54, %rd10, 0; @%p54 bra BB0_56; cvta.to.global.u64 %rd75, %rd10; add.s64 %rd77, %rd75, %rd17; ld.global.nc.f32 %f214, [%rd77]; mul.f32 %f261, %f214, %f261; BB0_56: setp.eq.f32 %p55, %f261, 0f00000000; mov.f32 %f262, 0f00000000; @%p55 bra BB0_58; rcp.rn.f32 %f262, %f261; BB0_58: cvta.to.global.u64 %rd78, %rd9; cvta.to.global.u64 %rd79, %rd8; cvta.to.global.u64 %rd80, %rd7; add.s64 %rd82, %rd80, %rd17; ld.global.f32 %f216, [%rd82]; fma.rn.f32 %f217, %f237, %f262, %f216; st.global.f32 [%rd82], %f217; add.s64 %rd83, %rd79, %rd17; ld.global.f32 %f218, [%rd83]; fma.rn.f32 %f219, %f238, %f262, %f218; st.global.f32 [%rd83], %f219; add.s64 %rd84, %rd78, %rd17; ld.global.f32 %f220, [%rd84]; fma.rn.f32 %f221, %f239, %f262, %f220; st.global.f32 [%rd84], %f221; BB0_59: ret; } ` adddmi_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param 
.u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<56>; .reg .b16 %rs<37>; .reg .f32 %f<263>; .reg .b32 %r<128>; .reg .b64 %rd<85>; ld.param.u64 %rd7, [adddmi_param_0]; ld.param.u64 %rd8, [adddmi_param_1]; ld.param.u64 %rd9, [adddmi_param_2]; ld.param.u64 %rd11, [adddmi_param_3]; ld.param.u64 %rd12, [adddmi_param_4]; ld.param.u64 %rd13, [adddmi_param_5]; ld.param.u64 %rd10, [adddmi_param_6]; ld.param.f32 %f261, [adddmi_param_7]; ld.param.u64 %rd14, [adddmi_param_8]; ld.param.u64 %rd15, [adddmi_param_9]; ld.param.u64 %rd16, [adddmi_param_10]; ld.param.f32 %f99, [adddmi_param_11]; ld.param.f32 %f100, [adddmi_param_12]; ld.param.f32 %f101, [adddmi_param_13]; ld.param.u32 %r36, [adddmi_param_14]; ld.param.u32 %r37, [adddmi_param_15]; ld.param.u32 %r38, [adddmi_param_16]; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 %rs13, [adddmi_param_17]; cvta.to.global.u64 %rd1, %rd15; cvta.to.global.u64 %rd2, %rd14; cvta.to.global.u64 %rd3, %rd16; cvta.to.global.u64 %rd4, %rd13; cvta.to.global.u64 %rd5, %rd12; cvta.to.global.u64 %rd6, %rd11; mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, %r39, %r40, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, %tid.y; mad.lo.s32 %r2, %r42, %r43, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r45, %r46, %r47; setp.ge.s32 %p1, %r2, %r37; setp.ge.s32 %p2, %r1, %r36; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra BB0_59; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; mul.lo.s32 %r5, %r48, %r36; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd17, %r6, 4; add.s64 %rd18, %rd6, %rd17; cvt.s64.s32 %rd19, %r6; add.s64 %rd20, %rd5, %rd17; add.s64 %rd21, %rd4, %rd17; add.s64 %rd22, %rd3, %rd19; 
ld.global.nc.u8 %rs1, [%rd22]; cvt.u32.u16 %r49, %rs1; and.b32 %r7, %r49, 255; ld.global.nc.f32 %f1, [%rd18]; ld.global.nc.f32 %f2, [%rd20]; mul.f32 %f102, %f2, %f2; fma.rn.f32 %f103, %f1, %f1, %f102; ld.global.nc.f32 %f3, [%rd21]; fma.rn.f32 %f104, %f3, %f3, %f103; setp.eq.f32 %p6, %f104, 0f00000000; @%p6 bra BB0_59; and.b16 %rs2, %rs13, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r50, %r8, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r122, %r51, %r36; bra.uni BB0_5; BB0_4: mov.u32 %r52, 0; max.s32 %r122, %r8, %r52; BB0_5: add.s32 %r12, %r122, %r5; setp.lt.s32 %p9, %r8, 0; mov.f32 %f225, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f226, %f225; mov.f32 %f227, %f225; @%p10 bra BB0_7; mul.wide.s32 %rd23, %r12, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f225, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f226, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f227, [%rd26]; BB0_7: mul.f32 %f108, %f226, %f226; fma.rn.f32 %f109, %f225, %f225, %f108; fma.rn.f32 %f10, %f227, %f227, %f109; setp.eq.f32 %p11, %f10, 0f00000000; mov.u16 %rs33, %rs1; @%p11 bra BB0_9; cvt.s64.s32 %rd27, %r12; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs33, [%rd28]; BB0_9: setp.gt.u16 %p12, %rs33, %rs1; cvt.u32.u16 %r53, %rs33; and.b32 %r54, %r53, 255; selp.b32 %r55, %r7, %r54, %p12; selp.b32 %r56, %r54, %r7, %p12; add.s32 %r57, %r56, 1; mul.lo.s32 %r58, %r57, %r56; shr.u32 %r59, %r58, 1; add.s32 %r60, %r59, %r55; mul.wide.s32 %rd29, %r60, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f12, [%rd31]; setp.ne.s16 %p13, %rs14, 0; mov.f32 %f237, 0f00000000; and.pred %p15, %p11, %p13; mov.f32 %f238, %f237; mov.f32 %f239, %f237; @%p15 bra BB0_13; setp.neu.f32 %p16, %f10, 0f00000000; @%p16 bra BB0_12; mul.f32 %f113, %f12, 0f3F000000; div.rn.f32 %f114, %f113, %f11; mul.f32 %f115, %f114, %f99; fma.rn.f32 %f225, %f3, %f115, %f1; mul.f32 %f116, %f1, %f115; sub.f32 %f227, %f3, %f116; mov.f32 
%f226, %f2; BB0_12: mul.f32 %f117, %f99, %f99; add.f32 %f118, %f11, %f11; div.rn.f32 %f119, %f118, %f117; sub.f32 %f120, %f225, %f1; sub.f32 %f121, %f226, %f2; sub.f32 %f122, %f227, %f3; fma.rn.f32 %f123, %f120, %f119, 0f00000000; fma.rn.f32 %f238, %f121, %f119, 0f00000000; fma.rn.f32 %f124, %f119, %f122, 0f00000000; div.rn.f32 %f125, %f12, %f99; mul.f32 %f126, %f227, %f125; sub.f32 %f237, %f123, %f126; fma.rn.f32 %f239, %f225, %f125, %f124; BB0_13: add.s32 %r13, %r1, 1; @%p7 bra BB0_15; rem.s32 %r61, %r13, %r36; add.s32 %r62, %r61, %r36; rem.s32 %r123, %r62, %r36; bra.uni BB0_16; BB0_15: add.s32 %r63, %r36, -1; min.s32 %r123, %r13, %r63; BB0_16: add.s32 %r17, %r123, %r5; setp.ge.s32 %p18, %r13, %r36; mov.f32 %f231, 0f00000000; and.pred %p20, %p18, %p7; mov.f32 %f232, %f231; mov.f32 %f233, %f231; @%p20 bra BB0_18; mul.wide.s32 %rd32, %r17, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f231, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f232, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f233, [%rd35]; BB0_18: mul.f32 %f130, %f232, %f232; fma.rn.f32 %f131, %f231, %f231, %f130; fma.rn.f32 %f30, %f233, %f233, %f131; setp.eq.f32 %p21, %f30, 0f00000000; mov.u16 %rs34, %rs1; @%p21 bra BB0_20; cvt.s64.s32 %rd36, %r17; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs34, [%rd37]; BB0_20: setp.gt.u16 %p22, %rs34, %rs1; cvt.u32.u16 %r64, %rs34; and.b32 %r65, %r64, 255; selp.b32 %r66, %r7, %r65, %p22; selp.b32 %r67, %r65, %r7, %p22; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; add.s32 %r71, %r70, %r66; mul.wide.s32 %rd38, %r71, 4; add.s64 %rd39, %rd2, %rd38; ld.global.nc.f32 %f31, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f32, [%rd40]; and.pred %p25, %p21, %p13; @%p25 bra BB0_24; setp.neu.f32 %p26, %f30, 0f00000000; @%p26 bra BB0_23; mul.f32 %f132, %f32, 0f3F000000; div.rn.f32 %f133, %f132, %f31; mul.f32 %f134, %f133, %f99; mul.f32 %f135, %f3, %f134; sub.f32 %f231, %f1, %f135; fma.rn.f32 %f233, %f1, %f134, %f3; 
mov.f32 %f232, %f2; BB0_23: mul.f32 %f136, %f99, %f99; add.f32 %f137, %f31, %f31; div.rn.f32 %f138, %f137, %f136; sub.f32 %f139, %f231, %f1; sub.f32 %f140, %f232, %f2; sub.f32 %f141, %f233, %f3; fma.rn.f32 %f142, %f139, %f138, %f237; fma.rn.f32 %f238, %f140, %f138, %f238; fma.rn.f32 %f143, %f138, %f141, %f239; div.rn.f32 %f144, %f32, %f99; fma.rn.f32 %f237, %f233, %f144, %f142; mul.f32 %f145, %f231, %f144; sub.f32 %f239, %f143, %f145; BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p27, %rs7, 0; add.s32 %r18, %r2, -1; @%p27 bra BB0_26; rem.s32 %r72, %r18, %r37; add.s32 %r73, %r72, %r37; rem.s32 %r124, %r73, %r37; bra.uni BB0_27; BB0_26: mov.u32 %r74, 0; max.s32 %r124, %r18, %r74; BB0_27: add.s32 %r75, %r124, %r4; mad.lo.s32 %r22, %r75, %r36, %r1; setp.lt.s32 %p29, %r18, 0; mov.f32 %f240, 0f00000000; and.pred %p30, %p29, %p27; mov.f32 %f241, %f240; mov.f32 %f242, %f240; @%p30 bra BB0_29; mul.wide.s32 %rd41, %r22, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f240, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f241, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f242, [%rd44]; BB0_29: mul.f32 %f149, %f241, %f241; fma.rn.f32 %f150, %f240, %f240, %f149; fma.rn.f32 %f50, %f242, %f242, %f150; setp.eq.f32 %p31, %f50, 0f00000000; mov.u16 %rs35, %rs1; @%p31 bra BB0_31; cvt.s64.s32 %rd45, %r22; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs35, [%rd46]; BB0_31: setp.gt.u16 %p32, %rs35, %rs1; cvt.u32.u16 %r76, %rs35; and.b32 %r77, %r76, 255; selp.b32 %r78, %r7, %r77, %p32; selp.b32 %r79, %r77, %r7, %p32; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd47, %r83, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f51, [%rd48]; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f52, [%rd49]; and.pred %p35, %p31, %p13; @%p35 bra BB0_35; setp.neu.f32 %p36, %f50, 0f00000000; @%p36 bra BB0_34; mul.f32 %f151, %f52, 0f3F000000; div.rn.f32 %f152, %f151, %f51; mul.f32 %f153, %f152, %f100; fma.rn.f32 %f241, 
%f3, %f153, %f2; mul.f32 %f154, %f2, %f153; sub.f32 %f242, %f3, %f154; mov.f32 %f240, %f1; BB0_34: mul.f32 %f155, %f100, %f100; add.f32 %f156, %f51, %f51; div.rn.f32 %f157, %f156, %f155; sub.f32 %f158, %f240, %f1; sub.f32 %f159, %f241, %f2; sub.f32 %f160, %f242, %f3; fma.rn.f32 %f237, %f158, %f157, %f237; fma.rn.f32 %f161, %f159, %f157, %f238; fma.rn.f32 %f162, %f157, %f160, %f239; div.rn.f32 %f163, %f52, %f100; mul.f32 %f164, %f242, %f163; sub.f32 %f238, %f161, %f164; fma.rn.f32 %f239, %f241, %f163, %f162; BB0_35: add.s32 %r23, %r2, 1; @%p27 bra BB0_37; rem.s32 %r84, %r23, %r37; add.s32 %r85, %r84, %r37; rem.s32 %r125, %r85, %r37; bra.uni BB0_38; BB0_37: add.s32 %r86, %r37, -1; min.s32 %r125, %r23, %r86; BB0_38: add.s32 %r87, %r125, %r4; mad.lo.s32 %r27, %r87, %r36, %r1; setp.ge.s32 %p38, %r23, %r37; mov.f32 %f249, 0f00000000; and.pred %p40, %p38, %p27; mov.f32 %f250, %f249; mov.f32 %f251, %f249; @%p40 bra BB0_40; mul.wide.s32 %rd50, %r27, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f249, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f250, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f251, [%rd53]; BB0_40: mul.f32 %f168, %f250, %f250; fma.rn.f32 %f169, %f249, %f249, %f168; fma.rn.f32 %f70, %f251, %f251, %f169; setp.eq.f32 %p41, %f70, 0f00000000; mov.u16 %rs36, %rs1; @%p41 bra BB0_42; cvt.s64.s32 %rd54, %r27; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs36, [%rd55]; BB0_42: setp.gt.u16 %p42, %rs36, %rs1; cvt.u32.u16 %r88, %rs36; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p42; selp.b32 %r91, %r89, %r7, %p42; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd56, %r95, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f71, [%rd57]; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f72, [%rd58]; and.pred %p45, %p41, %p13; @%p45 bra BB0_46; setp.neu.f32 %p46, %f70, 0f00000000; @%p46 bra BB0_45; mul.f32 %f170, %f72, 0f3F000000; div.rn.f32 %f171, %f170, %f71; mul.f32 %f172, 
%f171, %f100; mul.f32 %f173, %f3, %f172; sub.f32 %f250, %f2, %f173; fma.rn.f32 %f251, %f2, %f172, %f3; mov.f32 %f249, %f1; BB0_45: mul.f32 %f174, %f100, %f100; add.f32 %f175, %f71, %f71; div.rn.f32 %f176, %f175, %f174; sub.f32 %f177, %f249, %f1; sub.f32 %f178, %f250, %f2; sub.f32 %f179, %f251, %f3; fma.rn.f32 %f237, %f177, %f176, %f237; fma.rn.f32 %f180, %f178, %f176, %f238; fma.rn.f32 %f181, %f176, %f179, %f239; div.rn.f32 %f182, %f72, %f100; fma.rn.f32 %f238, %f251, %f182, %f180; mul.f32 %f183, %f250, %f182; sub.f32 %f239, %f181, %f183; BB0_46: setp.eq.s32 %p47, %r38, 1; @%p47 bra BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 %p48, %rs12, 0; add.s32 %r28, %r3, -1; @%p48 bra BB0_49; rem.s32 %r96, %r28, %r38; add.s32 %r97, %r96, %r38; rem.s32 %r126, %r97, %r38; bra.uni BB0_50; BB0_49: mov.u32 %r98, 0; max.s32 %r126, %r28, %r98; BB0_50: mad.lo.s32 %r99, %r126, %r37, %r2; mad.lo.s32 %r100, %r99, %r36, %r1; cvt.s64.s32 %rd59, %r100; mul.wide.s32 %rd60, %r100, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, %rd4, %rd60; ld.global.nc.f32 %f184, [%rd61]; ld.global.nc.f32 %f185, [%rd62]; mul.f32 %f186, %f185, %f185; fma.rn.f32 %f187, %f184, %f184, %f186; ld.global.nc.f32 %f188, [%rd63]; fma.rn.f32 %f189, %f188, %f188, %f187; setp.eq.f32 %p49, %f189, 0f00000000; selp.f32 %f190, %f1, %f184, %p49; selp.f32 %f191, %f2, %f185, %p49; selp.f32 %f192, %f3, %f188, %p49; add.s64 %rd64, %rd3, %rd59; ld.global.nc.u8 %rs26, [%rd64]; setp.gt.u16 %p50, %rs26, %rs1; cvt.u32.u16 %r101, %rs26; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p50; selp.b32 %r104, %r102, %r7, %p50; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd65, %r108, 4; add.s64 %rd66, %rd2, %rd65; ld.global.nc.f32 %f193, [%rd66]; add.f32 %f194, %f193, %f193; mul.f32 %f84, %f101, %f101; div.rn.f32 %f195, %f194, %f84; sub.f32 %f196, %f190, %f1; sub.f32 %f197, %f191, %f2; sub.f32 %f198, %f192, %f3; fma.rn.f32 
%f85, %f195, %f196, %f237; fma.rn.f32 %f86, %f195, %f197, %f238; fma.rn.f32 %f87, %f195, %f198, %f239; add.s32 %r32, %r3, 1; @%p48 bra BB0_52; rem.s32 %r109, %r32, %r38; add.s32 %r110, %r109, %r38; rem.s32 %r127, %r110, %r38; bra.uni BB0_53; BB0_52: add.s32 %r111, %r38, -1; min.s32 %r127, %r32, %r111; BB0_53: mad.lo.s32 %r112, %r127, %r37, %r2; mad.lo.s32 %r113, %r112, %r36, %r1; cvt.s64.s32 %rd67, %r113; mul.wide.s32 %rd68, %r113, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f199, [%rd69]; ld.global.nc.f32 %f200, [%rd70]; mul.f32 %f201, %f200, %f200; fma.rn.f32 %f202, %f199, %f199, %f201; ld.global.nc.f32 %f203, [%rd71]; fma.rn.f32 %f204, %f203, %f203, %f202; setp.eq.f32 %p52, %f204, 0f00000000; selp.f32 %f205, %f3, %f203, %p52; selp.f32 %f206, %f2, %f200, %p52; selp.f32 %f207, %f1, %f199, %p52; add.s64 %rd72, %rd3, %rd67; ld.global.nc.u8 %rs30, [%rd72]; setp.gt.u16 %p53, %rs30, %rs1; cvt.u32.u16 %r114, %rs30; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p53; selp.b32 %r117, %r115, %r7, %p53; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd73, %r121, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f208, [%rd74]; add.f32 %f209, %f208, %f208; div.rn.f32 %f210, %f209, %f84; sub.f32 %f211, %f207, %f1; sub.f32 %f212, %f206, %f2; sub.f32 %f213, %f205, %f3; fma.rn.f32 %f237, %f210, %f211, %f85; fma.rn.f32 %f238, %f210, %f212, %f86; fma.rn.f32 %f239, %f210, %f213, %f87; BB0_54: setp.eq.s64 %p54, %rd10, 0; @%p54 bra BB0_56; cvta.to.global.u64 %rd75, %rd10; add.s64 %rd77, %rd75, %rd17; ld.global.nc.f32 %f214, [%rd77]; mul.f32 %f261, %f214, %f261; BB0_56: setp.eq.f32 %p55, %f261, 0f00000000; mov.f32 %f262, 0f00000000; @%p55 bra BB0_58; rcp.rn.f32 %f262, %f261; BB0_58: cvta.to.global.u64 %rd78, %rd9; cvta.to.global.u64 %rd79, %rd8; cvta.to.global.u64 %rd80, %rd7; add.s64 %rd82, %rd80, %rd17; ld.global.f32 %f216, [%rd82]; 
fma.rn.f32 %f217, %f237, %f262, %f216; st.global.f32 [%rd82], %f217; add.s64 %rd83, %rd79, %rd17; ld.global.f32 %f218, [%rd83]; fma.rn.f32 %f219, %f238, %f262, %f218; st.global.f32 [%rd83], %f219; add.s64 %rd84, %rd78, %rd17; ld.global.f32 %f220, [%rd84]; fma.rn.f32 %f221, %f239, %f262, %f220; st.global.f32 [%rd84], %f221; BB0_59: ret; } ` adddmi_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<56>; .reg .b16 %rs<37>; .reg .f32 %f<263>; .reg .b32 %r<128>; .reg .b64 %rd<85>; ld.param.u64 %rd7, [adddmi_param_0]; ld.param.u64 %rd8, [adddmi_param_1]; ld.param.u64 %rd9, [adddmi_param_2]; ld.param.u64 %rd11, [adddmi_param_3]; ld.param.u64 %rd12, [adddmi_param_4]; ld.param.u64 %rd13, [adddmi_param_5]; ld.param.u64 %rd10, [adddmi_param_6]; ld.param.f32 %f261, [adddmi_param_7]; ld.param.u64 %rd14, [adddmi_param_8]; ld.param.u64 %rd15, [adddmi_param_9]; ld.param.u64 %rd16, [adddmi_param_10]; ld.param.f32 %f99, [adddmi_param_11]; ld.param.f32 %f100, [adddmi_param_12]; ld.param.f32 %f101, [adddmi_param_13]; ld.param.u32 %r36, [adddmi_param_14]; ld.param.u32 %r37, [adddmi_param_15]; ld.param.u32 %r38, [adddmi_param_16]; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 %rs13, [adddmi_param_17]; cvta.to.global.u64 %rd1, %rd15; cvta.to.global.u64 %rd2, %rd14; cvta.to.global.u64 %rd3, %rd16; cvta.to.global.u64 %rd4, %rd13; cvta.to.global.u64 %rd5, %rd12; cvta.to.global.u64 %rd6, %rd11; 
mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, %r39, %r40, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, %tid.y; mad.lo.s32 %r2, %r42, %r43, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r45, %r46, %r47; setp.ge.s32 %p1, %r2, %r37; setp.ge.s32 %p2, %r1, %r36; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra BB0_59; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; mul.lo.s32 %r5, %r48, %r36; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd17, %r6, 4; add.s64 %rd18, %rd6, %rd17; cvt.s64.s32 %rd19, %r6; add.s64 %rd20, %rd5, %rd17; add.s64 %rd21, %rd4, %rd17; add.s64 %rd22, %rd3, %rd19; ld.global.nc.u8 %rs1, [%rd22]; cvt.u32.u16 %r49, %rs1; and.b32 %r7, %r49, 255; ld.global.nc.f32 %f1, [%rd18]; ld.global.nc.f32 %f2, [%rd20]; mul.f32 %f102, %f2, %f2; fma.rn.f32 %f103, %f1, %f1, %f102; ld.global.nc.f32 %f3, [%rd21]; fma.rn.f32 %f104, %f3, %f3, %f103; setp.eq.f32 %p6, %f104, 0f00000000; @%p6 bra BB0_59; and.b16 %rs2, %rs13, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r50, %r8, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r122, %r51, %r36; bra.uni BB0_5; BB0_4: mov.u32 %r52, 0; max.s32 %r122, %r8, %r52; BB0_5: add.s32 %r12, %r122, %r5; setp.lt.s32 %p9, %r8, 0; mov.f32 %f225, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f226, %f225; mov.f32 %f227, %f225; @%p10 bra BB0_7; mul.wide.s32 %rd23, %r12, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f225, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f226, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f227, [%rd26]; BB0_7: mul.f32 %f108, %f226, %f226; fma.rn.f32 %f109, %f225, %f225, %f108; fma.rn.f32 %f10, %f227, %f227, %f109; setp.eq.f32 %p11, %f10, 0f00000000; mov.u16 %rs33, %rs1; @%p11 bra BB0_9; cvt.s64.s32 %rd27, %r12; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs33, [%rd28]; BB0_9: setp.gt.u16 %p12, %rs33, %rs1; cvt.u32.u16 %r53, %rs33; and.b32 
%r54, %r53, 255; selp.b32 %r55, %r7, %r54, %p12; selp.b32 %r56, %r54, %r7, %p12; add.s32 %r57, %r56, 1; mul.lo.s32 %r58, %r57, %r56; shr.u32 %r59, %r58, 1; add.s32 %r60, %r59, %r55; mul.wide.s32 %rd29, %r60, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f12, [%rd31]; setp.ne.s16 %p13, %rs14, 0; mov.f32 %f237, 0f00000000; and.pred %p15, %p11, %p13; mov.f32 %f238, %f237; mov.f32 %f239, %f237; @%p15 bra BB0_13; setp.neu.f32 %p16, %f10, 0f00000000; @%p16 bra BB0_12; mul.f32 %f113, %f12, 0f3F000000; div.rn.f32 %f114, %f113, %f11; mul.f32 %f115, %f114, %f99; fma.rn.f32 %f225, %f3, %f115, %f1; mul.f32 %f116, %f1, %f115; sub.f32 %f227, %f3, %f116; mov.f32 %f226, %f2; BB0_12: mul.f32 %f117, %f99, %f99; add.f32 %f118, %f11, %f11; div.rn.f32 %f119, %f118, %f117; sub.f32 %f120, %f225, %f1; sub.f32 %f121, %f226, %f2; sub.f32 %f122, %f227, %f3; fma.rn.f32 %f123, %f120, %f119, 0f00000000; fma.rn.f32 %f238, %f121, %f119, 0f00000000; fma.rn.f32 %f124, %f119, %f122, 0f00000000; div.rn.f32 %f125, %f12, %f99; mul.f32 %f126, %f227, %f125; sub.f32 %f237, %f123, %f126; fma.rn.f32 %f239, %f225, %f125, %f124; BB0_13: add.s32 %r13, %r1, 1; @%p7 bra BB0_15; rem.s32 %r61, %r13, %r36; add.s32 %r62, %r61, %r36; rem.s32 %r123, %r62, %r36; bra.uni BB0_16; BB0_15: add.s32 %r63, %r36, -1; min.s32 %r123, %r13, %r63; BB0_16: add.s32 %r17, %r123, %r5; setp.ge.s32 %p18, %r13, %r36; mov.f32 %f231, 0f00000000; and.pred %p20, %p18, %p7; mov.f32 %f232, %f231; mov.f32 %f233, %f231; @%p20 bra BB0_18; mul.wide.s32 %rd32, %r17, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f231, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f232, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f233, [%rd35]; BB0_18: mul.f32 %f130, %f232, %f232; fma.rn.f32 %f131, %f231, %f231, %f130; fma.rn.f32 %f30, %f233, %f233, %f131; setp.eq.f32 %p21, %f30, 0f00000000; mov.u16 %rs34, %rs1; @%p21 bra BB0_20; cvt.s64.s32 %rd36, %r17; add.s64 %rd37, %rd3, %rd36; 
ld.global.nc.u8 %rs34, [%rd37]; BB0_20: setp.gt.u16 %p22, %rs34, %rs1; cvt.u32.u16 %r64, %rs34; and.b32 %r65, %r64, 255; selp.b32 %r66, %r7, %r65, %p22; selp.b32 %r67, %r65, %r7, %p22; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; add.s32 %r71, %r70, %r66; mul.wide.s32 %rd38, %r71, 4; add.s64 %rd39, %rd2, %rd38; ld.global.nc.f32 %f31, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f32, [%rd40]; and.pred %p25, %p21, %p13; @%p25 bra BB0_24; setp.neu.f32 %p26, %f30, 0f00000000; @%p26 bra BB0_23; mul.f32 %f132, %f32, 0f3F000000; div.rn.f32 %f133, %f132, %f31; mul.f32 %f134, %f133, %f99; mul.f32 %f135, %f3, %f134; sub.f32 %f231, %f1, %f135; fma.rn.f32 %f233, %f1, %f134, %f3; mov.f32 %f232, %f2; BB0_23: mul.f32 %f136, %f99, %f99; add.f32 %f137, %f31, %f31; div.rn.f32 %f138, %f137, %f136; sub.f32 %f139, %f231, %f1; sub.f32 %f140, %f232, %f2; sub.f32 %f141, %f233, %f3; fma.rn.f32 %f142, %f139, %f138, %f237; fma.rn.f32 %f238, %f140, %f138, %f238; fma.rn.f32 %f143, %f138, %f141, %f239; div.rn.f32 %f144, %f32, %f99; fma.rn.f32 %f237, %f233, %f144, %f142; mul.f32 %f145, %f231, %f144; sub.f32 %f239, %f143, %f145; BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p27, %rs7, 0; add.s32 %r18, %r2, -1; @%p27 bra BB0_26; rem.s32 %r72, %r18, %r37; add.s32 %r73, %r72, %r37; rem.s32 %r124, %r73, %r37; bra.uni BB0_27; BB0_26: mov.u32 %r74, 0; max.s32 %r124, %r18, %r74; BB0_27: add.s32 %r75, %r124, %r4; mad.lo.s32 %r22, %r75, %r36, %r1; setp.lt.s32 %p29, %r18, 0; mov.f32 %f240, 0f00000000; and.pred %p30, %p29, %p27; mov.f32 %f241, %f240; mov.f32 %f242, %f240; @%p30 bra BB0_29; mul.wide.s32 %rd41, %r22, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f240, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f241, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f242, [%rd44]; BB0_29: mul.f32 %f149, %f241, %f241; fma.rn.f32 %f150, %f240, %f240, %f149; fma.rn.f32 %f50, %f242, %f242, %f150; setp.eq.f32 %p31, %f50, 0f00000000; mov.u16 %rs35, %rs1; 
@%p31 bra BB0_31; cvt.s64.s32 %rd45, %r22; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs35, [%rd46]; BB0_31: setp.gt.u16 %p32, %rs35, %rs1; cvt.u32.u16 %r76, %rs35; and.b32 %r77, %r76, 255; selp.b32 %r78, %r7, %r77, %p32; selp.b32 %r79, %r77, %r7, %p32; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd47, %r83, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f51, [%rd48]; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f52, [%rd49]; and.pred %p35, %p31, %p13; @%p35 bra BB0_35; setp.neu.f32 %p36, %f50, 0f00000000; @%p36 bra BB0_34; mul.f32 %f151, %f52, 0f3F000000; div.rn.f32 %f152, %f151, %f51; mul.f32 %f153, %f152, %f100; fma.rn.f32 %f241, %f3, %f153, %f2; mul.f32 %f154, %f2, %f153; sub.f32 %f242, %f3, %f154; mov.f32 %f240, %f1; BB0_34: mul.f32 %f155, %f100, %f100; add.f32 %f156, %f51, %f51; div.rn.f32 %f157, %f156, %f155; sub.f32 %f158, %f240, %f1; sub.f32 %f159, %f241, %f2; sub.f32 %f160, %f242, %f3; fma.rn.f32 %f237, %f158, %f157, %f237; fma.rn.f32 %f161, %f159, %f157, %f238; fma.rn.f32 %f162, %f157, %f160, %f239; div.rn.f32 %f163, %f52, %f100; mul.f32 %f164, %f242, %f163; sub.f32 %f238, %f161, %f164; fma.rn.f32 %f239, %f241, %f163, %f162; BB0_35: add.s32 %r23, %r2, 1; @%p27 bra BB0_37; rem.s32 %r84, %r23, %r37; add.s32 %r85, %r84, %r37; rem.s32 %r125, %r85, %r37; bra.uni BB0_38; BB0_37: add.s32 %r86, %r37, -1; min.s32 %r125, %r23, %r86; BB0_38: add.s32 %r87, %r125, %r4; mad.lo.s32 %r27, %r87, %r36, %r1; setp.ge.s32 %p38, %r23, %r37; mov.f32 %f249, 0f00000000; and.pred %p40, %p38, %p27; mov.f32 %f250, %f249; mov.f32 %f251, %f249; @%p40 bra BB0_40; mul.wide.s32 %rd50, %r27, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f249, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f250, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f251, [%rd53]; BB0_40: mul.f32 %f168, %f250, %f250; fma.rn.f32 %f169, %f249, %f249, %f168; fma.rn.f32 %f70, %f251, %f251, %f169; setp.eq.f32 %p41, %f70, 
0f00000000; mov.u16 %rs36, %rs1; @%p41 bra BB0_42; cvt.s64.s32 %rd54, %r27; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs36, [%rd55]; BB0_42: setp.gt.u16 %p42, %rs36, %rs1; cvt.u32.u16 %r88, %rs36; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p42; selp.b32 %r91, %r89, %r7, %p42; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd56, %r95, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f71, [%rd57]; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f72, [%rd58]; and.pred %p45, %p41, %p13; @%p45 bra BB0_46; setp.neu.f32 %p46, %f70, 0f00000000; @%p46 bra BB0_45; mul.f32 %f170, %f72, 0f3F000000; div.rn.f32 %f171, %f170, %f71; mul.f32 %f172, %f171, %f100; mul.f32 %f173, %f3, %f172; sub.f32 %f250, %f2, %f173; fma.rn.f32 %f251, %f2, %f172, %f3; mov.f32 %f249, %f1; BB0_45: mul.f32 %f174, %f100, %f100; add.f32 %f175, %f71, %f71; div.rn.f32 %f176, %f175, %f174; sub.f32 %f177, %f249, %f1; sub.f32 %f178, %f250, %f2; sub.f32 %f179, %f251, %f3; fma.rn.f32 %f237, %f177, %f176, %f237; fma.rn.f32 %f180, %f178, %f176, %f238; fma.rn.f32 %f181, %f176, %f179, %f239; div.rn.f32 %f182, %f72, %f100; fma.rn.f32 %f238, %f251, %f182, %f180; mul.f32 %f183, %f250, %f182; sub.f32 %f239, %f181, %f183; BB0_46: setp.eq.s32 %p47, %r38, 1; @%p47 bra BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 %p48, %rs12, 0; add.s32 %r28, %r3, -1; @%p48 bra BB0_49; rem.s32 %r96, %r28, %r38; add.s32 %r97, %r96, %r38; rem.s32 %r126, %r97, %r38; bra.uni BB0_50; BB0_49: mov.u32 %r98, 0; max.s32 %r126, %r28, %r98; BB0_50: mad.lo.s32 %r99, %r126, %r37, %r2; mad.lo.s32 %r100, %r99, %r36, %r1; cvt.s64.s32 %rd59, %r100; mul.wide.s32 %rd60, %r100, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, %rd4, %rd60; ld.global.nc.f32 %f184, [%rd61]; ld.global.nc.f32 %f185, [%rd62]; mul.f32 %f186, %f185, %f185; fma.rn.f32 %f187, %f184, %f184, %f186; ld.global.nc.f32 %f188, [%rd63]; fma.rn.f32 %f189, %f188, %f188, %f187; setp.eq.f32 %p49, 
%f189, 0f00000000; selp.f32 %f190, %f1, %f184, %p49; selp.f32 %f191, %f2, %f185, %p49; selp.f32 %f192, %f3, %f188, %p49; add.s64 %rd64, %rd3, %rd59; ld.global.nc.u8 %rs26, [%rd64]; setp.gt.u16 %p50, %rs26, %rs1; cvt.u32.u16 %r101, %rs26; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p50; selp.b32 %r104, %r102, %r7, %p50; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd65, %r108, 4; add.s64 %rd66, %rd2, %rd65; ld.global.nc.f32 %f193, [%rd66]; add.f32 %f194, %f193, %f193; mul.f32 %f84, %f101, %f101; div.rn.f32 %f195, %f194, %f84; sub.f32 %f196, %f190, %f1; sub.f32 %f197, %f191, %f2; sub.f32 %f198, %f192, %f3; fma.rn.f32 %f85, %f195, %f196, %f237; fma.rn.f32 %f86, %f195, %f197, %f238; fma.rn.f32 %f87, %f195, %f198, %f239; add.s32 %r32, %r3, 1; @%p48 bra BB0_52; rem.s32 %r109, %r32, %r38; add.s32 %r110, %r109, %r38; rem.s32 %r127, %r110, %r38; bra.uni BB0_53; BB0_52: add.s32 %r111, %r38, -1; min.s32 %r127, %r32, %r111; BB0_53: mad.lo.s32 %r112, %r127, %r37, %r2; mad.lo.s32 %r113, %r112, %r36, %r1; cvt.s64.s32 %rd67, %r113; mul.wide.s32 %rd68, %r113, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f199, [%rd69]; ld.global.nc.f32 %f200, [%rd70]; mul.f32 %f201, %f200, %f200; fma.rn.f32 %f202, %f199, %f199, %f201; ld.global.nc.f32 %f203, [%rd71]; fma.rn.f32 %f204, %f203, %f203, %f202; setp.eq.f32 %p52, %f204, 0f00000000; selp.f32 %f205, %f3, %f203, %p52; selp.f32 %f206, %f2, %f200, %p52; selp.f32 %f207, %f1, %f199, %p52; add.s64 %rd72, %rd3, %rd67; ld.global.nc.u8 %rs30, [%rd72]; setp.gt.u16 %p53, %rs30, %rs1; cvt.u32.u16 %r114, %rs30; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p53; selp.b32 %r117, %r115, %r7, %p53; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd73, %r121, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f208, [%rd74]; 
add.f32 %f209, %f208, %f208; div.rn.f32 %f210, %f209, %f84; sub.f32 %f211, %f207, %f1; sub.f32 %f212, %f206, %f2; sub.f32 %f213, %f205, %f3; fma.rn.f32 %f237, %f210, %f211, %f85; fma.rn.f32 %f238, %f210, %f212, %f86; fma.rn.f32 %f239, %f210, %f213, %f87; BB0_54: setp.eq.s64 %p54, %rd10, 0; @%p54 bra BB0_56; cvta.to.global.u64 %rd75, %rd10; add.s64 %rd77, %rd75, %rd17; ld.global.nc.f32 %f214, [%rd77]; mul.f32 %f261, %f214, %f261; BB0_56: setp.eq.f32 %p55, %f261, 0f00000000; mov.f32 %f262, 0f00000000; @%p55 bra BB0_58; rcp.rn.f32 %f262, %f261; BB0_58: cvta.to.global.u64 %rd78, %rd9; cvta.to.global.u64 %rd79, %rd8; cvta.to.global.u64 %rd80, %rd7; add.s64 %rd82, %rd80, %rd17; ld.global.f32 %f216, [%rd82]; fma.rn.f32 %f217, %f237, %f262, %f216; st.global.f32 [%rd82], %f217; add.s64 %rd83, %rd79, %rd17; ld.global.f32 %f218, [%rd83]; fma.rn.f32 %f219, %f238, %f262, %f218; st.global.f32 [%rd83], %f219; add.s64 %rd84, %rd78, %rd17; ld.global.f32 %f220, [%rd84]; fma.rn.f32 %f221, %f239, %f262, %f220; st.global.f32 [%rd84], %f221; BB0_59: ret; } ` adddmi_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl adddmi .visible .entry adddmi( .param .u64 adddmi_param_0, .param .u64 adddmi_param_1, .param .u64 adddmi_param_2, .param .u64 adddmi_param_3, .param .u64 adddmi_param_4, .param .u64 adddmi_param_5, .param .u64 adddmi_param_6, .param .f32 adddmi_param_7, .param .u64 adddmi_param_8, .param .u64 adddmi_param_9, .param .u64 adddmi_param_10, .param .f32 adddmi_param_11, .param .f32 adddmi_param_12, .param .f32 adddmi_param_13, .param .u32 adddmi_param_14, .param .u32 adddmi_param_15, .param .u32 adddmi_param_16, .param .u8 adddmi_param_17, .param .u8 adddmi_param_18 ) { .reg .pred %p<56>; .reg .b16 %rs<37>; .reg .f32 %f<263>; .reg .b32 %r<128>; .reg .b64 %rd<85>; ld.param.u64 %rd7, [adddmi_param_0]; ld.param.u64 %rd8, [adddmi_param_1]; ld.param.u64 %rd9, [adddmi_param_2]; ld.param.u64 %rd11, [adddmi_param_3]; ld.param.u64 %rd12, [adddmi_param_4]; ld.param.u64 %rd13, 
[adddmi_param_5]; ld.param.u64 %rd10, [adddmi_param_6]; ld.param.f32 %f261, [adddmi_param_7]; ld.param.u64 %rd14, [adddmi_param_8]; ld.param.u64 %rd15, [adddmi_param_9]; ld.param.u64 %rd16, [adddmi_param_10]; ld.param.f32 %f99, [adddmi_param_11]; ld.param.f32 %f100, [adddmi_param_12]; ld.param.f32 %f101, [adddmi_param_13]; ld.param.u32 %r36, [adddmi_param_14]; ld.param.u32 %r37, [adddmi_param_15]; ld.param.u32 %r38, [adddmi_param_16]; ld.param.u8 %rs14, [adddmi_param_18]; ld.param.u8 %rs13, [adddmi_param_17]; cvta.to.global.u64 %rd1, %rd15; cvta.to.global.u64 %rd2, %rd14; cvta.to.global.u64 %rd3, %rd16; cvta.to.global.u64 %rd4, %rd13; cvta.to.global.u64 %rd5, %rd12; cvta.to.global.u64 %rd6, %rd11; mov.u32 %r39, %ntid.x; mov.u32 %r40, %ctaid.x; mov.u32 %r41, %tid.x; mad.lo.s32 %r1, %r39, %r40, %r41; mov.u32 %r42, %ntid.y; mov.u32 %r43, %ctaid.y; mov.u32 %r44, %tid.y; mad.lo.s32 %r2, %r42, %r43, %r44; mov.u32 %r45, %ntid.z; mov.u32 %r46, %ctaid.z; mov.u32 %r47, %tid.z; mad.lo.s32 %r3, %r45, %r46, %r47; setp.ge.s32 %p1, %r2, %r37; setp.ge.s32 %p2, %r1, %r36; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r38; or.pred %p5, %p3, %p4; @%p5 bra BB0_59; mul.lo.s32 %r4, %r3, %r37; add.s32 %r48, %r4, %r2; mul.lo.s32 %r5, %r48, %r36; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd17, %r6, 4; add.s64 %rd18, %rd6, %rd17; cvt.s64.s32 %rd19, %r6; add.s64 %rd20, %rd5, %rd17; add.s64 %rd21, %rd4, %rd17; add.s64 %rd22, %rd3, %rd19; ld.global.nc.u8 %rs1, [%rd22]; cvt.u32.u16 %r49, %rs1; and.b32 %r7, %r49, 255; ld.global.nc.f32 %f1, [%rd18]; ld.global.nc.f32 %f2, [%rd20]; mul.f32 %f102, %f2, %f2; fma.rn.f32 %f103, %f1, %f1, %f102; ld.global.nc.f32 %f3, [%rd21]; fma.rn.f32 %f104, %f3, %f3, %f103; setp.eq.f32 %p6, %f104, 0f00000000; @%p6 bra BB0_59; and.b16 %rs2, %rs13, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r50, %r8, %r36; add.s32 %r51, %r50, %r36; rem.s32 %r122, %r51, %r36; bra.uni BB0_5; BB0_4: mov.u32 %r52, 0; max.s32 %r122, %r8, %r52; BB0_5: add.s32 
%r12, %r122, %r5; setp.lt.s32 %p9, %r8, 0; mov.f32 %f225, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f226, %f225; mov.f32 %f227, %f225; @%p10 bra BB0_7; mul.wide.s32 %rd23, %r12, 4; add.s64 %rd24, %rd6, %rd23; ld.global.nc.f32 %f225, [%rd24]; add.s64 %rd25, %rd5, %rd23; ld.global.nc.f32 %f226, [%rd25]; add.s64 %rd26, %rd4, %rd23; ld.global.nc.f32 %f227, [%rd26]; BB0_7: mul.f32 %f108, %f226, %f226; fma.rn.f32 %f109, %f225, %f225, %f108; fma.rn.f32 %f10, %f227, %f227, %f109; setp.eq.f32 %p11, %f10, 0f00000000; mov.u16 %rs33, %rs1; @%p11 bra BB0_9; cvt.s64.s32 %rd27, %r12; add.s64 %rd28, %rd3, %rd27; ld.global.nc.u8 %rs33, [%rd28]; BB0_9: setp.gt.u16 %p12, %rs33, %rs1; cvt.u32.u16 %r53, %rs33; and.b32 %r54, %r53, 255; selp.b32 %r55, %r7, %r54, %p12; selp.b32 %r56, %r54, %r7, %p12; add.s32 %r57, %r56, 1; mul.lo.s32 %r58, %r57, %r56; shr.u32 %r59, %r58, 1; add.s32 %r60, %r59, %r55; mul.wide.s32 %rd29, %r60, 4; add.s64 %rd30, %rd2, %rd29; ld.global.nc.f32 %f11, [%rd30]; add.s64 %rd31, %rd1, %rd29; ld.global.nc.f32 %f12, [%rd31]; setp.ne.s16 %p13, %rs14, 0; mov.f32 %f237, 0f00000000; and.pred %p15, %p11, %p13; mov.f32 %f238, %f237; mov.f32 %f239, %f237; @%p15 bra BB0_13; setp.neu.f32 %p16, %f10, 0f00000000; @%p16 bra BB0_12; mul.f32 %f113, %f12, 0f3F000000; div.rn.f32 %f114, %f113, %f11; mul.f32 %f115, %f114, %f99; fma.rn.f32 %f225, %f3, %f115, %f1; mul.f32 %f116, %f1, %f115; sub.f32 %f227, %f3, %f116; mov.f32 %f226, %f2; BB0_12: mul.f32 %f117, %f99, %f99; add.f32 %f118, %f11, %f11; div.rn.f32 %f119, %f118, %f117; sub.f32 %f120, %f225, %f1; sub.f32 %f121, %f226, %f2; sub.f32 %f122, %f227, %f3; fma.rn.f32 %f123, %f120, %f119, 0f00000000; fma.rn.f32 %f238, %f121, %f119, 0f00000000; fma.rn.f32 %f124, %f119, %f122, 0f00000000; div.rn.f32 %f125, %f12, %f99; mul.f32 %f126, %f227, %f125; sub.f32 %f237, %f123, %f126; fma.rn.f32 %f239, %f225, %f125, %f124; BB0_13: add.s32 %r13, %r1, 1; @%p7 bra BB0_15; rem.s32 %r61, %r13, %r36; add.s32 %r62, %r61, %r36; rem.s32 %r123, %r62, 
%r36; bra.uni BB0_16; BB0_15: add.s32 %r63, %r36, -1; min.s32 %r123, %r13, %r63; BB0_16: add.s32 %r17, %r123, %r5; setp.ge.s32 %p18, %r13, %r36; mov.f32 %f231, 0f00000000; and.pred %p20, %p18, %p7; mov.f32 %f232, %f231; mov.f32 %f233, %f231; @%p20 bra BB0_18; mul.wide.s32 %rd32, %r17, 4; add.s64 %rd33, %rd6, %rd32; ld.global.nc.f32 %f231, [%rd33]; add.s64 %rd34, %rd5, %rd32; ld.global.nc.f32 %f232, [%rd34]; add.s64 %rd35, %rd4, %rd32; ld.global.nc.f32 %f233, [%rd35]; BB0_18: mul.f32 %f130, %f232, %f232; fma.rn.f32 %f131, %f231, %f231, %f130; fma.rn.f32 %f30, %f233, %f233, %f131; setp.eq.f32 %p21, %f30, 0f00000000; mov.u16 %rs34, %rs1; @%p21 bra BB0_20; cvt.s64.s32 %rd36, %r17; add.s64 %rd37, %rd3, %rd36; ld.global.nc.u8 %rs34, [%rd37]; BB0_20: setp.gt.u16 %p22, %rs34, %rs1; cvt.u32.u16 %r64, %rs34; and.b32 %r65, %r64, 255; selp.b32 %r66, %r7, %r65, %p22; selp.b32 %r67, %r65, %r7, %p22; add.s32 %r68, %r67, 1; mul.lo.s32 %r69, %r68, %r67; shr.u32 %r70, %r69, 1; add.s32 %r71, %r70, %r66; mul.wide.s32 %rd38, %r71, 4; add.s64 %rd39, %rd2, %rd38; ld.global.nc.f32 %f31, [%rd39]; add.s64 %rd40, %rd1, %rd38; ld.global.nc.f32 %f32, [%rd40]; and.pred %p25, %p21, %p13; @%p25 bra BB0_24; setp.neu.f32 %p26, %f30, 0f00000000; @%p26 bra BB0_23; mul.f32 %f132, %f32, 0f3F000000; div.rn.f32 %f133, %f132, %f31; mul.f32 %f134, %f133, %f99; mul.f32 %f135, %f3, %f134; sub.f32 %f231, %f1, %f135; fma.rn.f32 %f233, %f1, %f134, %f3; mov.f32 %f232, %f2; BB0_23: mul.f32 %f136, %f99, %f99; add.f32 %f137, %f31, %f31; div.rn.f32 %f138, %f137, %f136; sub.f32 %f139, %f231, %f1; sub.f32 %f140, %f232, %f2; sub.f32 %f141, %f233, %f3; fma.rn.f32 %f142, %f139, %f138, %f237; fma.rn.f32 %f238, %f140, %f138, %f238; fma.rn.f32 %f143, %f138, %f141, %f239; div.rn.f32 %f144, %f32, %f99; fma.rn.f32 %f237, %f233, %f144, %f142; mul.f32 %f145, %f231, %f144; sub.f32 %f239, %f143, %f145; BB0_24: and.b16 %rs7, %rs13, 2; setp.eq.s16 %p27, %rs7, 0; add.s32 %r18, %r2, -1; @%p27 bra BB0_26; rem.s32 %r72, %r18, %r37; 
add.s32 %r73, %r72, %r37; rem.s32 %r124, %r73, %r37; bra.uni BB0_27; BB0_26: mov.u32 %r74, 0; max.s32 %r124, %r18, %r74; BB0_27: add.s32 %r75, %r124, %r4; mad.lo.s32 %r22, %r75, %r36, %r1; setp.lt.s32 %p29, %r18, 0; mov.f32 %f240, 0f00000000; and.pred %p30, %p29, %p27; mov.f32 %f241, %f240; mov.f32 %f242, %f240; @%p30 bra BB0_29; mul.wide.s32 %rd41, %r22, 4; add.s64 %rd42, %rd6, %rd41; ld.global.nc.f32 %f240, [%rd42]; add.s64 %rd43, %rd5, %rd41; ld.global.nc.f32 %f241, [%rd43]; add.s64 %rd44, %rd4, %rd41; ld.global.nc.f32 %f242, [%rd44]; BB0_29: mul.f32 %f149, %f241, %f241; fma.rn.f32 %f150, %f240, %f240, %f149; fma.rn.f32 %f50, %f242, %f242, %f150; setp.eq.f32 %p31, %f50, 0f00000000; mov.u16 %rs35, %rs1; @%p31 bra BB0_31; cvt.s64.s32 %rd45, %r22; add.s64 %rd46, %rd3, %rd45; ld.global.nc.u8 %rs35, [%rd46]; BB0_31: setp.gt.u16 %p32, %rs35, %rs1; cvt.u32.u16 %r76, %rs35; and.b32 %r77, %r76, 255; selp.b32 %r78, %r7, %r77, %p32; selp.b32 %r79, %r77, %r7, %p32; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd47, %r83, 4; add.s64 %rd48, %rd2, %rd47; ld.global.nc.f32 %f51, [%rd48]; add.s64 %rd49, %rd1, %rd47; ld.global.nc.f32 %f52, [%rd49]; and.pred %p35, %p31, %p13; @%p35 bra BB0_35; setp.neu.f32 %p36, %f50, 0f00000000; @%p36 bra BB0_34; mul.f32 %f151, %f52, 0f3F000000; div.rn.f32 %f152, %f151, %f51; mul.f32 %f153, %f152, %f100; fma.rn.f32 %f241, %f3, %f153, %f2; mul.f32 %f154, %f2, %f153; sub.f32 %f242, %f3, %f154; mov.f32 %f240, %f1; BB0_34: mul.f32 %f155, %f100, %f100; add.f32 %f156, %f51, %f51; div.rn.f32 %f157, %f156, %f155; sub.f32 %f158, %f240, %f1; sub.f32 %f159, %f241, %f2; sub.f32 %f160, %f242, %f3; fma.rn.f32 %f237, %f158, %f157, %f237; fma.rn.f32 %f161, %f159, %f157, %f238; fma.rn.f32 %f162, %f157, %f160, %f239; div.rn.f32 %f163, %f52, %f100; mul.f32 %f164, %f242, %f163; sub.f32 %f238, %f161, %f164; fma.rn.f32 %f239, %f241, %f163, %f162; BB0_35: add.s32 %r23, %r2, 1; @%p27 bra BB0_37; 
rem.s32 %r84, %r23, %r37; add.s32 %r85, %r84, %r37; rem.s32 %r125, %r85, %r37; bra.uni BB0_38; BB0_37: add.s32 %r86, %r37, -1; min.s32 %r125, %r23, %r86; BB0_38: add.s32 %r87, %r125, %r4; mad.lo.s32 %r27, %r87, %r36, %r1; setp.ge.s32 %p38, %r23, %r37; mov.f32 %f249, 0f00000000; and.pred %p40, %p38, %p27; mov.f32 %f250, %f249; mov.f32 %f251, %f249; @%p40 bra BB0_40; mul.wide.s32 %rd50, %r27, 4; add.s64 %rd51, %rd6, %rd50; ld.global.nc.f32 %f249, [%rd51]; add.s64 %rd52, %rd5, %rd50; ld.global.nc.f32 %f250, [%rd52]; add.s64 %rd53, %rd4, %rd50; ld.global.nc.f32 %f251, [%rd53]; BB0_40: mul.f32 %f168, %f250, %f250; fma.rn.f32 %f169, %f249, %f249, %f168; fma.rn.f32 %f70, %f251, %f251, %f169; setp.eq.f32 %p41, %f70, 0f00000000; mov.u16 %rs36, %rs1; @%p41 bra BB0_42; cvt.s64.s32 %rd54, %r27; add.s64 %rd55, %rd3, %rd54; ld.global.nc.u8 %rs36, [%rd55]; BB0_42: setp.gt.u16 %p42, %rs36, %rs1; cvt.u32.u16 %r88, %rs36; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p42; selp.b32 %r91, %r89, %r7, %p42; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd56, %r95, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f71, [%rd57]; add.s64 %rd58, %rd1, %rd56; ld.global.nc.f32 %f72, [%rd58]; and.pred %p45, %p41, %p13; @%p45 bra BB0_46; setp.neu.f32 %p46, %f70, 0f00000000; @%p46 bra BB0_45; mul.f32 %f170, %f72, 0f3F000000; div.rn.f32 %f171, %f170, %f71; mul.f32 %f172, %f171, %f100; mul.f32 %f173, %f3, %f172; sub.f32 %f250, %f2, %f173; fma.rn.f32 %f251, %f2, %f172, %f3; mov.f32 %f249, %f1; BB0_45: mul.f32 %f174, %f100, %f100; add.f32 %f175, %f71, %f71; div.rn.f32 %f176, %f175, %f174; sub.f32 %f177, %f249, %f1; sub.f32 %f178, %f250, %f2; sub.f32 %f179, %f251, %f3; fma.rn.f32 %f237, %f177, %f176, %f237; fma.rn.f32 %f180, %f178, %f176, %f238; fma.rn.f32 %f181, %f176, %f179, %f239; div.rn.f32 %f182, %f72, %f100; fma.rn.f32 %f238, %f251, %f182, %f180; mul.f32 %f183, %f250, %f182; sub.f32 %f239, %f181, %f183; BB0_46: 
setp.eq.s32 %p47, %r38, 1; @%p47 bra BB0_54; and.b16 %rs12, %rs13, 4; setp.eq.s16 %p48, %rs12, 0; add.s32 %r28, %r3, -1; @%p48 bra BB0_49; rem.s32 %r96, %r28, %r38; add.s32 %r97, %r96, %r38; rem.s32 %r126, %r97, %r38; bra.uni BB0_50; BB0_49: mov.u32 %r98, 0; max.s32 %r126, %r28, %r98; BB0_50: mad.lo.s32 %r99, %r126, %r37, %r2; mad.lo.s32 %r100, %r99, %r36, %r1; cvt.s64.s32 %rd59, %r100; mul.wide.s32 %rd60, %r100, 4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd60; add.s64 %rd63, %rd4, %rd60; ld.global.nc.f32 %f184, [%rd61]; ld.global.nc.f32 %f185, [%rd62]; mul.f32 %f186, %f185, %f185; fma.rn.f32 %f187, %f184, %f184, %f186; ld.global.nc.f32 %f188, [%rd63]; fma.rn.f32 %f189, %f188, %f188, %f187; setp.eq.f32 %p49, %f189, 0f00000000; selp.f32 %f190, %f1, %f184, %p49; selp.f32 %f191, %f2, %f185, %p49; selp.f32 %f192, %f3, %f188, %p49; add.s64 %rd64, %rd3, %rd59; ld.global.nc.u8 %rs26, [%rd64]; setp.gt.u16 %p50, %rs26, %rs1; cvt.u32.u16 %r101, %rs26; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p50; selp.b32 %r104, %r102, %r7, %p50; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd65, %r108, 4; add.s64 %rd66, %rd2, %rd65; ld.global.nc.f32 %f193, [%rd66]; add.f32 %f194, %f193, %f193; mul.f32 %f84, %f101, %f101; div.rn.f32 %f195, %f194, %f84; sub.f32 %f196, %f190, %f1; sub.f32 %f197, %f191, %f2; sub.f32 %f198, %f192, %f3; fma.rn.f32 %f85, %f195, %f196, %f237; fma.rn.f32 %f86, %f195, %f197, %f238; fma.rn.f32 %f87, %f195, %f198, %f239; add.s32 %r32, %r3, 1; @%p48 bra BB0_52; rem.s32 %r109, %r32, %r38; add.s32 %r110, %r109, %r38; rem.s32 %r127, %r110, %r38; bra.uni BB0_53; BB0_52: add.s32 %r111, %r38, -1; min.s32 %r127, %r32, %r111; BB0_53: mad.lo.s32 %r112, %r127, %r37, %r2; mad.lo.s32 %r113, %r112, %r36, %r1; cvt.s64.s32 %rd67, %r113; mul.wide.s32 %rd68, %r113, 4; add.s64 %rd69, %rd6, %rd68; add.s64 %rd70, %rd5, %rd68; add.s64 %rd71, %rd4, %rd68; ld.global.nc.f32 %f199, [%rd69]; 
ld.global.nc.f32 %f200, [%rd70]; mul.f32 %f201, %f200, %f200; fma.rn.f32 %f202, %f199, %f199, %f201; ld.global.nc.f32 %f203, [%rd71]; fma.rn.f32 %f204, %f203, %f203, %f202; setp.eq.f32 %p52, %f204, 0f00000000; selp.f32 %f205, %f3, %f203, %p52; selp.f32 %f206, %f2, %f200, %p52; selp.f32 %f207, %f1, %f199, %p52; add.s64 %rd72, %rd3, %rd67; ld.global.nc.u8 %rs30, [%rd72]; setp.gt.u16 %p53, %rs30, %rs1; cvt.u32.u16 %r114, %rs30; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p53; selp.b32 %r117, %r115, %r7, %p53; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd73, %r121, 4; add.s64 %rd74, %rd2, %rd73; ld.global.nc.f32 %f208, [%rd74]; add.f32 %f209, %f208, %f208; div.rn.f32 %f210, %f209, %f84; sub.f32 %f211, %f207, %f1; sub.f32 %f212, %f206, %f2; sub.f32 %f213, %f205, %f3; fma.rn.f32 %f237, %f210, %f211, %f85; fma.rn.f32 %f238, %f210, %f212, %f86; fma.rn.f32 %f239, %f210, %f213, %f87; BB0_54: setp.eq.s64 %p54, %rd10, 0; @%p54 bra BB0_56; cvta.to.global.u64 %rd75, %rd10; add.s64 %rd77, %rd75, %rd17; ld.global.nc.f32 %f214, [%rd77]; mul.f32 %f261, %f214, %f261; BB0_56: setp.eq.f32 %p55, %f261, 0f00000000; mov.f32 %f262, 0f00000000; @%p55 bra BB0_58; rcp.rn.f32 %f262, %f261; BB0_58: cvta.to.global.u64 %rd78, %rd9; cvta.to.global.u64 %rd79, %rd8; cvta.to.global.u64 %rd80, %rd7; add.s64 %rd82, %rd80, %rd17; ld.global.f32 %f216, [%rd82]; fma.rn.f32 %f217, %f237, %f262, %f216; st.global.f32 [%rd82], %f217; add.s64 %rd83, %rd79, %rd17; ld.global.f32 %f218, [%rd83]; fma.rn.f32 %f219, %f238, %f262, %f218; st.global.f32 [%rd83], %f219; add.s64 %rd84, %rd78, %rd17; ld.global.f32 %f220, [%rd84]; fma.rn.f32 %f221, %f239, %f262, %f220; st.global.f32 [%rd84], %f221; BB0_59: ret; } ` ) mumax3-3.10/cuda/dmibulk.cu000066400000000000000000000211551371432437400155570ustar00rootroot00000000000000#include #include "exchange.h" #include "float3.h" #include "stencil.h" #include "amul.h" // Exchange + 
// Dzyaloshinskii-Moriya interaction for bulk material.
// The kernel below adds the exchange field and the bulk-DMI field in one pass.
// Energy:
//
// 	E  = D M . rot(M)
//
// Effective field:
//
// 	Hx = 2A/Bs nabla²Mx + 2D/Bs dzMy - 2D/Bs dyMz
// 	Hy = 2A/Bs nabla²My + 2D/Bs dxMz - 2D/Bs dzMx
// 	Hz = 2A/Bs nabla²Mz + 2D/Bs dyMx - 2D/Bs dxMy
//
// Boundary conditions:
//
// 	        2A dxMx = 0
// 	 D Mz + 2A dxMy = 0
// 	-D My + 2A dxMz = 0
//
// 	-D Mz + 2A dyMx = 0
// 	        2A dyMy = 0
// 	 D Mx + 2A dyMz = 0
//
// 	 D My + 2A dzMx = 0
// 	-D Mx + 2A dzMy = 0
// 	        2A dzMz = 0
//
// Kernel arguments:
//
// 	Hx, Hy, Hz      field components; the result is accumulated into them (+=)
// 	mx, my, mz      magnetization components (read only here)
// 	Ms_, Ms_mul     forwarded to inv_Msat() to obtain the 1/Msat factor for the cell
// 	aLUT2d, DLUT2d  look-up tables for A and D, indexed with symidx(r0, r1)
// 	regions         per-cell region number (uint8)
// 	cx, cy, cz      cell size along x, y, z (the Go caller passes mesh.CellSize())
// 	Nx, Ny, Nz      grid size; threads outside the grid return immediately
// 	PBC             periodic-boundary code (the Go caller passes mesh.PBC_code());
// 	                consumed through the PBCx/PBCy/PBCz and l/hclamp helpers, which
// 	                are defined outside this file (presumably in stencil.h -- confirm)
// 	OpenBC          when non-zero, a missing neighbor contributes nothing instead of
// 	                being extrapolated from the boundary conditions
//
// idx(), is0(), symidx(), lclamp*/hclamp* and inv_Msat() are not defined in this file;
// their exact semantics should be checked in the included headers.
extern "C" __global__ void
adddmibulk(float* __restrict__ Hx, float* __restrict__ Hy, float* __restrict__ Hz,
           float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz,
           float* __restrict__ Ms_, float Ms_mul,
           float* __restrict__ aLUT2d, float* __restrict__ DLUT2d,
           uint8_t* __restrict__ regions,
           float cx, float cy, float cz, int Nx, int Ny, int Nz, uint8_t PBC, uint8_t OpenBC) {

    // One thread per cell: global thread coordinates are the cell coordinates.
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    int iz = blockIdx.z * blockDim.z + threadIdx.z;

    // Threads that fall outside the grid have nothing to do.
    if (ix >= Nx || iy >= Ny || iz >= Nz) {
        return;
    }

    int I = idx(ix, iy, iz);                      // central cell index
    float3 h = make_float3(0.0,0.0,0.0);          // add to H
    float3 m0 = make_float3(mx[I], my[I], mz[I]); // central m
    uint8_t r0 = regions[I];
    int i_;                                       // neighbor index

    // Cells for which is0(m0) holds (presumably m == 0, i.e. no material -- confirm)
    // get no field contribution at all.
    if(is0(m0)) {
        return;
    }

    // Each of the six neighbor blocks below follows the same recipe:
    //   1. load the neighbor's m if it lies inside the grid (or the axis is periodic),
    //   2. look up A and D for the (r0, r1) region pair,
    //   3. if the neighbor is missing and the boundary is not open, extrapolate it
    //      from m0 using the boundary conditions listed in the header,
    //   4. add the exchange term and this neighbor's half of the central difference
    //      that makes up the DMI term.

    // x derivatives (along length)
    {
        float3 m1 = make_float3(0.0f, 0.0f, 0.0f);     // left neighbor
        i_ = idx(lclampx(ix-1), iy, iz);
        // load neighbor m if inside grid, keep 0 otherwise
        if (ix-1 >= 0 || PBCx) {
            m1 = make_float3(mx[i_], my[i_], mz[i_]);
        }
        // A missing neighbor is treated as belonging to the central cell's region.
        int r1 = is0(m1)? r0 : regions[i_];
        float A = aLUT2d[symidx(r0, r1)];
        float D = DLUT2d[symidx(r0, r1)];
        // NOTE(review): D_2A is non-finite when A == 0; confirm callers never
        // supply a zero exchange constant for a region pair that reaches this point.
        float D_2A = D/(2.0f*A);
        if (!is0(m1) || !OpenBC){                      // do nothing at an open boundary
            if (is0(m1)) {                             // neighbor missing
                // extrapolate over a distance -cx using dxMy = -(D/2A) Mz, dxMz = (D/2A) My
                m1.x = m0.x;
                m1.y = m0.y - (-cx * D_2A * m0.z);
                m1.z = m0.z + (-cx * D_2A * m0.y);
            }
            h   += (2.0f*A/(cx*cx)) * (m1 - m0);       // exchange
            // left-hand half of the central differences dxMz and dxMy (hence the minus sign)
            h.y += (D/cx)*(-m1.z);
            h.z -= (D/cx)*(-m1.y);
        }
    }

    {
        float3 m2 = make_float3(0.0f, 0.0f, 0.0f);     // right neighbor
        i_ = idx(hclampx(ix+1), iy, iz);
        if (ix+1 < Nx || PBCx) {
            m2 = make_float3(mx[i_], my[i_], mz[i_]);
        }
        int r1 = is0(m2)? r0 : regions[i_];
        float A = aLUT2d[symidx(r0, r1)];
        float D = DLUT2d[symidx(r0, r1)];
        float D_2A = D/(2.0f*A);
        if (!is0(m2) || !OpenBC){
            if (is0(m2)) {
                // same extrapolation as for the left neighbor, over a distance +cx
                m2.x = m0.x;
                m2.y = m0.y - (+cx * D_2A * m0.z);
                m2.z = m0.z + (+cx * D_2A * m0.y);
            }
            h   += (2.0f*A/(cx*cx)) * (m2 - m0);
            // right-hand half of the central differences dxMz and dxMy
            h.y += (D/cx)*(m2.z);
            h.z -= (D/cx)*(m2.y);
        }
    }

    // y derivatives (along height)
    {
        float3 m1 = make_float3(0.0f, 0.0f, 0.0f);     // neighbor at iy-1
        i_ = idx(ix, lclampy(iy-1), iz);
        if (iy-1 >= 0 || PBCy) {
            m1 = make_float3(mx[i_], my[i_], mz[i_]);
        }
        int r1 = is0(m1)? r0 : regions[i_];
        float A = aLUT2d[symidx(r0, r1)];
        float D = DLUT2d[symidx(r0, r1)];
        float D_2A = D/(2.0f*A);
        if (!is0(m1) || !OpenBC){
            if (is0(m1)) {
                // extrapolate over a distance -cy using dyMx = (D/2A) Mz, dyMz = -(D/2A) Mx
                m1.x = m0.x + (-cy * D_2A * m0.z);
                m1.y = m0.y;
                m1.z = m0.z - (-cy * D_2A * m0.x);
            }
            h   += (2.0f*A/(cy*cy)) * (m1 - m0);
            h.x -= (D/cy)*(-m1.z);
            h.z += (D/cy)*(-m1.x);
        }
    }

    {
        float3 m2 = make_float3(0.0f, 0.0f, 0.0f);     // neighbor at iy+1
        i_ = idx(ix, hclampy(iy+1), iz);
        if (iy+1 < Ny || PBCy) {
            m2 = make_float3(mx[i_], my[i_], mz[i_]);
        }
        int r1 = is0(m2)? r0 : regions[i_];
        float A = aLUT2d[symidx(r0, r1)];
        float D = DLUT2d[symidx(r0, r1)];
        float D_2A = D/(2.0f*A);
        if (!is0(m2) || !OpenBC){
            if (is0(m2)) {
                m2.x = m0.x + (+cy * D_2A * m0.z);
                m2.y = m0.y;
                m2.z = m0.z - (+cy * D_2A * m0.x);
            }
            h   += (2.0f*A/(cy*cy)) * (m2 - m0);
            h.x -= (D/cy)*(m2.z);
            h.z += (D/cy)*(m2.x);
        }
    }

    // only take vertical derivative for 3D sim
    if (Nz != 1) {
        // bottom neighbor
        {
            float3 m1 = make_float3(0.0f, 0.0f, 0.0f);
            i_ = idx(ix, iy, lclampz(iz-1));
            if (iz-1 >= 0 || PBCz) {
                m1 = make_float3(mx[i_], my[i_], mz[i_]);
            }
            int r1 = is0(m1)? r0 : regions[i_];
            float A = aLUT2d[symidx(r0, r1)];
            float D = DLUT2d[symidx(r0, r1)];
            float D_2A = D/(2.0f*A);
            if (!is0(m1) || !OpenBC){
                if (is0(m1)) {
                    // extrapolate over a distance -cz using dzMx = -(D/2A) My, dzMy = (D/2A) Mx
                    m1.x = m0.x - (-cz * D_2A * m0.y);
                    m1.y = m0.y + (-cz * D_2A * m0.x);
                    m1.z = m0.z;
                }
                h   += (2.0f*A/(cz*cz)) * (m1 - m0);
                h.x += (D/cz)*(- m1.y);
                h.y -= (D/cz)*(- m1.x);
            }
        }

        // top neighbor
        {
            float3 m2 = make_float3(0.0f, 0.0f, 0.0f);
            i_ = idx(ix, iy, hclampz(iz+1));
            if (iz+1 < Nz || PBCz) {
                m2 = make_float3(mx[i_], my[i_], mz[i_]);
            }
            int r1 = is0(m2)? r0 : regions[i_];
            float A = aLUT2d[symidx(r0, r1)];
            float D = DLUT2d[symidx(r0, r1)];
            float D_2A = D/(2.0f*A);
            if (!is0(m2) || !OpenBC){
                if (is0(m2)) {
                    m2.x = m0.x - (+cz * D_2A * m0.y);
                    m2.y = m0.y + (+cz * D_2A * m0.x);
                    m2.z = m0.z;
                }
                h   += (2.0f*A/(cz*cz)) * (m2 - m0);
                h.x += (D/cz)*(m2.y );
                h.y -= (D/cz)*(m2.x );
            }
        }
    }

    // write back, result is H + Hdmi + Hex
    // (h is scaled by the per-cell factor returned by inv_Msat before accumulation)
    float invMs = inv_Msat(Ms_, Ms_mul, I);
    Hx[I] += h.x*invMs;
    Hy[I] += h.y*invMs;
    Hz[I] += h.z*invMs;
}

// Note on boundary conditions.
//
// We need the derivative and laplacian of m in point A, but e.g. C lies out of the boundaries.
// We use the boundary condition in B (derivative of the magnetization) to extrapolate m to point C:
// 	m_C = m_A + (dm/dx)|_B * cellsize
//
// When point C is inside the boundary, we just use its actual value.
// // Then we can take the central derivative in A: // (dm/dx)|_A = (m_C - m_D) / (2*cellsize) // And the laplacian: // lapl(m)|_A = (m_C + m_D - 2*m_A) / (cellsize^2) // // All these operations should be second order as they involve only central derivatives. // // ------------------------------------------------------------------ * // | | C | // | | ** | // | | *** | // | | *** | // | | *** | // | | *** | // | B | // | *** | | // | *** | | // | **** | | // | **** | | // | **** | | // | ** A | | // | ***** | | // | ****** | | // | ********* | | // |D ******** | | // | | | // +----------------+----------------+-----------------+---------------+ // -1 -0.5 0 0.5 1 // x mumax3-3.10/cuda/dmibulk.go000066400000000000000000000014241371432437400155520ustar00rootroot00000000000000package cuda import ( "unsafe" "github.com/mumax/3/data" "github.com/mumax/3/util" ) // Add effective field due to bulk Dzyaloshinskii-Moriya interaction to Beff. // See dmibulk.cu func AddDMIBulk(Beff *data.Slice, m *data.Slice, Aex_red, D_red SymmLUT, Msat MSlice, regions *Bytes, mesh *data.Mesh, OpenBC bool) { cellsize := mesh.CellSize() N := Beff.Size() util.Argument(m.Size() == N) cfg := make3DConf(N) var openBC byte if OpenBC { openBC = 1 } k_adddmibulk_async(Beff.DevPtr(X), Beff.DevPtr(Y), Beff.DevPtr(Z), m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z), Msat.DevPtr(0), Msat.Mul(0), unsafe.Pointer(Aex_red), unsafe.Pointer(D_red), regions.Ptr, float32(cellsize[X]), float32(cellsize[Y]), float32(cellsize[Z]), N[X], N[Y], N[Z], mesh.PBC_code(), openBC, cfg) } mumax3-3.10/cuda/dmibulk_wrapper.go000066400000000000000000006732421371432437400173270ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
// CUDA handle for adddmibulk kernel.
// It stays at its zero value until the first call of k_adddmibulk_async,
// which fills it in through fatbinLoad.
var adddmibulk_code cu.Function

// Stores the arguments for adddmibulk kernel invocation.
// One arg_* field exists per kernel parameter, in the kernel's parameter order.
// argptr holds a pointer to each of those fields (wired up once in init) and is
// what gets handed to cu.LaunchKernel. The embedded mutex serializes use of this
// single shared argument buffer.
type adddmibulk_args_t struct {
	arg_Hx      unsafe.Pointer
	arg_Hy      unsafe.Pointer
	arg_Hz      unsafe.Pointer
	arg_mx      unsafe.Pointer
	arg_my      unsafe.Pointer
	arg_mz      unsafe.Pointer
	arg_Ms_     unsafe.Pointer
	arg_Ms_mul  float32
	arg_aLUT2d  unsafe.Pointer
	arg_DLUT2d  unsafe.Pointer
	arg_regions unsafe.Pointer
	arg_cx      float32
	arg_cy      float32
	arg_cz      float32
	arg_Nx      int
	arg_Ny      int
	arg_Nz      int
	arg_PBC     byte
	arg_OpenBC  byte
	argptr      [19]unsafe.Pointer // argptr[i] points at the i-th arg_* field above
	sync.Mutex
}

// Stores the arguments for adddmibulk kernel invocation.
// This is the single package-level instance used by every launch.
var adddmibulk_args adddmibulk_args_t

// init points every argptr slot at its matching arg_* field, so that a launch
// only has to overwrite the field values.
func init() {
	// CUDA driver kernel call wants pointers to arguments, set them up once.
	adddmibulk_args.argptr[0] = unsafe.Pointer(&adddmibulk_args.arg_Hx)
	adddmibulk_args.argptr[1] = unsafe.Pointer(&adddmibulk_args.arg_Hy)
	adddmibulk_args.argptr[2] = unsafe.Pointer(&adddmibulk_args.arg_Hz)
	adddmibulk_args.argptr[3] = unsafe.Pointer(&adddmibulk_args.arg_mx)
	adddmibulk_args.argptr[4] = unsafe.Pointer(&adddmibulk_args.arg_my)
	adddmibulk_args.argptr[5] = unsafe.Pointer(&adddmibulk_args.arg_mz)
	adddmibulk_args.argptr[6] = unsafe.Pointer(&adddmibulk_args.arg_Ms_)
	adddmibulk_args.argptr[7] = unsafe.Pointer(&adddmibulk_args.arg_Ms_mul)
	adddmibulk_args.argptr[8] = unsafe.Pointer(&adddmibulk_args.arg_aLUT2d)
	adddmibulk_args.argptr[9] = unsafe.Pointer(&adddmibulk_args.arg_DLUT2d)
	adddmibulk_args.argptr[10] = unsafe.Pointer(&adddmibulk_args.arg_regions)
	adddmibulk_args.argptr[11] = unsafe.Pointer(&adddmibulk_args.arg_cx)
	adddmibulk_args.argptr[12] = unsafe.Pointer(&adddmibulk_args.arg_cy)
	adddmibulk_args.argptr[13] = unsafe.Pointer(&adddmibulk_args.arg_cz)
	adddmibulk_args.argptr[14] = unsafe.Pointer(&adddmibulk_args.arg_Nx)
	adddmibulk_args.argptr[15] = unsafe.Pointer(&adddmibulk_args.arg_Ny)
	adddmibulk_args.argptr[16] = unsafe.Pointer(&adddmibulk_args.arg_Nz)
	adddmibulk_args.argptr[17] = unsafe.Pointer(&adddmibulk_args.arg_PBC)
	adddmibulk_args.argptr[18] = unsafe.Pointer(&adddmibulk_args.arg_OpenBC)
}

// Wrapper for adddmibulk CUDA kernel, asynchronous.
// It copies the given values into the shared argument struct and launches the
// kernel on stream0 with the grid and block dimensions taken from cfg.
// When the package-level Synchronous flag is set, Sync() is called before and
// after the launch and the call is timed under the name "adddmibulk".
func k_adddmibulk_async(Hx unsafe.Pointer, Hy unsafe.Pointer, Hz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, aLUT2d unsafe.Pointer, DLUT2d unsafe.Pointer, regions unsafe.Pointer, cx float32, cy float32, cz float32, Nx int, Ny int, Nz int, PBC byte, OpenBC byte, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("adddmibulk")
	}

	// The argument struct is shared by all callers; the lock is held until this
	// function returns (deferred unlock), i.e. across the launch below.
	adddmibulk_args.Lock()
	defer adddmibulk_args.Unlock()

	// Load the kernel on first use only.
	if adddmibulk_code == 0 {
		adddmibulk_code = fatbinLoad(adddmibulk_map, "adddmibulk")
	}

	adddmibulk_args.arg_Hx = Hx
	adddmibulk_args.arg_Hy = Hy
	adddmibulk_args.arg_Hz = Hz
	adddmibulk_args.arg_mx = mx
	adddmibulk_args.arg_my = my
	adddmibulk_args.arg_mz = mz
	adddmibulk_args.arg_Ms_ = Ms_
	adddmibulk_args.arg_Ms_mul = Ms_mul
	adddmibulk_args.arg_aLUT2d = aLUT2d
	adddmibulk_args.arg_DLUT2d = DLUT2d
	adddmibulk_args.arg_regions = regions
	adddmibulk_args.arg_cx = cx
	adddmibulk_args.arg_cy = cy
	adddmibulk_args.arg_cz = cz
	adddmibulk_args.arg_Nx = Nx
	adddmibulk_args.arg_Ny = Ny
	adddmibulk_args.arg_Nz = Nz
	adddmibulk_args.arg_PBC = PBC
	adddmibulk_args.arg_OpenBC = OpenBC

	// NOTE(review): the literal 0 is presumably the dynamic shared-memory size
	// expected by cu.LaunchKernel -- confirm against the cu package.
	args := adddmibulk_args.argptr[:]
	cu.LaunchKernel(adddmibulk_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("adddmibulk")
	}
}
var adddmibulk_map = map[int]string{0: "", 30: adddmibulk_ptx_30, 32: adddmibulk_ptx_32, 35: adddmibulk_ptx_35, 37: adddmibulk_ptx_37, 50: adddmibulk_ptx_50, 52: adddmibulk_ptx_52, 53: adddmibulk_ptx_53, 60: adddmibulk_ptx_60, 61: adddmibulk_ptx_61, 62: adddmibulk_ptx_62, 70: adddmibulk_ptx_70, 72: adddmibulk_ptx_72, 75: adddmibulk_ptx_75} // adddmibulk PTX code for various compute capabilities. const ( adddmibulk_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<73>; .reg .b16 %rs<47>; .reg .f32 %f<292>; .reg .b32 %r<238>; .reg .b64 %rd<121>; ld.param.u64 %rd13, [adddmibulk_param_0]; ld.param.u64 %rd14, [adddmibulk_param_1]; ld.param.u64 %rd15, [adddmibulk_param_2]; ld.param.u64 %rd16, [adddmibulk_param_3]; ld.param.u64 %rd17, [adddmibulk_param_4]; ld.param.u64 %rd18, [adddmibulk_param_5]; ld.param.u64 %rd19, [adddmibulk_param_6]; ld.param.f32 %f290, [adddmibulk_param_7]; ld.param.u64 %rd20, [adddmibulk_param_8]; ld.param.u64 %rd21, [adddmibulk_param_9]; ld.param.u64 %rd22, [adddmibulk_param_10]; ld.param.f32 %f87, [adddmibulk_param_11]; ld.param.f32 %f88, [adddmibulk_param_12]; ld.param.f32 %f89, [adddmibulk_param_13]; ld.param.u32 %r34, [adddmibulk_param_14]; ld.param.u32 %r35, [adddmibulk_param_15]; ld.param.u32 %r36, [adddmibulk_param_16]; ld.param.u8 %rs17, [adddmibulk_param_18]; 
ld.param.u8 %rs16, [adddmibulk_param_17]; mov.u32 %r37, %ntid.x; mov.u32 %r38, %ctaid.x; mov.u32 %r39, %tid.x; mad.lo.s32 %r1, %r37, %r38, %r39; mov.u32 %r40, %ntid.y; mov.u32 %r41, %ctaid.y; mov.u32 %r42, %tid.y; mad.lo.s32 %r2, %r40, %r41, %r42; mov.u32 %r43, %ntid.z; mov.u32 %r44, %ctaid.z; mov.u32 %r45, %tid.z; mad.lo.s32 %r3, %r43, %r44, %r45; setp.ge.s32 %p1, %r2, %r35; setp.ge.s32 %p2, %r1, %r34; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r36; or.pred %p5, %p3, %p4; @%p5 bra BB0_62; cvta.to.global.u64 %rd23, %rd22; cvta.to.global.u64 %rd24, %rd18; cvta.to.global.u64 %rd25, %rd17; cvta.to.global.u64 %rd26, %rd16; mad.lo.s32 %r46, %r3, %r35, %r2; mad.lo.s32 %r47, %r46, %r34, %r1; mul.wide.s32 %rd27, %r47, 4; add.s64 %rd28, %rd26, %rd27; cvt.s64.s32 %rd29, %r47; add.s64 %rd30, %rd25, %rd27; add.s64 %rd31, %rd24, %rd27; add.s64 %rd32, %rd23, %rd29; ld.global.u8 %rs1, [%rd32]; ld.global.f32 %f1, [%rd28]; ld.global.f32 %f2, [%rd30]; mul.f32 %f90, %f2, %f2; fma.rn.f32 %f91, %f1, %f1, %f90; ld.global.f32 %f3, [%rd31]; fma.rn.f32 %f92, %f3, %f3, %f91; setp.eq.f32 %p6, %f92, 0f00000000; @%p6 bra BB0_62; and.b16 %rs18, %rs16, 1; setp.eq.b16 %p7, %rs18, 1; add.s32 %r4, %r1, -1; @!%p7 bra BB0_4; bra.uni BB0_3; BB0_3: rem.s32 %r52, %r4, %r34; add.s32 %r53, %r52, %r34; rem.s32 %r232, %r53, %r34; bra.uni BB0_5; BB0_4: mov.u32 %r54, 0; max.s32 %r232, %r4, %r54; BB0_5: mad.lo.s32 %r8, %r46, %r34, %r232; setp.eq.b16 %p8, %rs18, 1; not.pred %p9, %p8; setp.lt.s32 %p10, %r4, 0; mov.f32 %f254, 0f00000000; and.pred %p11, %p10, %p9; mov.f32 %f255, %f254; mov.f32 %f256, %f254; @%p11 bra BB0_7; mul.wide.s32 %rd34, %r8, 4; add.s64 %rd35, %rd26, %rd34; ld.global.f32 %f254, [%rd35]; add.s64 %rd37, %rd25, %rd34; ld.global.f32 %f255, [%rd37]; add.s64 %rd39, %rd24, %rd34; ld.global.f32 %f256, [%rd39]; BB0_7: mul.f32 %f96, %f255, %f255; fma.rn.f32 %f97, %f254, %f254, %f96; fma.rn.f32 %f10, %f256, %f256, %f97; setp.eq.f32 %p12, %f10, 0f00000000; mov.u16 %rs41, %rs1; @%p12 bra BB0_9; 
cvt.s64.s32 %rd41, %r8; add.s64 %rd42, %rd23, %rd41; ld.global.u8 %rs41, [%rd42]; BB0_9: cvt.u32.u16 %r64, %rs1; and.b32 %r65, %r64, 255; setp.gt.u16 %p13, %rs41, %rs1; cvt.u32.u16 %r66, %rs41; and.b32 %r67, %r66, 255; selp.b32 %r68, %r65, %r67, %p13; selp.b32 %r69, %r67, %r65, %p13; add.s32 %r70, %r69, 1; mul.lo.s32 %r71, %r70, %r69; shr.u32 %r72, %r71, 1; add.s32 %r73, %r72, %r68; cvta.to.global.u64 %rd43, %rd20; mul.wide.s32 %rd44, %r73, 4; add.s64 %rd1, %rd43, %rd44; cvta.to.global.u64 %rd45, %rd21; add.s64 %rd2, %rd45, %rd44; setp.ne.s16 %p14, %rs17, 0; mov.f32 %f263, 0f00000000; and.pred %p16, %p12, %p14; mov.f32 %f264, %f263; mov.f32 %f265, %f263; @%p16 bra BB0_11; ld.global.f32 %f101, [%rd1]; add.f32 %f102, %f101, %f101; ld.global.f32 %f103, [%rd2]; div.rn.f32 %f104, %f103, %f102; mul.f32 %f105, %f104, %f87; fma.rn.f32 %f106, %f3, %f105, %f2; mul.f32 %f107, %f2, %f105; sub.f32 %f108, %f3, %f107; selp.f32 %f109, %f1, %f254, %p12; selp.f32 %f110, %f106, %f255, %p12; selp.f32 %f111, %f108, %f256, %p12; mul.f32 %f112, %f87, %f87; div.rn.f32 %f113, %f102, %f112; sub.f32 %f114, %f109, %f1; sub.f32 %f115, %f110, %f2; sub.f32 %f116, %f111, %f3; fma.rn.f32 %f263, %f114, %f113, 0f00000000; fma.rn.f32 %f117, %f115, %f113, 0f00000000; fma.rn.f32 %f118, %f116, %f113, 0f00000000; div.rn.f32 %f119, %f103, %f87; mul.f32 %f120, %f111, %f119; sub.f32 %f264, %f117, %f120; fma.rn.f32 %f265, %f110, %f119, %f118; BB0_11: setp.eq.b16 %p18, %rs18, 1; add.s32 %r9, %r1, 1; @!%p18 bra BB0_13; bra.uni BB0_12; BB0_12: rem.s32 %r78, %r9, %r34; add.s32 %r79, %r78, %r34; rem.s32 %r233, %r79, %r34; bra.uni BB0_14; BB0_13: add.s32 %r80, %r34, -1; min.s32 %r233, %r9, %r80; BB0_14: setp.eq.b16 %p19, %rs18, 1; not.pred %p20, %p19; mad.lo.s32 %r13, %r46, %r34, %r233; setp.ge.s32 %p21, %r9, %r34; mov.f32 %f260, 0f00000000; and.pred %p22, %p21, %p20; mov.f32 %f261, %f260; mov.f32 %f262, %f260; @%p22 bra BB0_16; mul.wide.s32 %rd47, %r13, 4; add.s64 %rd48, %rd26, %rd47; ld.global.f32 %f260, 
[%rd48]; add.s64 %rd50, %rd25, %rd47; ld.global.f32 %f261, [%rd50]; add.s64 %rd52, %rd24, %rd47; ld.global.f32 %f262, [%rd52]; BB0_16: mul.f32 %f124, %f261, %f261; fma.rn.f32 %f125, %f260, %f260, %f124; fma.rn.f32 %f23, %f262, %f262, %f125; setp.eq.f32 %p23, %f23, 0f00000000; mov.u16 %rs42, %rs1; @%p23 bra BB0_18; cvt.s64.s32 %rd54, %r13; add.s64 %rd55, %rd23, %rd54; ld.global.u8 %rs42, [%rd55]; BB0_18: setp.gt.u16 %p24, %rs42, %rs1; cvt.u32.u16 %r92, %rs42; and.b32 %r93, %r92, 255; selp.b32 %r94, %r65, %r93, %p24; selp.b32 %r95, %r93, %r65, %p24; add.s32 %r96, %r95, 1; mul.lo.s32 %r97, %r96, %r95; shr.u32 %r98, %r97, 1; add.s32 %r99, %r98, %r94; mul.wide.s32 %rd57, %r99, 4; add.s64 %rd3, %rd43, %rd57; add.s64 %rd4, %rd45, %rd57; and.pred %p27, %p23, %p14; @%p27 bra BB0_20; ld.global.f32 %f126, [%rd3]; add.f32 %f127, %f126, %f126; ld.global.f32 %f128, [%rd4]; div.rn.f32 %f129, %f128, %f127; mul.f32 %f130, %f129, %f87; mul.f32 %f131, %f3, %f130; sub.f32 %f132, %f2, %f131; fma.rn.f32 %f133, %f2, %f130, %f3; selp.f32 %f134, %f1, %f260, %p23; selp.f32 %f135, %f132, %f261, %p23; selp.f32 %f136, %f133, %f262, %p23; mul.f32 %f137, %f87, %f87; div.rn.f32 %f138, %f127, %f137; sub.f32 %f139, %f134, %f1; sub.f32 %f140, %f135, %f2; sub.f32 %f141, %f136, %f3; fma.rn.f32 %f263, %f139, %f138, %f263; fma.rn.f32 %f142, %f140, %f138, %f264; fma.rn.f32 %f143, %f141, %f138, %f265; div.rn.f32 %f144, %f128, %f87; fma.rn.f32 %f264, %f136, %f144, %f142; mul.f32 %f145, %f135, %f144; sub.f32 %f265, %f143, %f145; BB0_20: and.b16 %rs6, %rs16, 2; setp.eq.s16 %p29, %rs6, 0; add.s32 %r14, %r2, -1; @%p29 bra BB0_22; rem.s32 %r104, %r14, %r35; add.s32 %r105, %r104, %r35; rem.s32 %r234, %r105, %r35; bra.uni BB0_23; BB0_22: mov.u32 %r106, 0; max.s32 %r234, %r14, %r106; BB0_23: mad.lo.s32 %r111, %r3, %r35, %r234; mad.lo.s32 %r18, %r111, %r34, %r1; setp.lt.s32 %p31, %r14, 0; mov.f32 %f266, 0f00000000; and.pred %p32, %p31, %p29; mov.f32 %f267, %f266; mov.f32 %f268, %f266; @%p32 bra BB0_25; mul.wide.s32 
%rd60, %r18, 4; add.s64 %rd61, %rd26, %rd60; ld.global.f32 %f266, [%rd61]; add.s64 %rd63, %rd25, %rd60; ld.global.f32 %f267, [%rd63]; add.s64 %rd65, %rd24, %rd60; ld.global.f32 %f268, [%rd65]; BB0_25: mul.f32 %f149, %f267, %f267; fma.rn.f32 %f150, %f266, %f266, %f149; fma.rn.f32 %f36, %f268, %f268, %f150; setp.eq.f32 %p33, %f36, 0f00000000; mov.u16 %rs43, %rs1; @%p33 bra BB0_27; cvt.s64.s32 %rd67, %r18; add.s64 %rd68, %rd23, %rd67; ld.global.u8 %rs43, [%rd68]; BB0_27: setp.gt.u16 %p34, %rs43, %rs1; cvt.u32.u16 %r118, %rs43; and.b32 %r119, %r118, 255; selp.b32 %r120, %r65, %r119, %p34; selp.b32 %r121, %r119, %r65, %p34; add.s32 %r122, %r121, 1; mul.lo.s32 %r123, %r122, %r121; shr.u32 %r124, %r123, 1; add.s32 %r125, %r124, %r120; mul.wide.s32 %rd70, %r125, 4; add.s64 %rd5, %rd43, %rd70; add.s64 %rd6, %rd45, %rd70; and.pred %p37, %p33, %p14; @%p37 bra BB0_29; ld.global.f32 %f151, [%rd5]; add.f32 %f152, %f151, %f151; ld.global.f32 %f153, [%rd6]; div.rn.f32 %f154, %f153, %f152; mul.f32 %f155, %f154, %f88; mul.f32 %f156, %f3, %f155; sub.f32 %f157, %f1, %f156; fma.rn.f32 %f158, %f1, %f155, %f3; selp.f32 %f159, %f157, %f266, %p33; selp.f32 %f160, %f2, %f267, %p33; selp.f32 %f161, %f158, %f268, %p33; mul.f32 %f162, %f88, %f88; div.rn.f32 %f163, %f152, %f162; sub.f32 %f164, %f159, %f1; sub.f32 %f165, %f160, %f2; sub.f32 %f166, %f161, %f3; fma.rn.f32 %f167, %f164, %f163, %f263; fma.rn.f32 %f264, %f165, %f163, %f264; fma.rn.f32 %f168, %f166, %f163, %f265; div.rn.f32 %f169, %f153, %f88; fma.rn.f32 %f263, %f161, %f169, %f167; mul.f32 %f170, %f159, %f169; sub.f32 %f265, %f168, %f170; BB0_29: add.s32 %r19, %r2, 1; @%p29 bra BB0_31; rem.s32 %r130, %r19, %r35; add.s32 %r131, %r130, %r35; rem.s32 %r235, %r131, %r35; bra.uni BB0_32; BB0_31: add.s32 %r132, %r35, -1; min.s32 %r235, %r19, %r132; BB0_32: shr.u16 %rs30, %rs16, 1; and.b16 %rs31, %rs30, 1; setp.eq.b16 %p40, %rs31, 1; not.pred %p41, %p40; mad.lo.s32 %r137, %r3, %r35, %r235; mad.lo.s32 %r23, %r137, %r34, %r1; setp.ge.s32 %p42, 
%r19, %r35; mov.f32 %f272, 0f00000000; and.pred %p43, %p42, %p41; mov.f32 %f273, %f272; mov.f32 %f274, %f272; @%p43 bra BB0_34; mul.wide.s32 %rd73, %r23, 4; add.s64 %rd74, %rd26, %rd73; ld.global.f32 %f272, [%rd74]; add.s64 %rd76, %rd25, %rd73; ld.global.f32 %f273, [%rd76]; add.s64 %rd78, %rd24, %rd73; ld.global.f32 %f274, [%rd78]; BB0_34: mul.f32 %f174, %f273, %f273; fma.rn.f32 %f175, %f272, %f272, %f174; fma.rn.f32 %f49, %f274, %f274, %f175; setp.eq.f32 %p44, %f49, 0f00000000; mov.u16 %rs44, %rs1; @%p44 bra BB0_36; cvt.s64.s32 %rd80, %r23; add.s64 %rd81, %rd23, %rd80; ld.global.u8 %rs44, [%rd81]; BB0_36: setp.gt.u16 %p45, %rs44, %rs1; cvt.u32.u16 %r144, %rs44; and.b32 %r145, %r144, 255; selp.b32 %r146, %r65, %r145, %p45; selp.b32 %r147, %r145, %r65, %p45; add.s32 %r148, %r147, 1; mul.lo.s32 %r149, %r148, %r147; shr.u32 %r150, %r149, 1; add.s32 %r151, %r150, %r146; mul.wide.s32 %rd83, %r151, 4; add.s64 %rd7, %rd43, %rd83; add.s64 %rd8, %rd45, %rd83; and.pred %p48, %p44, %p14; @%p48 bra BB0_38; ld.global.f32 %f176, [%rd7]; add.f32 %f177, %f176, %f176; ld.global.f32 %f178, [%rd8]; div.rn.f32 %f179, %f178, %f177; mul.f32 %f180, %f179, %f88; fma.rn.f32 %f181, %f3, %f180, %f1; mul.f32 %f182, %f1, %f180; sub.f32 %f183, %f3, %f182; selp.f32 %f184, %f181, %f272, %p44; selp.f32 %f185, %f2, %f273, %p44; selp.f32 %f186, %f183, %f274, %p44; mul.f32 %f187, %f88, %f88; div.rn.f32 %f188, %f177, %f187; sub.f32 %f189, %f184, %f1; sub.f32 %f190, %f185, %f2; sub.f32 %f191, %f186, %f3; fma.rn.f32 %f192, %f189, %f188, %f263; fma.rn.f32 %f264, %f190, %f188, %f264; fma.rn.f32 %f193, %f191, %f188, %f265; div.rn.f32 %f194, %f178, %f88; mul.f32 %f195, %f186, %f194; sub.f32 %f263, %f192, %f195; fma.rn.f32 %f265, %f184, %f194, %f193; BB0_38: setp.eq.s32 %p50, %r36, 1; @%p50 bra BB0_57; and.b16 %rs11, %rs16, 4; setp.eq.s16 %p51, %rs11, 0; add.s32 %r24, %r3, -1; @%p51 bra BB0_41; rem.s32 %r156, %r24, %r36; add.s32 %r157, %r156, %r36; rem.s32 %r236, %r157, %r36; bra.uni BB0_42; BB0_41: mov.u32 
%r158, 0; max.s32 %r236, %r24, %r158; BB0_42: mad.lo.s32 %r163, %r236, %r35, %r2; mad.lo.s32 %r28, %r163, %r34, %r1; setp.lt.s32 %p53, %r24, 0; mov.f32 %f278, 0f00000000; and.pred %p54, %p53, %p51; mov.f32 %f279, %f278; mov.f32 %f280, %f278; @%p54 bra BB0_44; mul.wide.s32 %rd86, %r28, 4; add.s64 %rd87, %rd26, %rd86; ld.global.f32 %f278, [%rd87]; add.s64 %rd89, %rd25, %rd86; ld.global.f32 %f279, [%rd89]; add.s64 %rd91, %rd24, %rd86; ld.global.f32 %f280, [%rd91]; BB0_44: mul.f32 %f199, %f279, %f279; fma.rn.f32 %f200, %f278, %f278, %f199; fma.rn.f32 %f62, %f280, %f280, %f200; setp.eq.f32 %p55, %f62, 0f00000000; mov.u16 %rs45, %rs1; @%p55 bra BB0_46; cvt.s64.s32 %rd93, %r28; add.s64 %rd94, %rd23, %rd93; ld.global.u8 %rs45, [%rd94]; BB0_46: setp.gt.u16 %p56, %rs45, %rs1; cvt.u32.u16 %r170, %rs45; and.b32 %r171, %r170, 255; selp.b32 %r172, %r65, %r171, %p56; selp.b32 %r173, %r171, %r65, %p56; add.s32 %r174, %r173, 1; mul.lo.s32 %r175, %r174, %r173; shr.u32 %r176, %r175, 1; add.s32 %r177, %r176, %r172; mul.wide.s32 %rd96, %r177, 4; add.s64 %rd9, %rd43, %rd96; add.s64 %rd10, %rd45, %rd96; and.pred %p59, %p55, %p14; @%p59 bra BB0_48; ld.global.f32 %f201, [%rd9]; add.f32 %f202, %f201, %f201; ld.global.f32 %f203, [%rd10]; div.rn.f32 %f204, %f203, %f202; mul.f32 %f205, %f204, %f89; fma.rn.f32 %f206, %f2, %f205, %f1; mul.f32 %f207, %f1, %f205; sub.f32 %f208, %f2, %f207; selp.f32 %f209, %f206, %f278, %p55; selp.f32 %f210, %f208, %f279, %p55; selp.f32 %f211, %f3, %f280, %p55; mul.f32 %f212, %f89, %f89; div.rn.f32 %f213, %f202, %f212; sub.f32 %f214, %f209, %f1; sub.f32 %f215, %f210, %f2; sub.f32 %f216, %f211, %f3; fma.rn.f32 %f217, %f214, %f213, %f263; fma.rn.f32 %f218, %f215, %f213, %f264; fma.rn.f32 %f265, %f216, %f213, %f265; div.rn.f32 %f219, %f203, %f89; mul.f32 %f220, %f210, %f219; sub.f32 %f263, %f217, %f220; fma.rn.f32 %f264, %f209, %f219, %f218; BB0_48: add.s32 %r29, %r3, 1; @%p51 bra BB0_50; rem.s32 %r182, %r29, %r36; add.s32 %r183, %r182, %r36; rem.s32 %r237, %r183, 
%r36; bra.uni BB0_51; BB0_50: add.s32 %r184, %r36, -1; min.s32 %r237, %r29, %r184; BB0_51: mad.lo.s32 %r189, %r237, %r35, %r2; mad.lo.s32 %r33, %r189, %r34, %r1; setp.ge.s32 %p62, %r29, %r36; mov.f32 %f284, 0f00000000; and.pred %p64, %p62, %p51; mov.f32 %f285, %f284; mov.f32 %f286, %f284; @%p64 bra BB0_53; mul.wide.s32 %rd99, %r33, 4; add.s64 %rd100, %rd26, %rd99; ld.global.f32 %f286, [%rd100]; add.s64 %rd102, %rd25, %rd99; ld.global.f32 %f285, [%rd102]; add.s64 %rd104, %rd24, %rd99; ld.global.f32 %f284, [%rd104]; BB0_53: mul.f32 %f224, %f286, %f286; fma.rn.f32 %f225, %f285, %f285, %f224; fma.rn.f32 %f75, %f284, %f284, %f225; setp.eq.f32 %p65, %f75, 0f00000000; mov.u16 %rs46, %rs1; @%p65 bra BB0_55; cvt.s64.s32 %rd106, %r33; add.s64 %rd107, %rd23, %rd106; ld.global.u8 %rs46, [%rd107]; BB0_55: setp.gt.u16 %p66, %rs46, %rs1; cvt.u32.u16 %r196, %rs46; and.b32 %r197, %r196, 255; selp.b32 %r198, %r65, %r197, %p66; selp.b32 %r199, %r197, %r65, %p66; add.s32 %r200, %r199, 1; mul.lo.s32 %r201, %r200, %r199; shr.u32 %r202, %r201, 1; add.s32 %r203, %r202, %r198; mul.wide.s32 %rd109, %r203, 4; add.s64 %rd11, %rd43, %rd109; add.s64 %rd12, %rd45, %rd109; and.pred %p69, %p65, %p14; @%p69 bra BB0_57; ld.global.f32 %f226, [%rd11]; add.f32 %f227, %f226, %f226; ld.global.f32 %f228, [%rd12]; div.rn.f32 %f229, %f228, %f227; mul.f32 %f230, %f229, %f89; mul.f32 %f231, %f2, %f230; sub.f32 %f232, %f1, %f231; fma.rn.f32 %f233, %f1, %f230, %f2; selp.f32 %f234, %f3, %f284, %p65; selp.f32 %f235, %f233, %f285, %p65; selp.f32 %f236, %f232, %f286, %p65; mul.f32 %f237, %f89, %f89; div.rn.f32 %f238, %f227, %f237; sub.f32 %f239, %f236, %f1; sub.f32 %f240, %f235, %f2; sub.f32 %f241, %f234, %f3; fma.rn.f32 %f242, %f239, %f238, %f263; fma.rn.f32 %f243, %f240, %f238, %f264; fma.rn.f32 %f265, %f241, %f238, %f265; div.rn.f32 %f244, %f228, %f89; fma.rn.f32 %f263, %f235, %f244, %f242; mul.f32 %f245, %f236, %f244; sub.f32 %f264, %f243, %f245; BB0_57: setp.eq.s64 %p71, %rd19, 0; @%p71 bra BB0_59; 
cvta.to.global.u64 %rd111, %rd19; add.s64 %rd113, %rd111, %rd27; ld.global.f32 %f246, [%rd113]; mul.f32 %f290, %f246, %f290; BB0_59: setp.eq.f32 %p72, %f290, 0f00000000; mov.f32 %f291, 0f00000000; @%p72 bra BB0_61; rcp.rn.f32 %f291, %f290; BB0_61: cvta.to.global.u64 %rd114, %rd13; add.s64 %rd116, %rd114, %rd27; ld.global.f32 %f248, [%rd116]; fma.rn.f32 %f249, %f263, %f291, %f248; st.global.f32 [%rd116], %f249; cvta.to.global.u64 %rd117, %rd14; add.s64 %rd118, %rd117, %rd27; ld.global.f32 %f250, [%rd118]; fma.rn.f32 %f251, %f264, %f291, %f250; st.global.f32 [%rd118], %f251; cvta.to.global.u64 %rd119, %rd15; add.s64 %rd120, %rd119, %rd27; ld.global.f32 %f252, [%rd120]; fma.rn.f32 %f253, %f265, %f291, %f252; st.global.f32 [%rd120], %f253; BB0_62: ret; } ` adddmibulk_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<70>; .reg .b16 %rs<43>; .reg .f32 %f<292>; .reg .b32 %r<128>; .reg .b64 %rd<87>; ld.param.u64 %rd9, [adddmibulk_param_0]; ld.param.u64 %rd10, [adddmibulk_param_1]; ld.param.u64 %rd11, [adddmibulk_param_2]; ld.param.u64 %rd13, [adddmibulk_param_3]; ld.param.u64 %rd14, [adddmibulk_param_4]; ld.param.u64 %rd15, [adddmibulk_param_5]; ld.param.u64 %rd12, [adddmibulk_param_6]; ld.param.f32 %f290, [adddmibulk_param_7]; ld.param.u64 %rd16, [adddmibulk_param_8]; ld.param.u64 %rd17, 
[adddmibulk_param_9]; ld.param.u64 %rd18, [adddmibulk_param_10]; ld.param.f32 %f87, [adddmibulk_param_11]; ld.param.f32 %f88, [adddmibulk_param_12]; ld.param.f32 %f89, [adddmibulk_param_13]; ld.param.u32 %r43, [adddmibulk_param_14]; ld.param.u32 %r44, [adddmibulk_param_15]; ld.param.u32 %r45, [adddmibulk_param_16]; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; cvta.to.global.u64 %rd1, %rd17; cvta.to.global.u64 %rd2, %rd16; cvta.to.global.u64 %rd3, %rd18; cvta.to.global.u64 %rd4, %rd15; cvta.to.global.u64 %rd5, %rd14; cvta.to.global.u64 %rd6, %rd13; mov.u32 %r46, %ntid.x; mov.u32 %r47, %ctaid.x; mov.u32 %r48, %tid.x; mad.lo.s32 %r1, %r46, %r47, %r48; mov.u32 %r49, %ntid.y; mov.u32 %r50, %ctaid.y; mov.u32 %r51, %tid.y; mad.lo.s32 %r2, %r49, %r50, %r51; mov.u32 %r52, %ntid.z; mov.u32 %r53, %ctaid.z; mov.u32 %r54, %tid.z; mad.lo.s32 %r3, %r52, %r53, %r54; setp.ge.s32 %p1, %r2, %r44; setp.ge.s32 %p2, %r1, %r43; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r45; or.pred %p5, %p3, %p4; @%p5 bra BB0_62; mul.lo.s32 %r4, %r3, %r44; add.s32 %r55, %r4, %r2; mul.lo.s32 %r5, %r55, %r43; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd19, %r6, 4; add.s64 %rd20, %rd6, %rd19; cvt.s64.s32 %rd21, %r6; add.s64 %rd22, %rd5, %rd19; add.s64 %rd23, %rd4, %rd19; add.s64 %rd24, %rd3, %rd21; ld.global.nc.u8 %rs1, [%rd24]; cvt.u32.u16 %r56, %rs1; and.b32 %r7, %r56, 255; ld.global.nc.f32 %f1, [%rd20]; ld.global.nc.f32 %f2, [%rd22]; mul.f32 %f90, %f2, %f2; fma.rn.f32 %f91, %f1, %f1, %f90; ld.global.nc.f32 %f3, [%rd23]; fma.rn.f32 %f92, %f3, %f3, %f91; setp.eq.f32 %p6, %f92, 0f00000000; @%p6 bra BB0_62; and.b16 %rs2, %rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r57, %r8, %r43; add.s32 %r58, %r57, %r43; rem.s32 %r122, %r58, %r43; bra.uni BB0_5; BB0_4: mov.u32 %r59, 0; max.s32 %r122, %r8, %r59; BB0_5: add.s32 %r12, %r122, %r5; setp.lt.s32 %p9, %r8, 0; mov.f32 %f254, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f255, %f254; mov.f32 
%f256, %f254; @%p10 bra BB0_7; mul.wide.s32 %rd25, %r12, 4; add.s64 %rd26, %rd6, %rd25; ld.global.nc.f32 %f254, [%rd26]; add.s64 %rd27, %rd5, %rd25; ld.global.nc.f32 %f255, [%rd27]; add.s64 %rd28, %rd4, %rd25; ld.global.nc.f32 %f256, [%rd28]; BB0_7: mul.f32 %f96, %f255, %f255; fma.rn.f32 %f97, %f254, %f254, %f96; fma.rn.f32 %f10, %f256, %f256, %f97; setp.eq.f32 %p11, %f10, 0f00000000; mov.u16 %rs37, %rs1; @%p11 bra BB0_9; cvt.s64.s32 %rd29, %r12; add.s64 %rd30, %rd3, %rd29; ld.global.nc.u8 %rs37, [%rd30]; BB0_9: setp.gt.u16 %p12, %rs37, %rs1; cvt.u32.u16 %r60, %rs37; and.b32 %r61, %r60, 255; selp.b32 %r62, %r7, %r61, %p12; selp.b32 %r63, %r61, %r7, %p12; add.s32 %r64, %r63, 1; mul.lo.s32 %r65, %r64, %r63; shr.u32 %r66, %r65, 1; add.s32 %r13, %r66, %r62; setp.ne.s16 %p13, %rs18, 0; mov.f32 %f263, 0f00000000; and.pred %p15, %p11, %p13; mov.f32 %f264, %f263; mov.f32 %f265, %f263; @%p15 bra BB0_11; mul.wide.s32 %rd31, %r13, 4; add.s64 %rd32, %rd2, %rd31; ld.global.nc.f32 %f101, [%rd32]; add.f32 %f102, %f101, %f101; add.s64 %rd33, %rd1, %rd31; ld.global.nc.f32 %f103, [%rd33]; div.rn.f32 %f104, %f103, %f102; mul.f32 %f105, %f104, %f87; fma.rn.f32 %f106, %f3, %f105, %f2; mul.f32 %f107, %f2, %f105; sub.f32 %f108, %f3, %f107; selp.f32 %f109, %f1, %f254, %p11; selp.f32 %f110, %f106, %f255, %p11; selp.f32 %f111, %f108, %f256, %p11; mul.f32 %f112, %f87, %f87; div.rn.f32 %f113, %f102, %f112; sub.f32 %f114, %f109, %f1; sub.f32 %f115, %f110, %f2; sub.f32 %f116, %f111, %f3; fma.rn.f32 %f263, %f114, %f113, 0f00000000; fma.rn.f32 %f117, %f115, %f113, 0f00000000; fma.rn.f32 %f118, %f116, %f113, 0f00000000; div.rn.f32 %f119, %f103, %f87; mul.f32 %f120, %f111, %f119; sub.f32 %f264, %f117, %f120; fma.rn.f32 %f265, %f110, %f119, %f118; BB0_11: add.s32 %r14, %r1, 1; @%p7 bra BB0_13; rem.s32 %r67, %r14, %r43; add.s32 %r68, %r67, %r43; rem.s32 %r123, %r68, %r43; bra.uni BB0_14; BB0_13: add.s32 %r69, %r43, -1; min.s32 %r123, %r14, %r69; BB0_14: add.s32 %r18, %r123, %r5; setp.ge.s32 %p18, 
%r14, %r43; mov.f32 %f260, 0f00000000; and.pred %p20, %p18, %p7; mov.f32 %f261, %f260; mov.f32 %f262, %f260; @%p20 bra BB0_16; mul.wide.s32 %rd34, %r18, 4; add.s64 %rd35, %rd6, %rd34; ld.global.nc.f32 %f260, [%rd35]; add.s64 %rd36, %rd5, %rd34; ld.global.nc.f32 %f261, [%rd36]; add.s64 %rd37, %rd4, %rd34; ld.global.nc.f32 %f262, [%rd37]; BB0_16: mul.f32 %f124, %f261, %f261; fma.rn.f32 %f125, %f260, %f260, %f124; fma.rn.f32 %f23, %f262, %f262, %f125; setp.eq.f32 %p21, %f23, 0f00000000; mov.u16 %rs38, %rs1; @%p21 bra BB0_18; cvt.s64.s32 %rd38, %r18; add.s64 %rd39, %rd3, %rd38; ld.global.nc.u8 %rs38, [%rd39]; BB0_18: setp.gt.u16 %p22, %rs38, %rs1; cvt.u32.u16 %r70, %rs38; and.b32 %r71, %r70, 255; selp.b32 %r72, %r7, %r71, %p22; selp.b32 %r73, %r71, %r7, %p22; add.s32 %r74, %r73, 1; mul.lo.s32 %r75, %r74, %r73; shr.u32 %r76, %r75, 1; add.s32 %r19, %r76, %r72; and.pred %p25, %p21, %p13; @%p25 bra BB0_20; mul.wide.s32 %rd40, %r19, 4; add.s64 %rd41, %rd2, %rd40; ld.global.nc.f32 %f126, [%rd41]; add.f32 %f127, %f126, %f126; add.s64 %rd42, %rd1, %rd40; ld.global.nc.f32 %f128, [%rd42]; div.rn.f32 %f129, %f128, %f127; mul.f32 %f130, %f129, %f87; mul.f32 %f131, %f3, %f130; sub.f32 %f132, %f2, %f131; fma.rn.f32 %f133, %f2, %f130, %f3; selp.f32 %f134, %f1, %f260, %p21; selp.f32 %f135, %f132, %f261, %p21; selp.f32 %f136, %f133, %f262, %p21; mul.f32 %f137, %f87, %f87; div.rn.f32 %f138, %f127, %f137; sub.f32 %f139, %f134, %f1; sub.f32 %f140, %f135, %f2; sub.f32 %f141, %f136, %f3; fma.rn.f32 %f263, %f139, %f138, %f263; fma.rn.f32 %f142, %f140, %f138, %f264; fma.rn.f32 %f143, %f141, %f138, %f265; div.rn.f32 %f144, %f128, %f87; fma.rn.f32 %f264, %f136, %f144, %f142; mul.f32 %f145, %f135, %f144; sub.f32 %f265, %f143, %f145; BB0_20: and.b16 %rs7, %rs17, 2; setp.eq.s16 %p27, %rs7, 0; add.s32 %r20, %r2, -1; @%p27 bra BB0_22; rem.s32 %r77, %r20, %r44; add.s32 %r78, %r77, %r44; rem.s32 %r124, %r78, %r44; bra.uni BB0_23; BB0_22: mov.u32 %r79, 0; max.s32 %r124, %r20, %r79; BB0_23: add.s32 
%r80, %r124, %r4; mad.lo.s32 %r24, %r80, %r43, %r1; setp.lt.s32 %p29, %r20, 0; mov.f32 %f266, 0f00000000; and.pred %p30, %p29, %p27; mov.f32 %f267, %f266; mov.f32 %f268, %f266; @%p30 bra BB0_25; mul.wide.s32 %rd43, %r24, 4; add.s64 %rd44, %rd6, %rd43; ld.global.nc.f32 %f266, [%rd44]; add.s64 %rd45, %rd5, %rd43; ld.global.nc.f32 %f267, [%rd45]; add.s64 %rd46, %rd4, %rd43; ld.global.nc.f32 %f268, [%rd46]; BB0_25: mul.f32 %f149, %f267, %f267; fma.rn.f32 %f150, %f266, %f266, %f149; fma.rn.f32 %f36, %f268, %f268, %f150; setp.eq.f32 %p31, %f36, 0f00000000; mov.u16 %rs39, %rs1; @%p31 bra BB0_27; cvt.s64.s32 %rd47, %r24; add.s64 %rd48, %rd3, %rd47; ld.global.nc.u8 %rs39, [%rd48]; BB0_27: setp.gt.u16 %p32, %rs39, %rs1; cvt.u32.u16 %r81, %rs39; and.b32 %r82, %r81, 255; selp.b32 %r83, %r7, %r82, %p32; selp.b32 %r84, %r82, %r7, %p32; add.s32 %r85, %r84, 1; mul.lo.s32 %r86, %r85, %r84; shr.u32 %r87, %r86, 1; add.s32 %r25, %r87, %r83; and.pred %p35, %p31, %p13; @%p35 bra BB0_29; mul.wide.s32 %rd49, %r25, 4; add.s64 %rd50, %rd2, %rd49; ld.global.nc.f32 %f151, [%rd50]; add.f32 %f152, %f151, %f151; add.s64 %rd51, %rd1, %rd49; ld.global.nc.f32 %f153, [%rd51]; div.rn.f32 %f154, %f153, %f152; mul.f32 %f155, %f154, %f88; mul.f32 %f156, %f3, %f155; sub.f32 %f157, %f1, %f156; fma.rn.f32 %f158, %f1, %f155, %f3; selp.f32 %f159, %f157, %f266, %p31; selp.f32 %f160, %f2, %f267, %p31; selp.f32 %f161, %f158, %f268, %p31; mul.f32 %f162, %f88, %f88; div.rn.f32 %f163, %f152, %f162; sub.f32 %f164, %f159, %f1; sub.f32 %f165, %f160, %f2; sub.f32 %f166, %f161, %f3; fma.rn.f32 %f167, %f164, %f163, %f263; fma.rn.f32 %f264, %f165, %f163, %f264; fma.rn.f32 %f168, %f166, %f163, %f265; div.rn.f32 %f169, %f153, %f88; fma.rn.f32 %f263, %f161, %f169, %f167; mul.f32 %f170, %f159, %f169; sub.f32 %f265, %f168, %f170; BB0_29: add.s32 %r26, %r2, 1; @%p27 bra BB0_31; rem.s32 %r88, %r26, %r44; add.s32 %r89, %r88, %r44; rem.s32 %r125, %r89, %r44; bra.uni BB0_32; BB0_31: add.s32 %r90, %r44, -1; min.s32 %r125, %r26, 
%r90; BB0_32: add.s32 %r91, %r125, %r4; mad.lo.s32 %r30, %r91, %r43, %r1; setp.ge.s32 %p38, %r26, %r44; mov.f32 %f272, 0f00000000; and.pred %p40, %p38, %p27; mov.f32 %f273, %f272; mov.f32 %f274, %f272; @%p40 bra BB0_34; mul.wide.s32 %rd52, %r30, 4; add.s64 %rd53, %rd6, %rd52; ld.global.nc.f32 %f272, [%rd53]; add.s64 %rd54, %rd5, %rd52; ld.global.nc.f32 %f273, [%rd54]; add.s64 %rd55, %rd4, %rd52; ld.global.nc.f32 %f274, [%rd55]; BB0_34: mul.f32 %f174, %f273, %f273; fma.rn.f32 %f175, %f272, %f272, %f174; fma.rn.f32 %f49, %f274, %f274, %f175; setp.eq.f32 %p41, %f49, 0f00000000; mov.u16 %rs40, %rs1; @%p41 bra BB0_36; cvt.s64.s32 %rd56, %r30; add.s64 %rd57, %rd3, %rd56; ld.global.nc.u8 %rs40, [%rd57]; BB0_36: setp.gt.u16 %p42, %rs40, %rs1; cvt.u32.u16 %r92, %rs40; and.b32 %r93, %r92, 255; selp.b32 %r94, %r7, %r93, %p42; selp.b32 %r95, %r93, %r7, %p42; add.s32 %r96, %r95, 1; mul.lo.s32 %r97, %r96, %r95; shr.u32 %r98, %r97, 1; add.s32 %r31, %r98, %r94; and.pred %p45, %p41, %p13; @%p45 bra BB0_38; mul.wide.s32 %rd58, %r31, 4; add.s64 %rd59, %rd2, %rd58; ld.global.nc.f32 %f176, [%rd59]; add.f32 %f177, %f176, %f176; add.s64 %rd60, %rd1, %rd58; ld.global.nc.f32 %f178, [%rd60]; div.rn.f32 %f179, %f178, %f177; mul.f32 %f180, %f179, %f88; fma.rn.f32 %f181, %f3, %f180, %f1; mul.f32 %f182, %f1, %f180; sub.f32 %f183, %f3, %f182; selp.f32 %f184, %f181, %f272, %p41; selp.f32 %f185, %f2, %f273, %p41; selp.f32 %f186, %f183, %f274, %p41; mul.f32 %f187, %f88, %f88; div.rn.f32 %f188, %f177, %f187; sub.f32 %f189, %f184, %f1; sub.f32 %f190, %f185, %f2; sub.f32 %f191, %f186, %f3; fma.rn.f32 %f192, %f189, %f188, %f263; fma.rn.f32 %f264, %f190, %f188, %f264; fma.rn.f32 %f193, %f191, %f188, %f265; div.rn.f32 %f194, %f178, %f88; mul.f32 %f195, %f186, %f194; sub.f32 %f263, %f192, %f195; fma.rn.f32 %f265, %f184, %f194, %f193; BB0_38: setp.eq.s32 %p47, %r45, 1; @%p47 bra BB0_57; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p48, %rs12, 0; add.s32 %r32, %r3, -1; @%p48 bra BB0_41; rem.s32 %r99, %r32, %r45; 
add.s32 %r100, %r99, %r45; rem.s32 %r126, %r100, %r45; bra.uni BB0_42; BB0_41: mov.u32 %r101, 0; max.s32 %r126, %r32, %r101; BB0_42: mad.lo.s32 %r102, %r126, %r44, %r2; mad.lo.s32 %r36, %r102, %r43, %r1; setp.lt.s32 %p50, %r32, 0; mov.f32 %f278, 0f00000000; and.pred %p51, %p50, %p48; mov.f32 %f279, %f278; mov.f32 %f280, %f278; @%p51 bra BB0_44; mul.wide.s32 %rd61, %r36, 4; add.s64 %rd62, %rd6, %rd61; ld.global.nc.f32 %f278, [%rd62]; add.s64 %rd63, %rd5, %rd61; ld.global.nc.f32 %f279, [%rd63]; add.s64 %rd64, %rd4, %rd61; ld.global.nc.f32 %f280, [%rd64]; BB0_44: mul.f32 %f199, %f279, %f279; fma.rn.f32 %f200, %f278, %f278, %f199; fma.rn.f32 %f62, %f280, %f280, %f200; setp.eq.f32 %p52, %f62, 0f00000000; mov.u16 %rs41, %rs1; @%p52 bra BB0_46; cvt.s64.s32 %rd65, %r36; add.s64 %rd66, %rd3, %rd65; ld.global.nc.u8 %rs41, [%rd66]; BB0_46: setp.gt.u16 %p53, %rs41, %rs1; cvt.u32.u16 %r103, %rs41; and.b32 %r104, %r103, 255; selp.b32 %r105, %r7, %r104, %p53; selp.b32 %r106, %r104, %r7, %p53; add.s32 %r107, %r106, 1; mul.lo.s32 %r108, %r107, %r106; shr.u32 %r109, %r108, 1; add.s32 %r37, %r109, %r105; and.pred %p56, %p52, %p13; @%p56 bra BB0_48; mul.wide.s32 %rd67, %r37, 4; add.s64 %rd68, %rd2, %rd67; ld.global.nc.f32 %f201, [%rd68]; add.f32 %f202, %f201, %f201; add.s64 %rd69, %rd1, %rd67; ld.global.nc.f32 %f203, [%rd69]; div.rn.f32 %f204, %f203, %f202; mul.f32 %f205, %f204, %f89; fma.rn.f32 %f206, %f2, %f205, %f1; mul.f32 %f207, %f1, %f205; sub.f32 %f208, %f2, %f207; selp.f32 %f209, %f206, %f278, %p52; selp.f32 %f210, %f208, %f279, %p52; selp.f32 %f211, %f3, %f280, %p52; mul.f32 %f212, %f89, %f89; div.rn.f32 %f213, %f202, %f212; sub.f32 %f214, %f209, %f1; sub.f32 %f215, %f210, %f2; sub.f32 %f216, %f211, %f3; fma.rn.f32 %f217, %f214, %f213, %f263; fma.rn.f32 %f218, %f215, %f213, %f264; fma.rn.f32 %f265, %f216, %f213, %f265; div.rn.f32 %f219, %f203, %f89; mul.f32 %f220, %f210, %f219; sub.f32 %f263, %f217, %f220; fma.rn.f32 %f264, %f209, %f219, %f218; BB0_48: add.s32 %r38, %r3, 1; 
@%p48 bra BB0_50; rem.s32 %r110, %r38, %r45; add.s32 %r111, %r110, %r45; rem.s32 %r127, %r111, %r45; bra.uni BB0_51; BB0_50: add.s32 %r112, %r45, -1; min.s32 %r127, %r38, %r112; BB0_51: mad.lo.s32 %r113, %r127, %r44, %r2; mad.lo.s32 %r42, %r113, %r43, %r1; setp.ge.s32 %p59, %r38, %r45; mov.f32 %f284, 0f00000000; and.pred %p61, %p59, %p48; mov.f32 %f285, %f284; mov.f32 %f286, %f284; @%p61 bra BB0_53; mul.wide.s32 %rd70, %r42, 4; add.s64 %rd71, %rd6, %rd70; ld.global.nc.f32 %f286, [%rd71]; add.s64 %rd72, %rd5, %rd70; ld.global.nc.f32 %f285, [%rd72]; add.s64 %rd73, %rd4, %rd70; ld.global.nc.f32 %f284, [%rd73]; BB0_53: mul.f32 %f224, %f286, %f286; fma.rn.f32 %f225, %f285, %f285, %f224; fma.rn.f32 %f75, %f284, %f284, %f225; setp.eq.f32 %p62, %f75, 0f00000000; mov.u16 %rs42, %rs1; @%p62 bra BB0_55; cvt.s64.s32 %rd74, %r42; add.s64 %rd75, %rd3, %rd74; ld.global.nc.u8 %rs42, [%rd75]; BB0_55: setp.gt.u16 %p63, %rs42, %rs1; cvt.u32.u16 %r114, %rs42; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p63; selp.b32 %r117, %r115, %r7, %p63; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd76, %r121, 4; add.s64 %rd7, %rd2, %rd76; add.s64 %rd8, %rd1, %rd76; and.pred %p66, %p62, %p13; @%p66 bra BB0_57; ld.global.nc.f32 %f226, [%rd7]; add.f32 %f227, %f226, %f226; ld.global.nc.f32 %f228, [%rd8]; div.rn.f32 %f229, %f228, %f227; mul.f32 %f230, %f229, %f89; mul.f32 %f231, %f2, %f230; sub.f32 %f232, %f1, %f231; fma.rn.f32 %f233, %f1, %f230, %f2; selp.f32 %f234, %f3, %f284, %p62; selp.f32 %f235, %f233, %f285, %p62; selp.f32 %f236, %f232, %f286, %p62; mul.f32 %f237, %f89, %f89; div.rn.f32 %f238, %f227, %f237; sub.f32 %f239, %f236, %f1; sub.f32 %f240, %f235, %f2; sub.f32 %f241, %f234, %f3; fma.rn.f32 %f242, %f239, %f238, %f263; fma.rn.f32 %f243, %f240, %f238, %f264; fma.rn.f32 %f265, %f241, %f238, %f265; div.rn.f32 %f244, %f228, %f89; fma.rn.f32 %f263, %f235, %f244, %f242; mul.f32 %f245, %f236, %f244; 
sub.f32 %f264, %f243, %f245; BB0_57: setp.eq.s64 %p68, %rd12, 0; @%p68 bra BB0_59; cvta.to.global.u64 %rd77, %rd12; add.s64 %rd79, %rd77, %rd19; ld.global.nc.f32 %f246, [%rd79]; mul.f32 %f290, %f246, %f290; BB0_59: setp.eq.f32 %p69, %f290, 0f00000000; mov.f32 %f291, 0f00000000; @%p69 bra BB0_61; rcp.rn.f32 %f291, %f290; BB0_61: cvta.to.global.u64 %rd80, %rd11; cvta.to.global.u64 %rd81, %rd10; cvta.to.global.u64 %rd82, %rd9; add.s64 %rd84, %rd82, %rd19; ld.global.f32 %f248, [%rd84]; fma.rn.f32 %f249, %f263, %f291, %f248; st.global.f32 [%rd84], %f249; add.s64 %rd85, %rd81, %rd19; ld.global.f32 %f250, [%rd85]; fma.rn.f32 %f251, %f264, %f291, %f250; st.global.f32 [%rd85], %f251; add.s64 %rd86, %rd80, %rd19; ld.global.f32 %f252, [%rd86]; fma.rn.f32 %f253, %f265, %f291, %f252; st.global.f32 [%rd86], %f253; BB0_62: ret; } ` adddmibulk_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<70>; .reg .b16 %rs<43>; .reg .f32 %f<292>; .reg .b32 %r<128>; .reg .b64 %rd<87>; ld.param.u64 %rd9, [adddmibulk_param_0]; ld.param.u64 %rd10, [adddmibulk_param_1]; ld.param.u64 %rd11, [adddmibulk_param_2]; ld.param.u64 %rd13, [adddmibulk_param_3]; ld.param.u64 %rd14, [adddmibulk_param_4]; ld.param.u64 %rd15, [adddmibulk_param_5]; ld.param.u64 %rd12, [adddmibulk_param_6]; ld.param.f32 %f290, [adddmibulk_param_7]; 
ld.param.u64 %rd16, [adddmibulk_param_8]; ld.param.u64 %rd17, [adddmibulk_param_9]; ld.param.u64 %rd18, [adddmibulk_param_10]; ld.param.f32 %f87, [adddmibulk_param_11]; ld.param.f32 %f88, [adddmibulk_param_12]; ld.param.f32 %f89, [adddmibulk_param_13]; ld.param.u32 %r43, [adddmibulk_param_14]; ld.param.u32 %r44, [adddmibulk_param_15]; ld.param.u32 %r45, [adddmibulk_param_16]; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; cvta.to.global.u64 %rd1, %rd17; cvta.to.global.u64 %rd2, %rd16; cvta.to.global.u64 %rd3, %rd18; cvta.to.global.u64 %rd4, %rd15; cvta.to.global.u64 %rd5, %rd14; cvta.to.global.u64 %rd6, %rd13; mov.u32 %r46, %ntid.x; mov.u32 %r47, %ctaid.x; mov.u32 %r48, %tid.x; mad.lo.s32 %r1, %r46, %r47, %r48; mov.u32 %r49, %ntid.y; mov.u32 %r50, %ctaid.y; mov.u32 %r51, %tid.y; mad.lo.s32 %r2, %r49, %r50, %r51; mov.u32 %r52, %ntid.z; mov.u32 %r53, %ctaid.z; mov.u32 %r54, %tid.z; mad.lo.s32 %r3, %r52, %r53, %r54; setp.ge.s32 %p1, %r2, %r44; setp.ge.s32 %p2, %r1, %r43; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r45; or.pred %p5, %p3, %p4; @%p5 bra BB0_62; mul.lo.s32 %r4, %r3, %r44; add.s32 %r55, %r4, %r2; mul.lo.s32 %r5, %r55, %r43; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd19, %r6, 4; add.s64 %rd20, %rd6, %rd19; cvt.s64.s32 %rd21, %r6; add.s64 %rd22, %rd5, %rd19; add.s64 %rd23, %rd4, %rd19; add.s64 %rd24, %rd3, %rd21; ld.global.nc.u8 %rs1, [%rd24]; cvt.u32.u16 %r56, %rs1; and.b32 %r7, %r56, 255; ld.global.nc.f32 %f1, [%rd20]; ld.global.nc.f32 %f2, [%rd22]; mul.f32 %f90, %f2, %f2; fma.rn.f32 %f91, %f1, %f1, %f90; ld.global.nc.f32 %f3, [%rd23]; fma.rn.f32 %f92, %f3, %f3, %f91; setp.eq.f32 %p6, %f92, 0f00000000; @%p6 bra BB0_62; and.b16 %rs2, %rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r57, %r8, %r43; add.s32 %r58, %r57, %r43; rem.s32 %r122, %r58, %r43; bra.uni BB0_5; BB0_4: mov.u32 %r59, 0; max.s32 %r122, %r8, %r59; BB0_5: add.s32 %r12, %r122, %r5; setp.lt.s32 %p9, %r8, 0; mov.f32 %f254, 
0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f255, %f254; mov.f32 %f256, %f254; @%p10 bra BB0_7; mul.wide.s32 %rd25, %r12, 4; add.s64 %rd26, %rd6, %rd25; ld.global.nc.f32 %f254, [%rd26]; add.s64 %rd27, %rd5, %rd25; ld.global.nc.f32 %f255, [%rd27]; add.s64 %rd28, %rd4, %rd25; ld.global.nc.f32 %f256, [%rd28]; BB0_7: mul.f32 %f96, %f255, %f255; fma.rn.f32 %f97, %f254, %f254, %f96; fma.rn.f32 %f10, %f256, %f256, %f97; setp.eq.f32 %p11, %f10, 0f00000000; mov.u16 %rs37, %rs1; @%p11 bra BB0_9; cvt.s64.s32 %rd29, %r12; add.s64 %rd30, %rd3, %rd29; ld.global.nc.u8 %rs37, [%rd30]; BB0_9: setp.gt.u16 %p12, %rs37, %rs1; cvt.u32.u16 %r60, %rs37; and.b32 %r61, %r60, 255; selp.b32 %r62, %r7, %r61, %p12; selp.b32 %r63, %r61, %r7, %p12; add.s32 %r64, %r63, 1; mul.lo.s32 %r65, %r64, %r63; shr.u32 %r66, %r65, 1; add.s32 %r13, %r66, %r62; setp.ne.s16 %p13, %rs18, 0; mov.f32 %f263, 0f00000000; and.pred %p15, %p11, %p13; mov.f32 %f264, %f263; mov.f32 %f265, %f263; @%p15 bra BB0_11; mul.wide.s32 %rd31, %r13, 4; add.s64 %rd32, %rd2, %rd31; ld.global.nc.f32 %f101, [%rd32]; add.f32 %f102, %f101, %f101; add.s64 %rd33, %rd1, %rd31; ld.global.nc.f32 %f103, [%rd33]; div.rn.f32 %f104, %f103, %f102; mul.f32 %f105, %f104, %f87; fma.rn.f32 %f106, %f3, %f105, %f2; mul.f32 %f107, %f2, %f105; sub.f32 %f108, %f3, %f107; selp.f32 %f109, %f1, %f254, %p11; selp.f32 %f110, %f106, %f255, %p11; selp.f32 %f111, %f108, %f256, %p11; mul.f32 %f112, %f87, %f87; div.rn.f32 %f113, %f102, %f112; sub.f32 %f114, %f109, %f1; sub.f32 %f115, %f110, %f2; sub.f32 %f116, %f111, %f3; fma.rn.f32 %f263, %f114, %f113, 0f00000000; fma.rn.f32 %f117, %f115, %f113, 0f00000000; fma.rn.f32 %f118, %f116, %f113, 0f00000000; div.rn.f32 %f119, %f103, %f87; mul.f32 %f120, %f111, %f119; sub.f32 %f264, %f117, %f120; fma.rn.f32 %f265, %f110, %f119, %f118; BB0_11: add.s32 %r14, %r1, 1; @%p7 bra BB0_13; rem.s32 %r67, %r14, %r43; add.s32 %r68, %r67, %r43; rem.s32 %r123, %r68, %r43; bra.uni BB0_14; BB0_13: add.s32 %r69, %r43, -1; min.s32 %r123, 
%r14, %r69; BB0_14: add.s32 %r18, %r123, %r5; setp.ge.s32 %p18, %r14, %r43; mov.f32 %f260, 0f00000000; and.pred %p20, %p18, %p7; mov.f32 %f261, %f260; mov.f32 %f262, %f260; @%p20 bra BB0_16; mul.wide.s32 %rd34, %r18, 4; add.s64 %rd35, %rd6, %rd34; ld.global.nc.f32 %f260, [%rd35]; add.s64 %rd36, %rd5, %rd34; ld.global.nc.f32 %f261, [%rd36]; add.s64 %rd37, %rd4, %rd34; ld.global.nc.f32 %f262, [%rd37]; BB0_16: mul.f32 %f124, %f261, %f261; fma.rn.f32 %f125, %f260, %f260, %f124; fma.rn.f32 %f23, %f262, %f262, %f125; setp.eq.f32 %p21, %f23, 0f00000000; mov.u16 %rs38, %rs1; @%p21 bra BB0_18; cvt.s64.s32 %rd38, %r18; add.s64 %rd39, %rd3, %rd38; ld.global.nc.u8 %rs38, [%rd39]; BB0_18: setp.gt.u16 %p22, %rs38, %rs1; cvt.u32.u16 %r70, %rs38; and.b32 %r71, %r70, 255; selp.b32 %r72, %r7, %r71, %p22; selp.b32 %r73, %r71, %r7, %p22; add.s32 %r74, %r73, 1; mul.lo.s32 %r75, %r74, %r73; shr.u32 %r76, %r75, 1; add.s32 %r19, %r76, %r72; and.pred %p25, %p21, %p13; @%p25 bra BB0_20; mul.wide.s32 %rd40, %r19, 4; add.s64 %rd41, %rd2, %rd40; ld.global.nc.f32 %f126, [%rd41]; add.f32 %f127, %f126, %f126; add.s64 %rd42, %rd1, %rd40; ld.global.nc.f32 %f128, [%rd42]; div.rn.f32 %f129, %f128, %f127; mul.f32 %f130, %f129, %f87; mul.f32 %f131, %f3, %f130; sub.f32 %f132, %f2, %f131; fma.rn.f32 %f133, %f2, %f130, %f3; selp.f32 %f134, %f1, %f260, %p21; selp.f32 %f135, %f132, %f261, %p21; selp.f32 %f136, %f133, %f262, %p21; mul.f32 %f137, %f87, %f87; div.rn.f32 %f138, %f127, %f137; sub.f32 %f139, %f134, %f1; sub.f32 %f140, %f135, %f2; sub.f32 %f141, %f136, %f3; fma.rn.f32 %f263, %f139, %f138, %f263; fma.rn.f32 %f142, %f140, %f138, %f264; fma.rn.f32 %f143, %f141, %f138, %f265; div.rn.f32 %f144, %f128, %f87; fma.rn.f32 %f264, %f136, %f144, %f142; mul.f32 %f145, %f135, %f144; sub.f32 %f265, %f143, %f145; BB0_20: and.b16 %rs7, %rs17, 2; setp.eq.s16 %p27, %rs7, 0; add.s32 %r20, %r2, -1; @%p27 bra BB0_22; rem.s32 %r77, %r20, %r44; add.s32 %r78, %r77, %r44; rem.s32 %r124, %r78, %r44; bra.uni BB0_23; BB0_22: 
mov.u32 %r79, 0; max.s32 %r124, %r20, %r79; BB0_23: add.s32 %r80, %r124, %r4; mad.lo.s32 %r24, %r80, %r43, %r1; setp.lt.s32 %p29, %r20, 0; mov.f32 %f266, 0f00000000; and.pred %p30, %p29, %p27; mov.f32 %f267, %f266; mov.f32 %f268, %f266; @%p30 bra BB0_25; mul.wide.s32 %rd43, %r24, 4; add.s64 %rd44, %rd6, %rd43; ld.global.nc.f32 %f266, [%rd44]; add.s64 %rd45, %rd5, %rd43; ld.global.nc.f32 %f267, [%rd45]; add.s64 %rd46, %rd4, %rd43; ld.global.nc.f32 %f268, [%rd46]; BB0_25: mul.f32 %f149, %f267, %f267; fma.rn.f32 %f150, %f266, %f266, %f149; fma.rn.f32 %f36, %f268, %f268, %f150; setp.eq.f32 %p31, %f36, 0f00000000; mov.u16 %rs39, %rs1; @%p31 bra BB0_27; cvt.s64.s32 %rd47, %r24; add.s64 %rd48, %rd3, %rd47; ld.global.nc.u8 %rs39, [%rd48]; BB0_27: setp.gt.u16 %p32, %rs39, %rs1; cvt.u32.u16 %r81, %rs39; and.b32 %r82, %r81, 255; selp.b32 %r83, %r7, %r82, %p32; selp.b32 %r84, %r82, %r7, %p32; add.s32 %r85, %r84, 1; mul.lo.s32 %r86, %r85, %r84; shr.u32 %r87, %r86, 1; add.s32 %r25, %r87, %r83; and.pred %p35, %p31, %p13; @%p35 bra BB0_29; mul.wide.s32 %rd49, %r25, 4; add.s64 %rd50, %rd2, %rd49; ld.global.nc.f32 %f151, [%rd50]; add.f32 %f152, %f151, %f151; add.s64 %rd51, %rd1, %rd49; ld.global.nc.f32 %f153, [%rd51]; div.rn.f32 %f154, %f153, %f152; mul.f32 %f155, %f154, %f88; mul.f32 %f156, %f3, %f155; sub.f32 %f157, %f1, %f156; fma.rn.f32 %f158, %f1, %f155, %f3; selp.f32 %f159, %f157, %f266, %p31; selp.f32 %f160, %f2, %f267, %p31; selp.f32 %f161, %f158, %f268, %p31; mul.f32 %f162, %f88, %f88; div.rn.f32 %f163, %f152, %f162; sub.f32 %f164, %f159, %f1; sub.f32 %f165, %f160, %f2; sub.f32 %f166, %f161, %f3; fma.rn.f32 %f167, %f164, %f163, %f263; fma.rn.f32 %f264, %f165, %f163, %f264; fma.rn.f32 %f168, %f166, %f163, %f265; div.rn.f32 %f169, %f153, %f88; fma.rn.f32 %f263, %f161, %f169, %f167; mul.f32 %f170, %f159, %f169; sub.f32 %f265, %f168, %f170; BB0_29: add.s32 %r26, %r2, 1; @%p27 bra BB0_31; rem.s32 %r88, %r26, %r44; add.s32 %r89, %r88, %r44; rem.s32 %r125, %r89, %r44; bra.uni 
BB0_32; BB0_31: add.s32 %r90, %r44, -1; min.s32 %r125, %r26, %r90; BB0_32: add.s32 %r91, %r125, %r4; mad.lo.s32 %r30, %r91, %r43, %r1; setp.ge.s32 %p38, %r26, %r44; mov.f32 %f272, 0f00000000; and.pred %p40, %p38, %p27; mov.f32 %f273, %f272; mov.f32 %f274, %f272; @%p40 bra BB0_34; mul.wide.s32 %rd52, %r30, 4; add.s64 %rd53, %rd6, %rd52; ld.global.nc.f32 %f272, [%rd53]; add.s64 %rd54, %rd5, %rd52; ld.global.nc.f32 %f273, [%rd54]; add.s64 %rd55, %rd4, %rd52; ld.global.nc.f32 %f274, [%rd55]; BB0_34: mul.f32 %f174, %f273, %f273; fma.rn.f32 %f175, %f272, %f272, %f174; fma.rn.f32 %f49, %f274, %f274, %f175; setp.eq.f32 %p41, %f49, 0f00000000; mov.u16 %rs40, %rs1; @%p41 bra BB0_36; cvt.s64.s32 %rd56, %r30; add.s64 %rd57, %rd3, %rd56; ld.global.nc.u8 %rs40, [%rd57]; BB0_36: setp.gt.u16 %p42, %rs40, %rs1; cvt.u32.u16 %r92, %rs40; and.b32 %r93, %r92, 255; selp.b32 %r94, %r7, %r93, %p42; selp.b32 %r95, %r93, %r7, %p42; add.s32 %r96, %r95, 1; mul.lo.s32 %r97, %r96, %r95; shr.u32 %r98, %r97, 1; add.s32 %r31, %r98, %r94; and.pred %p45, %p41, %p13; @%p45 bra BB0_38; mul.wide.s32 %rd58, %r31, 4; add.s64 %rd59, %rd2, %rd58; ld.global.nc.f32 %f176, [%rd59]; add.f32 %f177, %f176, %f176; add.s64 %rd60, %rd1, %rd58; ld.global.nc.f32 %f178, [%rd60]; div.rn.f32 %f179, %f178, %f177; mul.f32 %f180, %f179, %f88; fma.rn.f32 %f181, %f3, %f180, %f1; mul.f32 %f182, %f1, %f180; sub.f32 %f183, %f3, %f182; selp.f32 %f184, %f181, %f272, %p41; selp.f32 %f185, %f2, %f273, %p41; selp.f32 %f186, %f183, %f274, %p41; mul.f32 %f187, %f88, %f88; div.rn.f32 %f188, %f177, %f187; sub.f32 %f189, %f184, %f1; sub.f32 %f190, %f185, %f2; sub.f32 %f191, %f186, %f3; fma.rn.f32 %f192, %f189, %f188, %f263; fma.rn.f32 %f264, %f190, %f188, %f264; fma.rn.f32 %f193, %f191, %f188, %f265; div.rn.f32 %f194, %f178, %f88; mul.f32 %f195, %f186, %f194; sub.f32 %f263, %f192, %f195; fma.rn.f32 %f265, %f184, %f194, %f193; BB0_38: setp.eq.s32 %p47, %r45, 1; @%p47 bra BB0_57; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p48, %rs12, 0; add.s32 
%r32, %r3, -1; @%p48 bra BB0_41; rem.s32 %r99, %r32, %r45; add.s32 %r100, %r99, %r45; rem.s32 %r126, %r100, %r45; bra.uni BB0_42; BB0_41: mov.u32 %r101, 0; max.s32 %r126, %r32, %r101; BB0_42: mad.lo.s32 %r102, %r126, %r44, %r2; mad.lo.s32 %r36, %r102, %r43, %r1; setp.lt.s32 %p50, %r32, 0; mov.f32 %f278, 0f00000000; and.pred %p51, %p50, %p48; mov.f32 %f279, %f278; mov.f32 %f280, %f278; @%p51 bra BB0_44; mul.wide.s32 %rd61, %r36, 4; add.s64 %rd62, %rd6, %rd61; ld.global.nc.f32 %f278, [%rd62]; add.s64 %rd63, %rd5, %rd61; ld.global.nc.f32 %f279, [%rd63]; add.s64 %rd64, %rd4, %rd61; ld.global.nc.f32 %f280, [%rd64]; BB0_44: mul.f32 %f199, %f279, %f279; fma.rn.f32 %f200, %f278, %f278, %f199; fma.rn.f32 %f62, %f280, %f280, %f200; setp.eq.f32 %p52, %f62, 0f00000000; mov.u16 %rs41, %rs1; @%p52 bra BB0_46; cvt.s64.s32 %rd65, %r36; add.s64 %rd66, %rd3, %rd65; ld.global.nc.u8 %rs41, [%rd66]; BB0_46: setp.gt.u16 %p53, %rs41, %rs1; cvt.u32.u16 %r103, %rs41; and.b32 %r104, %r103, 255; selp.b32 %r105, %r7, %r104, %p53; selp.b32 %r106, %r104, %r7, %p53; add.s32 %r107, %r106, 1; mul.lo.s32 %r108, %r107, %r106; shr.u32 %r109, %r108, 1; add.s32 %r37, %r109, %r105; and.pred %p56, %p52, %p13; @%p56 bra BB0_48; mul.wide.s32 %rd67, %r37, 4; add.s64 %rd68, %rd2, %rd67; ld.global.nc.f32 %f201, [%rd68]; add.f32 %f202, %f201, %f201; add.s64 %rd69, %rd1, %rd67; ld.global.nc.f32 %f203, [%rd69]; div.rn.f32 %f204, %f203, %f202; mul.f32 %f205, %f204, %f89; fma.rn.f32 %f206, %f2, %f205, %f1; mul.f32 %f207, %f1, %f205; sub.f32 %f208, %f2, %f207; selp.f32 %f209, %f206, %f278, %p52; selp.f32 %f210, %f208, %f279, %p52; selp.f32 %f211, %f3, %f280, %p52; mul.f32 %f212, %f89, %f89; div.rn.f32 %f213, %f202, %f212; sub.f32 %f214, %f209, %f1; sub.f32 %f215, %f210, %f2; sub.f32 %f216, %f211, %f3; fma.rn.f32 %f217, %f214, %f213, %f263; fma.rn.f32 %f218, %f215, %f213, %f264; fma.rn.f32 %f265, %f216, %f213, %f265; div.rn.f32 %f219, %f203, %f89; mul.f32 %f220, %f210, %f219; sub.f32 %f263, %f217, %f220; fma.rn.f32 
%f264, %f209, %f219, %f218; BB0_48: add.s32 %r38, %r3, 1; @%p48 bra BB0_50; rem.s32 %r110, %r38, %r45; add.s32 %r111, %r110, %r45; rem.s32 %r127, %r111, %r45; bra.uni BB0_51; BB0_50: add.s32 %r112, %r45, -1; min.s32 %r127, %r38, %r112; BB0_51: mad.lo.s32 %r113, %r127, %r44, %r2; mad.lo.s32 %r42, %r113, %r43, %r1; setp.ge.s32 %p59, %r38, %r45; mov.f32 %f284, 0f00000000; and.pred %p61, %p59, %p48; mov.f32 %f285, %f284; mov.f32 %f286, %f284; @%p61 bra BB0_53; mul.wide.s32 %rd70, %r42, 4; add.s64 %rd71, %rd6, %rd70; ld.global.nc.f32 %f286, [%rd71]; add.s64 %rd72, %rd5, %rd70; ld.global.nc.f32 %f285, [%rd72]; add.s64 %rd73, %rd4, %rd70; ld.global.nc.f32 %f284, [%rd73]; BB0_53: mul.f32 %f224, %f286, %f286; fma.rn.f32 %f225, %f285, %f285, %f224; fma.rn.f32 %f75, %f284, %f284, %f225; setp.eq.f32 %p62, %f75, 0f00000000; mov.u16 %rs42, %rs1; @%p62 bra BB0_55; cvt.s64.s32 %rd74, %r42; add.s64 %rd75, %rd3, %rd74; ld.global.nc.u8 %rs42, [%rd75]; BB0_55: setp.gt.u16 %p63, %rs42, %rs1; cvt.u32.u16 %r114, %rs42; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p63; selp.b32 %r117, %r115, %r7, %p63; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd76, %r121, 4; add.s64 %rd7, %rd2, %rd76; add.s64 %rd8, %rd1, %rd76; and.pred %p66, %p62, %p13; @%p66 bra BB0_57; ld.global.nc.f32 %f226, [%rd7]; add.f32 %f227, %f226, %f226; ld.global.nc.f32 %f228, [%rd8]; div.rn.f32 %f229, %f228, %f227; mul.f32 %f230, %f229, %f89; mul.f32 %f231, %f2, %f230; sub.f32 %f232, %f1, %f231; fma.rn.f32 %f233, %f1, %f230, %f2; selp.f32 %f234, %f3, %f284, %p62; selp.f32 %f235, %f233, %f285, %p62; selp.f32 %f236, %f232, %f286, %p62; mul.f32 %f237, %f89, %f89; div.rn.f32 %f238, %f227, %f237; sub.f32 %f239, %f236, %f1; sub.f32 %f240, %f235, %f2; sub.f32 %f241, %f234, %f3; fma.rn.f32 %f242, %f239, %f238, %f263; fma.rn.f32 %f243, %f240, %f238, %f264; fma.rn.f32 %f265, %f241, %f238, %f265; div.rn.f32 %f244, %f228, %f89; fma.rn.f32 
%f263, %f235, %f244, %f242; mul.f32 %f245, %f236, %f244; sub.f32 %f264, %f243, %f245; BB0_57: setp.eq.s64 %p68, %rd12, 0; @%p68 bra BB0_59; cvta.to.global.u64 %rd77, %rd12; add.s64 %rd79, %rd77, %rd19; ld.global.nc.f32 %f246, [%rd79]; mul.f32 %f290, %f246, %f290; BB0_59: setp.eq.f32 %p69, %f290, 0f00000000; mov.f32 %f291, 0f00000000; @%p69 bra BB0_61; rcp.rn.f32 %f291, %f290; BB0_61: cvta.to.global.u64 %rd80, %rd11; cvta.to.global.u64 %rd81, %rd10; cvta.to.global.u64 %rd82, %rd9; add.s64 %rd84, %rd82, %rd19; ld.global.f32 %f248, [%rd84]; fma.rn.f32 %f249, %f263, %f291, %f248; st.global.f32 [%rd84], %f249; add.s64 %rd85, %rd81, %rd19; ld.global.f32 %f250, [%rd85]; fma.rn.f32 %f251, %f264, %f291, %f250; st.global.f32 [%rd85], %f251; add.s64 %rd86, %rd80, %rd19; ld.global.f32 %f252, [%rd86]; fma.rn.f32 %f253, %f265, %f291, %f252; st.global.f32 [%rd86], %f253; BB0_62: ret; } ` adddmibulk_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<70>; .reg .b16 %rs<43>; .reg .f32 %f<292>; .reg .b32 %r<128>; .reg .b64 %rd<87>; ld.param.u64 %rd9, [adddmibulk_param_0]; ld.param.u64 %rd10, [adddmibulk_param_1]; ld.param.u64 %rd11, [adddmibulk_param_2]; ld.param.u64 %rd13, [adddmibulk_param_3]; ld.param.u64 %rd14, [adddmibulk_param_4]; ld.param.u64 %rd15, [adddmibulk_param_5]; ld.param.u64 %rd12, 
[adddmibulk_param_6]; ld.param.f32 %f290, [adddmibulk_param_7]; ld.param.u64 %rd16, [adddmibulk_param_8]; ld.param.u64 %rd17, [adddmibulk_param_9]; ld.param.u64 %rd18, [adddmibulk_param_10]; ld.param.f32 %f87, [adddmibulk_param_11]; ld.param.f32 %f88, [adddmibulk_param_12]; ld.param.f32 %f89, [adddmibulk_param_13]; ld.param.u32 %r43, [adddmibulk_param_14]; ld.param.u32 %r44, [adddmibulk_param_15]; ld.param.u32 %r45, [adddmibulk_param_16]; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; cvta.to.global.u64 %rd1, %rd17; cvta.to.global.u64 %rd2, %rd16; cvta.to.global.u64 %rd3, %rd18; cvta.to.global.u64 %rd4, %rd15; cvta.to.global.u64 %rd5, %rd14; cvta.to.global.u64 %rd6, %rd13; mov.u32 %r46, %ntid.x; mov.u32 %r47, %ctaid.x; mov.u32 %r48, %tid.x; mad.lo.s32 %r1, %r46, %r47, %r48; mov.u32 %r49, %ntid.y; mov.u32 %r50, %ctaid.y; mov.u32 %r51, %tid.y; mad.lo.s32 %r2, %r49, %r50, %r51; mov.u32 %r52, %ntid.z; mov.u32 %r53, %ctaid.z; mov.u32 %r54, %tid.z; mad.lo.s32 %r3, %r52, %r53, %r54; setp.ge.s32 %p1, %r2, %r44; setp.ge.s32 %p2, %r1, %r43; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r45; or.pred %p5, %p3, %p4; @%p5 bra BB0_62; mul.lo.s32 %r4, %r3, %r44; add.s32 %r55, %r4, %r2; mul.lo.s32 %r5, %r55, %r43; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd19, %r6, 4; add.s64 %rd20, %rd6, %rd19; cvt.s64.s32 %rd21, %r6; add.s64 %rd22, %rd5, %rd19; add.s64 %rd23, %rd4, %rd19; add.s64 %rd24, %rd3, %rd21; ld.global.nc.u8 %rs1, [%rd24]; cvt.u32.u16 %r56, %rs1; and.b32 %r7, %r56, 255; ld.global.nc.f32 %f1, [%rd20]; ld.global.nc.f32 %f2, [%rd22]; mul.f32 %f90, %f2, %f2; fma.rn.f32 %f91, %f1, %f1, %f90; ld.global.nc.f32 %f3, [%rd23]; fma.rn.f32 %f92, %f3, %f3, %f91; setp.eq.f32 %p6, %f92, 0f00000000; @%p6 bra BB0_62; and.b16 %rs2, %rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r57, %r8, %r43; add.s32 %r58, %r57, %r43; rem.s32 %r122, %r58, %r43; bra.uni BB0_5; BB0_4: mov.u32 %r59, 0; max.s32 %r122, %r8, %r59; BB0_5: 
add.s32 %r12, %r122, %r5; setp.lt.s32 %p9, %r8, 0; mov.f32 %f254, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f255, %f254; mov.f32 %f256, %f254; @%p10 bra BB0_7; mul.wide.s32 %rd25, %r12, 4; add.s64 %rd26, %rd6, %rd25; ld.global.nc.f32 %f254, [%rd26]; add.s64 %rd27, %rd5, %rd25; ld.global.nc.f32 %f255, [%rd27]; add.s64 %rd28, %rd4, %rd25; ld.global.nc.f32 %f256, [%rd28]; BB0_7: mul.f32 %f96, %f255, %f255; fma.rn.f32 %f97, %f254, %f254, %f96; fma.rn.f32 %f10, %f256, %f256, %f97; setp.eq.f32 %p11, %f10, 0f00000000; mov.u16 %rs37, %rs1; @%p11 bra BB0_9; cvt.s64.s32 %rd29, %r12; add.s64 %rd30, %rd3, %rd29; ld.global.nc.u8 %rs37, [%rd30]; BB0_9: setp.gt.u16 %p12, %rs37, %rs1; cvt.u32.u16 %r60, %rs37; and.b32 %r61, %r60, 255; selp.b32 %r62, %r7, %r61, %p12; selp.b32 %r63, %r61, %r7, %p12; add.s32 %r64, %r63, 1; mul.lo.s32 %r65, %r64, %r63; shr.u32 %r66, %r65, 1; add.s32 %r13, %r66, %r62; setp.ne.s16 %p13, %rs18, 0; mov.f32 %f263, 0f00000000; and.pred %p15, %p11, %p13; mov.f32 %f264, %f263; mov.f32 %f265, %f263; @%p15 bra BB0_11; mul.wide.s32 %rd31, %r13, 4; add.s64 %rd32, %rd2, %rd31; ld.global.nc.f32 %f101, [%rd32]; add.f32 %f102, %f101, %f101; add.s64 %rd33, %rd1, %rd31; ld.global.nc.f32 %f103, [%rd33]; div.rn.f32 %f104, %f103, %f102; mul.f32 %f105, %f104, %f87; fma.rn.f32 %f106, %f3, %f105, %f2; mul.f32 %f107, %f2, %f105; sub.f32 %f108, %f3, %f107; selp.f32 %f109, %f1, %f254, %p11; selp.f32 %f110, %f106, %f255, %p11; selp.f32 %f111, %f108, %f256, %p11; mul.f32 %f112, %f87, %f87; div.rn.f32 %f113, %f102, %f112; sub.f32 %f114, %f109, %f1; sub.f32 %f115, %f110, %f2; sub.f32 %f116, %f111, %f3; fma.rn.f32 %f263, %f114, %f113, 0f00000000; fma.rn.f32 %f117, %f115, %f113, 0f00000000; fma.rn.f32 %f118, %f116, %f113, 0f00000000; div.rn.f32 %f119, %f103, %f87; mul.f32 %f120, %f111, %f119; sub.f32 %f264, %f117, %f120; fma.rn.f32 %f265, %f110, %f119, %f118; BB0_11: add.s32 %r14, %r1, 1; @%p7 bra BB0_13; rem.s32 %r67, %r14, %r43; add.s32 %r68, %r67, %r43; rem.s32 %r123, %r68, 
%r43; bra.uni BB0_14; BB0_13: add.s32 %r69, %r43, -1; min.s32 %r123, %r14, %r69; BB0_14: add.s32 %r18, %r123, %r5; setp.ge.s32 %p18, %r14, %r43; mov.f32 %f260, 0f00000000; and.pred %p20, %p18, %p7; mov.f32 %f261, %f260; mov.f32 %f262, %f260; @%p20 bra BB0_16; mul.wide.s32 %rd34, %r18, 4; add.s64 %rd35, %rd6, %rd34; ld.global.nc.f32 %f260, [%rd35]; add.s64 %rd36, %rd5, %rd34; ld.global.nc.f32 %f261, [%rd36]; add.s64 %rd37, %rd4, %rd34; ld.global.nc.f32 %f262, [%rd37]; BB0_16: mul.f32 %f124, %f261, %f261; fma.rn.f32 %f125, %f260, %f260, %f124; fma.rn.f32 %f23, %f262, %f262, %f125; setp.eq.f32 %p21, %f23, 0f00000000; mov.u16 %rs38, %rs1; @%p21 bra BB0_18; cvt.s64.s32 %rd38, %r18; add.s64 %rd39, %rd3, %rd38; ld.global.nc.u8 %rs38, [%rd39]; BB0_18: setp.gt.u16 %p22, %rs38, %rs1; cvt.u32.u16 %r70, %rs38; and.b32 %r71, %r70, 255; selp.b32 %r72, %r7, %r71, %p22; selp.b32 %r73, %r71, %r7, %p22; add.s32 %r74, %r73, 1; mul.lo.s32 %r75, %r74, %r73; shr.u32 %r76, %r75, 1; add.s32 %r19, %r76, %r72; and.pred %p25, %p21, %p13; @%p25 bra BB0_20; mul.wide.s32 %rd40, %r19, 4; add.s64 %rd41, %rd2, %rd40; ld.global.nc.f32 %f126, [%rd41]; add.f32 %f127, %f126, %f126; add.s64 %rd42, %rd1, %rd40; ld.global.nc.f32 %f128, [%rd42]; div.rn.f32 %f129, %f128, %f127; mul.f32 %f130, %f129, %f87; mul.f32 %f131, %f3, %f130; sub.f32 %f132, %f2, %f131; fma.rn.f32 %f133, %f2, %f130, %f3; selp.f32 %f134, %f1, %f260, %p21; selp.f32 %f135, %f132, %f261, %p21; selp.f32 %f136, %f133, %f262, %p21; mul.f32 %f137, %f87, %f87; div.rn.f32 %f138, %f127, %f137; sub.f32 %f139, %f134, %f1; sub.f32 %f140, %f135, %f2; sub.f32 %f141, %f136, %f3; fma.rn.f32 %f263, %f139, %f138, %f263; fma.rn.f32 %f142, %f140, %f138, %f264; fma.rn.f32 %f143, %f141, %f138, %f265; div.rn.f32 %f144, %f128, %f87; fma.rn.f32 %f264, %f136, %f144, %f142; mul.f32 %f145, %f135, %f144; sub.f32 %f265, %f143, %f145; BB0_20: and.b16 %rs7, %rs17, 2; setp.eq.s16 %p27, %rs7, 0; add.s32 %r20, %r2, -1; @%p27 bra BB0_22; rem.s32 %r77, %r20, %r44; add.s32 
%r78, %r77, %r44; rem.s32 %r124, %r78, %r44; bra.uni BB0_23; BB0_22: mov.u32 %r79, 0; max.s32 %r124, %r20, %r79; BB0_23: add.s32 %r80, %r124, %r4; mad.lo.s32 %r24, %r80, %r43, %r1; setp.lt.s32 %p29, %r20, 0; mov.f32 %f266, 0f00000000; and.pred %p30, %p29, %p27; mov.f32 %f267, %f266; mov.f32 %f268, %f266; @%p30 bra BB0_25; mul.wide.s32 %rd43, %r24, 4; add.s64 %rd44, %rd6, %rd43; ld.global.nc.f32 %f266, [%rd44]; add.s64 %rd45, %rd5, %rd43; ld.global.nc.f32 %f267, [%rd45]; add.s64 %rd46, %rd4, %rd43; ld.global.nc.f32 %f268, [%rd46]; BB0_25: mul.f32 %f149, %f267, %f267; fma.rn.f32 %f150, %f266, %f266, %f149; fma.rn.f32 %f36, %f268, %f268, %f150; setp.eq.f32 %p31, %f36, 0f00000000; mov.u16 %rs39, %rs1; @%p31 bra BB0_27; cvt.s64.s32 %rd47, %r24; add.s64 %rd48, %rd3, %rd47; ld.global.nc.u8 %rs39, [%rd48]; BB0_27: setp.gt.u16 %p32, %rs39, %rs1; cvt.u32.u16 %r81, %rs39; and.b32 %r82, %r81, 255; selp.b32 %r83, %r7, %r82, %p32; selp.b32 %r84, %r82, %r7, %p32; add.s32 %r85, %r84, 1; mul.lo.s32 %r86, %r85, %r84; shr.u32 %r87, %r86, 1; add.s32 %r25, %r87, %r83; and.pred %p35, %p31, %p13; @%p35 bra BB0_29; mul.wide.s32 %rd49, %r25, 4; add.s64 %rd50, %rd2, %rd49; ld.global.nc.f32 %f151, [%rd50]; add.f32 %f152, %f151, %f151; add.s64 %rd51, %rd1, %rd49; ld.global.nc.f32 %f153, [%rd51]; div.rn.f32 %f154, %f153, %f152; mul.f32 %f155, %f154, %f88; mul.f32 %f156, %f3, %f155; sub.f32 %f157, %f1, %f156; fma.rn.f32 %f158, %f1, %f155, %f3; selp.f32 %f159, %f157, %f266, %p31; selp.f32 %f160, %f2, %f267, %p31; selp.f32 %f161, %f158, %f268, %p31; mul.f32 %f162, %f88, %f88; div.rn.f32 %f163, %f152, %f162; sub.f32 %f164, %f159, %f1; sub.f32 %f165, %f160, %f2; sub.f32 %f166, %f161, %f3; fma.rn.f32 %f167, %f164, %f163, %f263; fma.rn.f32 %f264, %f165, %f163, %f264; fma.rn.f32 %f168, %f166, %f163, %f265; div.rn.f32 %f169, %f153, %f88; fma.rn.f32 %f263, %f161, %f169, %f167; mul.f32 %f170, %f159, %f169; sub.f32 %f265, %f168, %f170; BB0_29: add.s32 %r26, %r2, 1; @%p27 bra BB0_31; rem.s32 %r88, %r26, 
%r44; add.s32 %r89, %r88, %r44; rem.s32 %r125, %r89, %r44; bra.uni BB0_32; BB0_31: add.s32 %r90, %r44, -1; min.s32 %r125, %r26, %r90; BB0_32: add.s32 %r91, %r125, %r4; mad.lo.s32 %r30, %r91, %r43, %r1; setp.ge.s32 %p38, %r26, %r44; mov.f32 %f272, 0f00000000; and.pred %p40, %p38, %p27; mov.f32 %f273, %f272; mov.f32 %f274, %f272; @%p40 bra BB0_34; mul.wide.s32 %rd52, %r30, 4; add.s64 %rd53, %rd6, %rd52; ld.global.nc.f32 %f272, [%rd53]; add.s64 %rd54, %rd5, %rd52; ld.global.nc.f32 %f273, [%rd54]; add.s64 %rd55, %rd4, %rd52; ld.global.nc.f32 %f274, [%rd55]; BB0_34: mul.f32 %f174, %f273, %f273; fma.rn.f32 %f175, %f272, %f272, %f174; fma.rn.f32 %f49, %f274, %f274, %f175; setp.eq.f32 %p41, %f49, 0f00000000; mov.u16 %rs40, %rs1; @%p41 bra BB0_36; cvt.s64.s32 %rd56, %r30; add.s64 %rd57, %rd3, %rd56; ld.global.nc.u8 %rs40, [%rd57]; BB0_36: setp.gt.u16 %p42, %rs40, %rs1; cvt.u32.u16 %r92, %rs40; and.b32 %r93, %r92, 255; selp.b32 %r94, %r7, %r93, %p42; selp.b32 %r95, %r93, %r7, %p42; add.s32 %r96, %r95, 1; mul.lo.s32 %r97, %r96, %r95; shr.u32 %r98, %r97, 1; add.s32 %r31, %r98, %r94; and.pred %p45, %p41, %p13; @%p45 bra BB0_38; mul.wide.s32 %rd58, %r31, 4; add.s64 %rd59, %rd2, %rd58; ld.global.nc.f32 %f176, [%rd59]; add.f32 %f177, %f176, %f176; add.s64 %rd60, %rd1, %rd58; ld.global.nc.f32 %f178, [%rd60]; div.rn.f32 %f179, %f178, %f177; mul.f32 %f180, %f179, %f88; fma.rn.f32 %f181, %f3, %f180, %f1; mul.f32 %f182, %f1, %f180; sub.f32 %f183, %f3, %f182; selp.f32 %f184, %f181, %f272, %p41; selp.f32 %f185, %f2, %f273, %p41; selp.f32 %f186, %f183, %f274, %p41; mul.f32 %f187, %f88, %f88; div.rn.f32 %f188, %f177, %f187; sub.f32 %f189, %f184, %f1; sub.f32 %f190, %f185, %f2; sub.f32 %f191, %f186, %f3; fma.rn.f32 %f192, %f189, %f188, %f263; fma.rn.f32 %f264, %f190, %f188, %f264; fma.rn.f32 %f193, %f191, %f188, %f265; div.rn.f32 %f194, %f178, %f88; mul.f32 %f195, %f186, %f194; sub.f32 %f263, %f192, %f195; fma.rn.f32 %f265, %f184, %f194, %f193; BB0_38: setp.eq.s32 %p47, %r45, 1; @%p47 bra 
BB0_57; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p48, %rs12, 0; add.s32 %r32, %r3, -1; @%p48 bra BB0_41; rem.s32 %r99, %r32, %r45; add.s32 %r100, %r99, %r45; rem.s32 %r126, %r100, %r45; bra.uni BB0_42; BB0_41: mov.u32 %r101, 0; max.s32 %r126, %r32, %r101; BB0_42: mad.lo.s32 %r102, %r126, %r44, %r2; mad.lo.s32 %r36, %r102, %r43, %r1; setp.lt.s32 %p50, %r32, 0; mov.f32 %f278, 0f00000000; and.pred %p51, %p50, %p48; mov.f32 %f279, %f278; mov.f32 %f280, %f278; @%p51 bra BB0_44; mul.wide.s32 %rd61, %r36, 4; add.s64 %rd62, %rd6, %rd61; ld.global.nc.f32 %f278, [%rd62]; add.s64 %rd63, %rd5, %rd61; ld.global.nc.f32 %f279, [%rd63]; add.s64 %rd64, %rd4, %rd61; ld.global.nc.f32 %f280, [%rd64]; BB0_44: mul.f32 %f199, %f279, %f279; fma.rn.f32 %f200, %f278, %f278, %f199; fma.rn.f32 %f62, %f280, %f280, %f200; setp.eq.f32 %p52, %f62, 0f00000000; mov.u16 %rs41, %rs1; @%p52 bra BB0_46; cvt.s64.s32 %rd65, %r36; add.s64 %rd66, %rd3, %rd65; ld.global.nc.u8 %rs41, [%rd66]; BB0_46: setp.gt.u16 %p53, %rs41, %rs1; cvt.u32.u16 %r103, %rs41; and.b32 %r104, %r103, 255; selp.b32 %r105, %r7, %r104, %p53; selp.b32 %r106, %r104, %r7, %p53; add.s32 %r107, %r106, 1; mul.lo.s32 %r108, %r107, %r106; shr.u32 %r109, %r108, 1; add.s32 %r37, %r109, %r105; and.pred %p56, %p52, %p13; @%p56 bra BB0_48; mul.wide.s32 %rd67, %r37, 4; add.s64 %rd68, %rd2, %rd67; ld.global.nc.f32 %f201, [%rd68]; add.f32 %f202, %f201, %f201; add.s64 %rd69, %rd1, %rd67; ld.global.nc.f32 %f203, [%rd69]; div.rn.f32 %f204, %f203, %f202; mul.f32 %f205, %f204, %f89; fma.rn.f32 %f206, %f2, %f205, %f1; mul.f32 %f207, %f1, %f205; sub.f32 %f208, %f2, %f207; selp.f32 %f209, %f206, %f278, %p52; selp.f32 %f210, %f208, %f279, %p52; selp.f32 %f211, %f3, %f280, %p52; mul.f32 %f212, %f89, %f89; div.rn.f32 %f213, %f202, %f212; sub.f32 %f214, %f209, %f1; sub.f32 %f215, %f210, %f2; sub.f32 %f216, %f211, %f3; fma.rn.f32 %f217, %f214, %f213, %f263; fma.rn.f32 %f218, %f215, %f213, %f264; fma.rn.f32 %f265, %f216, %f213, %f265; div.rn.f32 %f219, %f203, %f89; 
mul.f32 %f220, %f210, %f219; sub.f32 %f263, %f217, %f220; fma.rn.f32 %f264, %f209, %f219, %f218; BB0_48: add.s32 %r38, %r3, 1; @%p48 bra BB0_50; rem.s32 %r110, %r38, %r45; add.s32 %r111, %r110, %r45; rem.s32 %r127, %r111, %r45; bra.uni BB0_51; BB0_50: add.s32 %r112, %r45, -1; min.s32 %r127, %r38, %r112; BB0_51: mad.lo.s32 %r113, %r127, %r44, %r2; mad.lo.s32 %r42, %r113, %r43, %r1; setp.ge.s32 %p59, %r38, %r45; mov.f32 %f284, 0f00000000; and.pred %p61, %p59, %p48; mov.f32 %f285, %f284; mov.f32 %f286, %f284; @%p61 bra BB0_53; mul.wide.s32 %rd70, %r42, 4; add.s64 %rd71, %rd6, %rd70; ld.global.nc.f32 %f286, [%rd71]; add.s64 %rd72, %rd5, %rd70; ld.global.nc.f32 %f285, [%rd72]; add.s64 %rd73, %rd4, %rd70; ld.global.nc.f32 %f284, [%rd73]; BB0_53: mul.f32 %f224, %f286, %f286; fma.rn.f32 %f225, %f285, %f285, %f224; fma.rn.f32 %f75, %f284, %f284, %f225; setp.eq.f32 %p62, %f75, 0f00000000; mov.u16 %rs42, %rs1; @%p62 bra BB0_55; cvt.s64.s32 %rd74, %r42; add.s64 %rd75, %rd3, %rd74; ld.global.nc.u8 %rs42, [%rd75]; BB0_55: setp.gt.u16 %p63, %rs42, %rs1; cvt.u32.u16 %r114, %rs42; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p63; selp.b32 %r117, %r115, %r7, %p63; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd76, %r121, 4; add.s64 %rd7, %rd2, %rd76; add.s64 %rd8, %rd1, %rd76; and.pred %p66, %p62, %p13; @%p66 bra BB0_57; ld.global.nc.f32 %f226, [%rd7]; add.f32 %f227, %f226, %f226; ld.global.nc.f32 %f228, [%rd8]; div.rn.f32 %f229, %f228, %f227; mul.f32 %f230, %f229, %f89; mul.f32 %f231, %f2, %f230; sub.f32 %f232, %f1, %f231; fma.rn.f32 %f233, %f1, %f230, %f2; selp.f32 %f234, %f3, %f284, %p62; selp.f32 %f235, %f233, %f285, %p62; selp.f32 %f236, %f232, %f286, %p62; mul.f32 %f237, %f89, %f89; div.rn.f32 %f238, %f227, %f237; sub.f32 %f239, %f236, %f1; sub.f32 %f240, %f235, %f2; sub.f32 %f241, %f234, %f3; fma.rn.f32 %f242, %f239, %f238, %f263; fma.rn.f32 %f243, %f240, %f238, %f264; fma.rn.f32 %f265, 
%f241, %f238, %f265; div.rn.f32 %f244, %f228, %f89; fma.rn.f32 %f263, %f235, %f244, %f242; mul.f32 %f245, %f236, %f244; sub.f32 %f264, %f243, %f245; BB0_57: setp.eq.s64 %p68, %rd12, 0; @%p68 bra BB0_59; cvta.to.global.u64 %rd77, %rd12; add.s64 %rd79, %rd77, %rd19; ld.global.nc.f32 %f246, [%rd79]; mul.f32 %f290, %f246, %f290; BB0_59: setp.eq.f32 %p69, %f290, 0f00000000; mov.f32 %f291, 0f00000000; @%p69 bra BB0_61; rcp.rn.f32 %f291, %f290; BB0_61: cvta.to.global.u64 %rd80, %rd11; cvta.to.global.u64 %rd81, %rd10; cvta.to.global.u64 %rd82, %rd9; add.s64 %rd84, %rd82, %rd19; ld.global.f32 %f248, [%rd84]; fma.rn.f32 %f249, %f263, %f291, %f248; st.global.f32 [%rd84], %f249; add.s64 %rd85, %rd81, %rd19; ld.global.f32 %f250, [%rd85]; fma.rn.f32 %f251, %f264, %f291, %f250; st.global.f32 [%rd85], %f251; add.s64 %rd86, %rd80, %rd19; ld.global.f32 %f252, [%rd86]; fma.rn.f32 %f253, %f265, %f291, %f252; st.global.f32 [%rd86], %f253; BB0_62: ret; } ` adddmibulk_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<70>; .reg .b16 %rs<43>; .reg .f32 %f<292>; .reg .b32 %r<128>; .reg .b64 %rd<87>; ld.param.u64 %rd9, [adddmibulk_param_0]; ld.param.u64 %rd10, [adddmibulk_param_1]; ld.param.u64 %rd11, [adddmibulk_param_2]; ld.param.u64 %rd13, [adddmibulk_param_3]; ld.param.u64 %rd14, [adddmibulk_param_4]; ld.param.u64 
%rd15, [adddmibulk_param_5]; ld.param.u64 %rd12, [adddmibulk_param_6]; ld.param.f32 %f290, [adddmibulk_param_7]; ld.param.u64 %rd16, [adddmibulk_param_8]; ld.param.u64 %rd17, [adddmibulk_param_9]; ld.param.u64 %rd18, [adddmibulk_param_10]; ld.param.f32 %f87, [adddmibulk_param_11]; ld.param.f32 %f88, [adddmibulk_param_12]; ld.param.f32 %f89, [adddmibulk_param_13]; ld.param.u32 %r43, [adddmibulk_param_14]; ld.param.u32 %r44, [adddmibulk_param_15]; ld.param.u32 %r45, [adddmibulk_param_16]; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; cvta.to.global.u64 %rd1, %rd17; cvta.to.global.u64 %rd2, %rd16; cvta.to.global.u64 %rd3, %rd18; cvta.to.global.u64 %rd4, %rd15; cvta.to.global.u64 %rd5, %rd14; cvta.to.global.u64 %rd6, %rd13; mov.u32 %r46, %ntid.x; mov.u32 %r47, %ctaid.x; mov.u32 %r48, %tid.x; mad.lo.s32 %r1, %r46, %r47, %r48; mov.u32 %r49, %ntid.y; mov.u32 %r50, %ctaid.y; mov.u32 %r51, %tid.y; mad.lo.s32 %r2, %r49, %r50, %r51; mov.u32 %r52, %ntid.z; mov.u32 %r53, %ctaid.z; mov.u32 %r54, %tid.z; mad.lo.s32 %r3, %r52, %r53, %r54; setp.ge.s32 %p1, %r2, %r44; setp.ge.s32 %p2, %r1, %r43; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r45; or.pred %p5, %p3, %p4; @%p5 bra BB0_62; mul.lo.s32 %r4, %r3, %r44; add.s32 %r55, %r4, %r2; mul.lo.s32 %r5, %r55, %r43; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd19, %r6, 4; add.s64 %rd20, %rd6, %rd19; cvt.s64.s32 %rd21, %r6; add.s64 %rd22, %rd5, %rd19; add.s64 %rd23, %rd4, %rd19; add.s64 %rd24, %rd3, %rd21; ld.global.nc.u8 %rs1, [%rd24]; cvt.u32.u16 %r56, %rs1; and.b32 %r7, %r56, 255; ld.global.nc.f32 %f1, [%rd20]; ld.global.nc.f32 %f2, [%rd22]; mul.f32 %f90, %f2, %f2; fma.rn.f32 %f91, %f1, %f1, %f90; ld.global.nc.f32 %f3, [%rd23]; fma.rn.f32 %f92, %f3, %f3, %f91; setp.eq.f32 %p6, %f92, 0f00000000; @%p6 bra BB0_62; and.b16 %rs2, %rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r57, %r8, %r43; add.s32 %r58, %r57, %r43; rem.s32 %r122, %r58, %r43; bra.uni BB0_5; BB0_4: mov.u32 
%r59, 0; max.s32 %r122, %r8, %r59; BB0_5: add.s32 %r12, %r122, %r5; setp.lt.s32 %p9, %r8, 0; mov.f32 %f254, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f255, %f254; mov.f32 %f256, %f254; @%p10 bra BB0_7; mul.wide.s32 %rd25, %r12, 4; add.s64 %rd26, %rd6, %rd25; ld.global.nc.f32 %f254, [%rd26]; add.s64 %rd27, %rd5, %rd25; ld.global.nc.f32 %f255, [%rd27]; add.s64 %rd28, %rd4, %rd25; ld.global.nc.f32 %f256, [%rd28]; BB0_7: mul.f32 %f96, %f255, %f255; fma.rn.f32 %f97, %f254, %f254, %f96; fma.rn.f32 %f10, %f256, %f256, %f97; setp.eq.f32 %p11, %f10, 0f00000000; mov.u16 %rs37, %rs1; @%p11 bra BB0_9; cvt.s64.s32 %rd29, %r12; add.s64 %rd30, %rd3, %rd29; ld.global.nc.u8 %rs37, [%rd30]; BB0_9: setp.gt.u16 %p12, %rs37, %rs1; cvt.u32.u16 %r60, %rs37; and.b32 %r61, %r60, 255; selp.b32 %r62, %r7, %r61, %p12; selp.b32 %r63, %r61, %r7, %p12; add.s32 %r64, %r63, 1; mul.lo.s32 %r65, %r64, %r63; shr.u32 %r66, %r65, 1; add.s32 %r13, %r66, %r62; setp.ne.s16 %p13, %rs18, 0; mov.f32 %f263, 0f00000000; and.pred %p15, %p11, %p13; mov.f32 %f264, %f263; mov.f32 %f265, %f263; @%p15 bra BB0_11; mul.wide.s32 %rd31, %r13, 4; add.s64 %rd32, %rd2, %rd31; ld.global.nc.f32 %f101, [%rd32]; add.f32 %f102, %f101, %f101; add.s64 %rd33, %rd1, %rd31; ld.global.nc.f32 %f103, [%rd33]; div.rn.f32 %f104, %f103, %f102; mul.f32 %f105, %f104, %f87; fma.rn.f32 %f106, %f3, %f105, %f2; mul.f32 %f107, %f2, %f105; sub.f32 %f108, %f3, %f107; selp.f32 %f109, %f1, %f254, %p11; selp.f32 %f110, %f106, %f255, %p11; selp.f32 %f111, %f108, %f256, %p11; mul.f32 %f112, %f87, %f87; div.rn.f32 %f113, %f102, %f112; sub.f32 %f114, %f109, %f1; sub.f32 %f115, %f110, %f2; sub.f32 %f116, %f111, %f3; fma.rn.f32 %f263, %f114, %f113, 0f00000000; fma.rn.f32 %f117, %f115, %f113, 0f00000000; fma.rn.f32 %f118, %f116, %f113, 0f00000000; div.rn.f32 %f119, %f103, %f87; mul.f32 %f120, %f111, %f119; sub.f32 %f264, %f117, %f120; fma.rn.f32 %f265, %f110, %f119, %f118; BB0_11: add.s32 %r14, %r1, 1; @%p7 bra BB0_13; rem.s32 %r67, %r14, %r43; add.s32 
%r68, %r67, %r43; rem.s32 %r123, %r68, %r43; bra.uni BB0_14; BB0_13: add.s32 %r69, %r43, -1; min.s32 %r123, %r14, %r69; BB0_14: add.s32 %r18, %r123, %r5; setp.ge.s32 %p18, %r14, %r43; mov.f32 %f260, 0f00000000; and.pred %p20, %p18, %p7; mov.f32 %f261, %f260; mov.f32 %f262, %f260; @%p20 bra BB0_16; mul.wide.s32 %rd34, %r18, 4; add.s64 %rd35, %rd6, %rd34; ld.global.nc.f32 %f260, [%rd35]; add.s64 %rd36, %rd5, %rd34; ld.global.nc.f32 %f261, [%rd36]; add.s64 %rd37, %rd4, %rd34; ld.global.nc.f32 %f262, [%rd37]; BB0_16: mul.f32 %f124, %f261, %f261; fma.rn.f32 %f125, %f260, %f260, %f124; fma.rn.f32 %f23, %f262, %f262, %f125; setp.eq.f32 %p21, %f23, 0f00000000; mov.u16 %rs38, %rs1; @%p21 bra BB0_18; cvt.s64.s32 %rd38, %r18; add.s64 %rd39, %rd3, %rd38; ld.global.nc.u8 %rs38, [%rd39]; BB0_18: setp.gt.u16 %p22, %rs38, %rs1; cvt.u32.u16 %r70, %rs38; and.b32 %r71, %r70, 255; selp.b32 %r72, %r7, %r71, %p22; selp.b32 %r73, %r71, %r7, %p22; add.s32 %r74, %r73, 1; mul.lo.s32 %r75, %r74, %r73; shr.u32 %r76, %r75, 1; add.s32 %r19, %r76, %r72; and.pred %p25, %p21, %p13; @%p25 bra BB0_20; mul.wide.s32 %rd40, %r19, 4; add.s64 %rd41, %rd2, %rd40; ld.global.nc.f32 %f126, [%rd41]; add.f32 %f127, %f126, %f126; add.s64 %rd42, %rd1, %rd40; ld.global.nc.f32 %f128, [%rd42]; div.rn.f32 %f129, %f128, %f127; mul.f32 %f130, %f129, %f87; mul.f32 %f131, %f3, %f130; sub.f32 %f132, %f2, %f131; fma.rn.f32 %f133, %f2, %f130, %f3; selp.f32 %f134, %f1, %f260, %p21; selp.f32 %f135, %f132, %f261, %p21; selp.f32 %f136, %f133, %f262, %p21; mul.f32 %f137, %f87, %f87; div.rn.f32 %f138, %f127, %f137; sub.f32 %f139, %f134, %f1; sub.f32 %f140, %f135, %f2; sub.f32 %f141, %f136, %f3; fma.rn.f32 %f263, %f139, %f138, %f263; fma.rn.f32 %f142, %f140, %f138, %f264; fma.rn.f32 %f143, %f141, %f138, %f265; div.rn.f32 %f144, %f128, %f87; fma.rn.f32 %f264, %f136, %f144, %f142; mul.f32 %f145, %f135, %f144; sub.f32 %f265, %f143, %f145; BB0_20: and.b16 %rs7, %rs17, 2; setp.eq.s16 %p27, %rs7, 0; add.s32 %r20, %r2, -1; @%p27 bra 
BB0_22; rem.s32 %r77, %r20, %r44; add.s32 %r78, %r77, %r44; rem.s32 %r124, %r78, %r44; bra.uni BB0_23; BB0_22: mov.u32 %r79, 0; max.s32 %r124, %r20, %r79; BB0_23: add.s32 %r80, %r124, %r4; mad.lo.s32 %r24, %r80, %r43, %r1; setp.lt.s32 %p29, %r20, 0; mov.f32 %f266, 0f00000000; and.pred %p30, %p29, %p27; mov.f32 %f267, %f266; mov.f32 %f268, %f266; @%p30 bra BB0_25; mul.wide.s32 %rd43, %r24, 4; add.s64 %rd44, %rd6, %rd43; ld.global.nc.f32 %f266, [%rd44]; add.s64 %rd45, %rd5, %rd43; ld.global.nc.f32 %f267, [%rd45]; add.s64 %rd46, %rd4, %rd43; ld.global.nc.f32 %f268, [%rd46]; BB0_25: mul.f32 %f149, %f267, %f267; fma.rn.f32 %f150, %f266, %f266, %f149; fma.rn.f32 %f36, %f268, %f268, %f150; setp.eq.f32 %p31, %f36, 0f00000000; mov.u16 %rs39, %rs1; @%p31 bra BB0_27; cvt.s64.s32 %rd47, %r24; add.s64 %rd48, %rd3, %rd47; ld.global.nc.u8 %rs39, [%rd48]; BB0_27: setp.gt.u16 %p32, %rs39, %rs1; cvt.u32.u16 %r81, %rs39; and.b32 %r82, %r81, 255; selp.b32 %r83, %r7, %r82, %p32; selp.b32 %r84, %r82, %r7, %p32; add.s32 %r85, %r84, 1; mul.lo.s32 %r86, %r85, %r84; shr.u32 %r87, %r86, 1; add.s32 %r25, %r87, %r83; and.pred %p35, %p31, %p13; @%p35 bra BB0_29; mul.wide.s32 %rd49, %r25, 4; add.s64 %rd50, %rd2, %rd49; ld.global.nc.f32 %f151, [%rd50]; add.f32 %f152, %f151, %f151; add.s64 %rd51, %rd1, %rd49; ld.global.nc.f32 %f153, [%rd51]; div.rn.f32 %f154, %f153, %f152; mul.f32 %f155, %f154, %f88; mul.f32 %f156, %f3, %f155; sub.f32 %f157, %f1, %f156; fma.rn.f32 %f158, %f1, %f155, %f3; selp.f32 %f159, %f157, %f266, %p31; selp.f32 %f160, %f2, %f267, %p31; selp.f32 %f161, %f158, %f268, %p31; mul.f32 %f162, %f88, %f88; div.rn.f32 %f163, %f152, %f162; sub.f32 %f164, %f159, %f1; sub.f32 %f165, %f160, %f2; sub.f32 %f166, %f161, %f3; fma.rn.f32 %f167, %f164, %f163, %f263; fma.rn.f32 %f264, %f165, %f163, %f264; fma.rn.f32 %f168, %f166, %f163, %f265; div.rn.f32 %f169, %f153, %f88; fma.rn.f32 %f263, %f161, %f169, %f167; mul.f32 %f170, %f159, %f169; sub.f32 %f265, %f168, %f170; BB0_29: add.s32 %r26, %r2, 
1; @%p27 bra BB0_31; rem.s32 %r88, %r26, %r44; add.s32 %r89, %r88, %r44; rem.s32 %r125, %r89, %r44; bra.uni BB0_32; BB0_31: add.s32 %r90, %r44, -1; min.s32 %r125, %r26, %r90; BB0_32: add.s32 %r91, %r125, %r4; mad.lo.s32 %r30, %r91, %r43, %r1; setp.ge.s32 %p38, %r26, %r44; mov.f32 %f272, 0f00000000; and.pred %p40, %p38, %p27; mov.f32 %f273, %f272; mov.f32 %f274, %f272; @%p40 bra BB0_34; mul.wide.s32 %rd52, %r30, 4; add.s64 %rd53, %rd6, %rd52; ld.global.nc.f32 %f272, [%rd53]; add.s64 %rd54, %rd5, %rd52; ld.global.nc.f32 %f273, [%rd54]; add.s64 %rd55, %rd4, %rd52; ld.global.nc.f32 %f274, [%rd55]; BB0_34: mul.f32 %f174, %f273, %f273; fma.rn.f32 %f175, %f272, %f272, %f174; fma.rn.f32 %f49, %f274, %f274, %f175; setp.eq.f32 %p41, %f49, 0f00000000; mov.u16 %rs40, %rs1; @%p41 bra BB0_36; cvt.s64.s32 %rd56, %r30; add.s64 %rd57, %rd3, %rd56; ld.global.nc.u8 %rs40, [%rd57]; BB0_36: setp.gt.u16 %p42, %rs40, %rs1; cvt.u32.u16 %r92, %rs40; and.b32 %r93, %r92, 255; selp.b32 %r94, %r7, %r93, %p42; selp.b32 %r95, %r93, %r7, %p42; add.s32 %r96, %r95, 1; mul.lo.s32 %r97, %r96, %r95; shr.u32 %r98, %r97, 1; add.s32 %r31, %r98, %r94; and.pred %p45, %p41, %p13; @%p45 bra BB0_38; mul.wide.s32 %rd58, %r31, 4; add.s64 %rd59, %rd2, %rd58; ld.global.nc.f32 %f176, [%rd59]; add.f32 %f177, %f176, %f176; add.s64 %rd60, %rd1, %rd58; ld.global.nc.f32 %f178, [%rd60]; div.rn.f32 %f179, %f178, %f177; mul.f32 %f180, %f179, %f88; fma.rn.f32 %f181, %f3, %f180, %f1; mul.f32 %f182, %f1, %f180; sub.f32 %f183, %f3, %f182; selp.f32 %f184, %f181, %f272, %p41; selp.f32 %f185, %f2, %f273, %p41; selp.f32 %f186, %f183, %f274, %p41; mul.f32 %f187, %f88, %f88; div.rn.f32 %f188, %f177, %f187; sub.f32 %f189, %f184, %f1; sub.f32 %f190, %f185, %f2; sub.f32 %f191, %f186, %f3; fma.rn.f32 %f192, %f189, %f188, %f263; fma.rn.f32 %f264, %f190, %f188, %f264; fma.rn.f32 %f193, %f191, %f188, %f265; div.rn.f32 %f194, %f178, %f88; mul.f32 %f195, %f186, %f194; sub.f32 %f263, %f192, %f195; fma.rn.f32 %f265, %f184, %f194, %f193; 
BB0_38: setp.eq.s32 %p47, %r45, 1; @%p47 bra BB0_57; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p48, %rs12, 0; add.s32 %r32, %r3, -1; @%p48 bra BB0_41; rem.s32 %r99, %r32, %r45; add.s32 %r100, %r99, %r45; rem.s32 %r126, %r100, %r45; bra.uni BB0_42; BB0_41: mov.u32 %r101, 0; max.s32 %r126, %r32, %r101; BB0_42: mad.lo.s32 %r102, %r126, %r44, %r2; mad.lo.s32 %r36, %r102, %r43, %r1; setp.lt.s32 %p50, %r32, 0; mov.f32 %f278, 0f00000000; and.pred %p51, %p50, %p48; mov.f32 %f279, %f278; mov.f32 %f280, %f278; @%p51 bra BB0_44; mul.wide.s32 %rd61, %r36, 4; add.s64 %rd62, %rd6, %rd61; ld.global.nc.f32 %f278, [%rd62]; add.s64 %rd63, %rd5, %rd61; ld.global.nc.f32 %f279, [%rd63]; add.s64 %rd64, %rd4, %rd61; ld.global.nc.f32 %f280, [%rd64]; BB0_44: mul.f32 %f199, %f279, %f279; fma.rn.f32 %f200, %f278, %f278, %f199; fma.rn.f32 %f62, %f280, %f280, %f200; setp.eq.f32 %p52, %f62, 0f00000000; mov.u16 %rs41, %rs1; @%p52 bra BB0_46; cvt.s64.s32 %rd65, %r36; add.s64 %rd66, %rd3, %rd65; ld.global.nc.u8 %rs41, [%rd66]; BB0_46: setp.gt.u16 %p53, %rs41, %rs1; cvt.u32.u16 %r103, %rs41; and.b32 %r104, %r103, 255; selp.b32 %r105, %r7, %r104, %p53; selp.b32 %r106, %r104, %r7, %p53; add.s32 %r107, %r106, 1; mul.lo.s32 %r108, %r107, %r106; shr.u32 %r109, %r108, 1; add.s32 %r37, %r109, %r105; and.pred %p56, %p52, %p13; @%p56 bra BB0_48; mul.wide.s32 %rd67, %r37, 4; add.s64 %rd68, %rd2, %rd67; ld.global.nc.f32 %f201, [%rd68]; add.f32 %f202, %f201, %f201; add.s64 %rd69, %rd1, %rd67; ld.global.nc.f32 %f203, [%rd69]; div.rn.f32 %f204, %f203, %f202; mul.f32 %f205, %f204, %f89; fma.rn.f32 %f206, %f2, %f205, %f1; mul.f32 %f207, %f1, %f205; sub.f32 %f208, %f2, %f207; selp.f32 %f209, %f206, %f278, %p52; selp.f32 %f210, %f208, %f279, %p52; selp.f32 %f211, %f3, %f280, %p52; mul.f32 %f212, %f89, %f89; div.rn.f32 %f213, %f202, %f212; sub.f32 %f214, %f209, %f1; sub.f32 %f215, %f210, %f2; sub.f32 %f216, %f211, %f3; fma.rn.f32 %f217, %f214, %f213, %f263; fma.rn.f32 %f218, %f215, %f213, %f264; fma.rn.f32 %f265, %f216, 
%f213, %f265; div.rn.f32 %f219, %f203, %f89; mul.f32 %f220, %f210, %f219; sub.f32 %f263, %f217, %f220; fma.rn.f32 %f264, %f209, %f219, %f218; BB0_48: add.s32 %r38, %r3, 1; @%p48 bra BB0_50; rem.s32 %r110, %r38, %r45; add.s32 %r111, %r110, %r45; rem.s32 %r127, %r111, %r45; bra.uni BB0_51; BB0_50: add.s32 %r112, %r45, -1; min.s32 %r127, %r38, %r112; BB0_51: mad.lo.s32 %r113, %r127, %r44, %r2; mad.lo.s32 %r42, %r113, %r43, %r1; setp.ge.s32 %p59, %r38, %r45; mov.f32 %f284, 0f00000000; and.pred %p61, %p59, %p48; mov.f32 %f285, %f284; mov.f32 %f286, %f284; @%p61 bra BB0_53; mul.wide.s32 %rd70, %r42, 4; add.s64 %rd71, %rd6, %rd70; ld.global.nc.f32 %f286, [%rd71]; add.s64 %rd72, %rd5, %rd70; ld.global.nc.f32 %f285, [%rd72]; add.s64 %rd73, %rd4, %rd70; ld.global.nc.f32 %f284, [%rd73]; BB0_53: mul.f32 %f224, %f286, %f286; fma.rn.f32 %f225, %f285, %f285, %f224; fma.rn.f32 %f75, %f284, %f284, %f225; setp.eq.f32 %p62, %f75, 0f00000000; mov.u16 %rs42, %rs1; @%p62 bra BB0_55; cvt.s64.s32 %rd74, %r42; add.s64 %rd75, %rd3, %rd74; ld.global.nc.u8 %rs42, [%rd75]; BB0_55: setp.gt.u16 %p63, %rs42, %rs1; cvt.u32.u16 %r114, %rs42; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p63; selp.b32 %r117, %r115, %r7, %p63; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd76, %r121, 4; add.s64 %rd7, %rd2, %rd76; add.s64 %rd8, %rd1, %rd76; and.pred %p66, %p62, %p13; @%p66 bra BB0_57; ld.global.nc.f32 %f226, [%rd7]; add.f32 %f227, %f226, %f226; ld.global.nc.f32 %f228, [%rd8]; div.rn.f32 %f229, %f228, %f227; mul.f32 %f230, %f229, %f89; mul.f32 %f231, %f2, %f230; sub.f32 %f232, %f1, %f231; fma.rn.f32 %f233, %f1, %f230, %f2; selp.f32 %f234, %f3, %f284, %p62; selp.f32 %f235, %f233, %f285, %p62; selp.f32 %f236, %f232, %f286, %p62; mul.f32 %f237, %f89, %f89; div.rn.f32 %f238, %f227, %f237; sub.f32 %f239, %f236, %f1; sub.f32 %f240, %f235, %f2; sub.f32 %f241, %f234, %f3; fma.rn.f32 %f242, %f239, %f238, %f263; fma.rn.f32 
%f243, %f240, %f238, %f264; fma.rn.f32 %f265, %f241, %f238, %f265; div.rn.f32 %f244, %f228, %f89; fma.rn.f32 %f263, %f235, %f244, %f242; mul.f32 %f245, %f236, %f244; sub.f32 %f264, %f243, %f245; BB0_57: setp.eq.s64 %p68, %rd12, 0; @%p68 bra BB0_59; cvta.to.global.u64 %rd77, %rd12; add.s64 %rd79, %rd77, %rd19; ld.global.nc.f32 %f246, [%rd79]; mul.f32 %f290, %f246, %f290; BB0_59: setp.eq.f32 %p69, %f290, 0f00000000; mov.f32 %f291, 0f00000000; @%p69 bra BB0_61; rcp.rn.f32 %f291, %f290; BB0_61: cvta.to.global.u64 %rd80, %rd11; cvta.to.global.u64 %rd81, %rd10; cvta.to.global.u64 %rd82, %rd9; add.s64 %rd84, %rd82, %rd19; ld.global.f32 %f248, [%rd84]; fma.rn.f32 %f249, %f263, %f291, %f248; st.global.f32 [%rd84], %f249; add.s64 %rd85, %rd81, %rd19; ld.global.f32 %f250, [%rd85]; fma.rn.f32 %f251, %f264, %f291, %f250; st.global.f32 [%rd85], %f251; add.s64 %rd86, %rd80, %rd19; ld.global.f32 %f252, [%rd86]; fma.rn.f32 %f253, %f265, %f291, %f252; st.global.f32 [%rd86], %f253; BB0_62: ret; } ` adddmibulk_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<70>; .reg .b16 %rs<43>; .reg .f32 %f<292>; .reg .b32 %r<128>; .reg .b64 %rd<87>; ld.param.u64 %rd9, [adddmibulk_param_0]; ld.param.u64 %rd10, [adddmibulk_param_1]; ld.param.u64 %rd11, [adddmibulk_param_2]; ld.param.u64 %rd13, [adddmibulk_param_3]; 
ld.param.u64 %rd14, [adddmibulk_param_4]; ld.param.u64 %rd15, [adddmibulk_param_5]; ld.param.u64 %rd12, [adddmibulk_param_6]; ld.param.f32 %f290, [adddmibulk_param_7]; ld.param.u64 %rd16, [adddmibulk_param_8]; ld.param.u64 %rd17, [adddmibulk_param_9]; ld.param.u64 %rd18, [adddmibulk_param_10]; ld.param.f32 %f87, [adddmibulk_param_11]; ld.param.f32 %f88, [adddmibulk_param_12]; ld.param.f32 %f89, [adddmibulk_param_13]; ld.param.u32 %r43, [adddmibulk_param_14]; ld.param.u32 %r44, [adddmibulk_param_15]; ld.param.u32 %r45, [adddmibulk_param_16]; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; cvta.to.global.u64 %rd1, %rd17; cvta.to.global.u64 %rd2, %rd16; cvta.to.global.u64 %rd3, %rd18; cvta.to.global.u64 %rd4, %rd15; cvta.to.global.u64 %rd5, %rd14; cvta.to.global.u64 %rd6, %rd13; mov.u32 %r46, %ntid.x; mov.u32 %r47, %ctaid.x; mov.u32 %r48, %tid.x; mad.lo.s32 %r1, %r46, %r47, %r48; mov.u32 %r49, %ntid.y; mov.u32 %r50, %ctaid.y; mov.u32 %r51, %tid.y; mad.lo.s32 %r2, %r49, %r50, %r51; mov.u32 %r52, %ntid.z; mov.u32 %r53, %ctaid.z; mov.u32 %r54, %tid.z; mad.lo.s32 %r3, %r52, %r53, %r54; setp.ge.s32 %p1, %r2, %r44; setp.ge.s32 %p2, %r1, %r43; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r45; or.pred %p5, %p3, %p4; @%p5 bra BB0_62; mul.lo.s32 %r4, %r3, %r44; add.s32 %r55, %r4, %r2; mul.lo.s32 %r5, %r55, %r43; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd19, %r6, 4; add.s64 %rd20, %rd6, %rd19; cvt.s64.s32 %rd21, %r6; add.s64 %rd22, %rd5, %rd19; add.s64 %rd23, %rd4, %rd19; add.s64 %rd24, %rd3, %rd21; ld.global.nc.u8 %rs1, [%rd24]; cvt.u32.u16 %r56, %rs1; and.b32 %r7, %r56, 255; ld.global.nc.f32 %f1, [%rd20]; ld.global.nc.f32 %f2, [%rd22]; mul.f32 %f90, %f2, %f2; fma.rn.f32 %f91, %f1, %f1, %f90; ld.global.nc.f32 %f3, [%rd23]; fma.rn.f32 %f92, %f3, %f3, %f91; setp.eq.f32 %p6, %f92, 0f00000000; @%p6 bra BB0_62; and.b16 %rs2, %rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r57, %r8, %r43; add.s32 %r58, %r57, %r43; 
rem.s32 %r122, %r58, %r43; bra.uni BB0_5; BB0_4: mov.u32 %r59, 0; max.s32 %r122, %r8, %r59; BB0_5: add.s32 %r12, %r122, %r5; setp.lt.s32 %p9, %r8, 0; mov.f32 %f254, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f255, %f254; mov.f32 %f256, %f254; @%p10 bra BB0_7; mul.wide.s32 %rd25, %r12, 4; add.s64 %rd26, %rd6, %rd25; ld.global.nc.f32 %f254, [%rd26]; add.s64 %rd27, %rd5, %rd25; ld.global.nc.f32 %f255, [%rd27]; add.s64 %rd28, %rd4, %rd25; ld.global.nc.f32 %f256, [%rd28]; BB0_7: mul.f32 %f96, %f255, %f255; fma.rn.f32 %f97, %f254, %f254, %f96; fma.rn.f32 %f10, %f256, %f256, %f97; setp.eq.f32 %p11, %f10, 0f00000000; mov.u16 %rs37, %rs1; @%p11 bra BB0_9; cvt.s64.s32 %rd29, %r12; add.s64 %rd30, %rd3, %rd29; ld.global.nc.u8 %rs37, [%rd30]; BB0_9: setp.gt.u16 %p12, %rs37, %rs1; cvt.u32.u16 %r60, %rs37; and.b32 %r61, %r60, 255; selp.b32 %r62, %r7, %r61, %p12; selp.b32 %r63, %r61, %r7, %p12; add.s32 %r64, %r63, 1; mul.lo.s32 %r65, %r64, %r63; shr.u32 %r66, %r65, 1; add.s32 %r13, %r66, %r62; setp.ne.s16 %p13, %rs18, 0; mov.f32 %f263, 0f00000000; and.pred %p15, %p11, %p13; mov.f32 %f264, %f263; mov.f32 %f265, %f263; @%p15 bra BB0_11; mul.wide.s32 %rd31, %r13, 4; add.s64 %rd32, %rd2, %rd31; ld.global.nc.f32 %f101, [%rd32]; add.f32 %f102, %f101, %f101; add.s64 %rd33, %rd1, %rd31; ld.global.nc.f32 %f103, [%rd33]; div.rn.f32 %f104, %f103, %f102; mul.f32 %f105, %f104, %f87; fma.rn.f32 %f106, %f3, %f105, %f2; mul.f32 %f107, %f2, %f105; sub.f32 %f108, %f3, %f107; selp.f32 %f109, %f1, %f254, %p11; selp.f32 %f110, %f106, %f255, %p11; selp.f32 %f111, %f108, %f256, %p11; mul.f32 %f112, %f87, %f87; div.rn.f32 %f113, %f102, %f112; sub.f32 %f114, %f109, %f1; sub.f32 %f115, %f110, %f2; sub.f32 %f116, %f111, %f3; fma.rn.f32 %f263, %f114, %f113, 0f00000000; fma.rn.f32 %f117, %f115, %f113, 0f00000000; fma.rn.f32 %f118, %f116, %f113, 0f00000000; div.rn.f32 %f119, %f103, %f87; mul.f32 %f120, %f111, %f119; sub.f32 %f264, %f117, %f120; fma.rn.f32 %f265, %f110, %f119, %f118; BB0_11: add.s32 %r14, 
%r1, 1; @%p7 bra BB0_13; rem.s32 %r67, %r14, %r43; add.s32 %r68, %r67, %r43; rem.s32 %r123, %r68, %r43; bra.uni BB0_14; BB0_13: add.s32 %r69, %r43, -1; min.s32 %r123, %r14, %r69; BB0_14: add.s32 %r18, %r123, %r5; setp.ge.s32 %p18, %r14, %r43; mov.f32 %f260, 0f00000000; and.pred %p20, %p18, %p7; mov.f32 %f261, %f260; mov.f32 %f262, %f260; @%p20 bra BB0_16; mul.wide.s32 %rd34, %r18, 4; add.s64 %rd35, %rd6, %rd34; ld.global.nc.f32 %f260, [%rd35]; add.s64 %rd36, %rd5, %rd34; ld.global.nc.f32 %f261, [%rd36]; add.s64 %rd37, %rd4, %rd34; ld.global.nc.f32 %f262, [%rd37]; BB0_16: mul.f32 %f124, %f261, %f261; fma.rn.f32 %f125, %f260, %f260, %f124; fma.rn.f32 %f23, %f262, %f262, %f125; setp.eq.f32 %p21, %f23, 0f00000000; mov.u16 %rs38, %rs1; @%p21 bra BB0_18; cvt.s64.s32 %rd38, %r18; add.s64 %rd39, %rd3, %rd38; ld.global.nc.u8 %rs38, [%rd39]; BB0_18: setp.gt.u16 %p22, %rs38, %rs1; cvt.u32.u16 %r70, %rs38; and.b32 %r71, %r70, 255; selp.b32 %r72, %r7, %r71, %p22; selp.b32 %r73, %r71, %r7, %p22; add.s32 %r74, %r73, 1; mul.lo.s32 %r75, %r74, %r73; shr.u32 %r76, %r75, 1; add.s32 %r19, %r76, %r72; and.pred %p25, %p21, %p13; @%p25 bra BB0_20; mul.wide.s32 %rd40, %r19, 4; add.s64 %rd41, %rd2, %rd40; ld.global.nc.f32 %f126, [%rd41]; add.f32 %f127, %f126, %f126; add.s64 %rd42, %rd1, %rd40; ld.global.nc.f32 %f128, [%rd42]; div.rn.f32 %f129, %f128, %f127; mul.f32 %f130, %f129, %f87; mul.f32 %f131, %f3, %f130; sub.f32 %f132, %f2, %f131; fma.rn.f32 %f133, %f2, %f130, %f3; selp.f32 %f134, %f1, %f260, %p21; selp.f32 %f135, %f132, %f261, %p21; selp.f32 %f136, %f133, %f262, %p21; mul.f32 %f137, %f87, %f87; div.rn.f32 %f138, %f127, %f137; sub.f32 %f139, %f134, %f1; sub.f32 %f140, %f135, %f2; sub.f32 %f141, %f136, %f3; fma.rn.f32 %f263, %f139, %f138, %f263; fma.rn.f32 %f142, %f140, %f138, %f264; fma.rn.f32 %f143, %f141, %f138, %f265; div.rn.f32 %f144, %f128, %f87; fma.rn.f32 %f264, %f136, %f144, %f142; mul.f32 %f145, %f135, %f144; sub.f32 %f265, %f143, %f145; BB0_20: and.b16 %rs7, %rs17, 2; 
setp.eq.s16 %p27, %rs7, 0; add.s32 %r20, %r2, -1; @%p27 bra BB0_22; rem.s32 %r77, %r20, %r44; add.s32 %r78, %r77, %r44; rem.s32 %r124, %r78, %r44; bra.uni BB0_23; BB0_22: mov.u32 %r79, 0; max.s32 %r124, %r20, %r79; BB0_23: add.s32 %r80, %r124, %r4; mad.lo.s32 %r24, %r80, %r43, %r1; setp.lt.s32 %p29, %r20, 0; mov.f32 %f266, 0f00000000; and.pred %p30, %p29, %p27; mov.f32 %f267, %f266; mov.f32 %f268, %f266; @%p30 bra BB0_25; mul.wide.s32 %rd43, %r24, 4; add.s64 %rd44, %rd6, %rd43; ld.global.nc.f32 %f266, [%rd44]; add.s64 %rd45, %rd5, %rd43; ld.global.nc.f32 %f267, [%rd45]; add.s64 %rd46, %rd4, %rd43; ld.global.nc.f32 %f268, [%rd46]; BB0_25: mul.f32 %f149, %f267, %f267; fma.rn.f32 %f150, %f266, %f266, %f149; fma.rn.f32 %f36, %f268, %f268, %f150; setp.eq.f32 %p31, %f36, 0f00000000; mov.u16 %rs39, %rs1; @%p31 bra BB0_27; cvt.s64.s32 %rd47, %r24; add.s64 %rd48, %rd3, %rd47; ld.global.nc.u8 %rs39, [%rd48]; BB0_27: setp.gt.u16 %p32, %rs39, %rs1; cvt.u32.u16 %r81, %rs39; and.b32 %r82, %r81, 255; selp.b32 %r83, %r7, %r82, %p32; selp.b32 %r84, %r82, %r7, %p32; add.s32 %r85, %r84, 1; mul.lo.s32 %r86, %r85, %r84; shr.u32 %r87, %r86, 1; add.s32 %r25, %r87, %r83; and.pred %p35, %p31, %p13; @%p35 bra BB0_29; mul.wide.s32 %rd49, %r25, 4; add.s64 %rd50, %rd2, %rd49; ld.global.nc.f32 %f151, [%rd50]; add.f32 %f152, %f151, %f151; add.s64 %rd51, %rd1, %rd49; ld.global.nc.f32 %f153, [%rd51]; div.rn.f32 %f154, %f153, %f152; mul.f32 %f155, %f154, %f88; mul.f32 %f156, %f3, %f155; sub.f32 %f157, %f1, %f156; fma.rn.f32 %f158, %f1, %f155, %f3; selp.f32 %f159, %f157, %f266, %p31; selp.f32 %f160, %f2, %f267, %p31; selp.f32 %f161, %f158, %f268, %p31; mul.f32 %f162, %f88, %f88; div.rn.f32 %f163, %f152, %f162; sub.f32 %f164, %f159, %f1; sub.f32 %f165, %f160, %f2; sub.f32 %f166, %f161, %f3; fma.rn.f32 %f167, %f164, %f163, %f263; fma.rn.f32 %f264, %f165, %f163, %f264; fma.rn.f32 %f168, %f166, %f163, %f265; div.rn.f32 %f169, %f153, %f88; fma.rn.f32 %f263, %f161, %f169, %f167; mul.f32 %f170, %f159, 
%f169; sub.f32 %f265, %f168, %f170; BB0_29: add.s32 %r26, %r2, 1; @%p27 bra BB0_31; rem.s32 %r88, %r26, %r44; add.s32 %r89, %r88, %r44; rem.s32 %r125, %r89, %r44; bra.uni BB0_32; BB0_31: add.s32 %r90, %r44, -1; min.s32 %r125, %r26, %r90; BB0_32: add.s32 %r91, %r125, %r4; mad.lo.s32 %r30, %r91, %r43, %r1; setp.ge.s32 %p38, %r26, %r44; mov.f32 %f272, 0f00000000; and.pred %p40, %p38, %p27; mov.f32 %f273, %f272; mov.f32 %f274, %f272; @%p40 bra BB0_34; mul.wide.s32 %rd52, %r30, 4; add.s64 %rd53, %rd6, %rd52; ld.global.nc.f32 %f272, [%rd53]; add.s64 %rd54, %rd5, %rd52; ld.global.nc.f32 %f273, [%rd54]; add.s64 %rd55, %rd4, %rd52; ld.global.nc.f32 %f274, [%rd55]; BB0_34: mul.f32 %f174, %f273, %f273; fma.rn.f32 %f175, %f272, %f272, %f174; fma.rn.f32 %f49, %f274, %f274, %f175; setp.eq.f32 %p41, %f49, 0f00000000; mov.u16 %rs40, %rs1; @%p41 bra BB0_36; cvt.s64.s32 %rd56, %r30; add.s64 %rd57, %rd3, %rd56; ld.global.nc.u8 %rs40, [%rd57]; BB0_36: setp.gt.u16 %p42, %rs40, %rs1; cvt.u32.u16 %r92, %rs40; and.b32 %r93, %r92, 255; selp.b32 %r94, %r7, %r93, %p42; selp.b32 %r95, %r93, %r7, %p42; add.s32 %r96, %r95, 1; mul.lo.s32 %r97, %r96, %r95; shr.u32 %r98, %r97, 1; add.s32 %r31, %r98, %r94; and.pred %p45, %p41, %p13; @%p45 bra BB0_38; mul.wide.s32 %rd58, %r31, 4; add.s64 %rd59, %rd2, %rd58; ld.global.nc.f32 %f176, [%rd59]; add.f32 %f177, %f176, %f176; add.s64 %rd60, %rd1, %rd58; ld.global.nc.f32 %f178, [%rd60]; div.rn.f32 %f179, %f178, %f177; mul.f32 %f180, %f179, %f88; fma.rn.f32 %f181, %f3, %f180, %f1; mul.f32 %f182, %f1, %f180; sub.f32 %f183, %f3, %f182; selp.f32 %f184, %f181, %f272, %p41; selp.f32 %f185, %f2, %f273, %p41; selp.f32 %f186, %f183, %f274, %p41; mul.f32 %f187, %f88, %f88; div.rn.f32 %f188, %f177, %f187; sub.f32 %f189, %f184, %f1; sub.f32 %f190, %f185, %f2; sub.f32 %f191, %f186, %f3; fma.rn.f32 %f192, %f189, %f188, %f263; fma.rn.f32 %f264, %f190, %f188, %f264; fma.rn.f32 %f193, %f191, %f188, %f265; div.rn.f32 %f194, %f178, %f88; mul.f32 %f195, %f186, %f194; sub.f32 
%f263, %f192, %f195; fma.rn.f32 %f265, %f184, %f194, %f193; BB0_38: setp.eq.s32 %p47, %r45, 1; @%p47 bra BB0_57; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p48, %rs12, 0; add.s32 %r32, %r3, -1; @%p48 bra BB0_41; rem.s32 %r99, %r32, %r45; add.s32 %r100, %r99, %r45; rem.s32 %r126, %r100, %r45; bra.uni BB0_42; BB0_41: mov.u32 %r101, 0; max.s32 %r126, %r32, %r101; BB0_42: mad.lo.s32 %r102, %r126, %r44, %r2; mad.lo.s32 %r36, %r102, %r43, %r1; setp.lt.s32 %p50, %r32, 0; mov.f32 %f278, 0f00000000; and.pred %p51, %p50, %p48; mov.f32 %f279, %f278; mov.f32 %f280, %f278; @%p51 bra BB0_44; mul.wide.s32 %rd61, %r36, 4; add.s64 %rd62, %rd6, %rd61; ld.global.nc.f32 %f278, [%rd62]; add.s64 %rd63, %rd5, %rd61; ld.global.nc.f32 %f279, [%rd63]; add.s64 %rd64, %rd4, %rd61; ld.global.nc.f32 %f280, [%rd64]; BB0_44: mul.f32 %f199, %f279, %f279; fma.rn.f32 %f200, %f278, %f278, %f199; fma.rn.f32 %f62, %f280, %f280, %f200; setp.eq.f32 %p52, %f62, 0f00000000; mov.u16 %rs41, %rs1; @%p52 bra BB0_46; cvt.s64.s32 %rd65, %r36; add.s64 %rd66, %rd3, %rd65; ld.global.nc.u8 %rs41, [%rd66]; BB0_46: setp.gt.u16 %p53, %rs41, %rs1; cvt.u32.u16 %r103, %rs41; and.b32 %r104, %r103, 255; selp.b32 %r105, %r7, %r104, %p53; selp.b32 %r106, %r104, %r7, %p53; add.s32 %r107, %r106, 1; mul.lo.s32 %r108, %r107, %r106; shr.u32 %r109, %r108, 1; add.s32 %r37, %r109, %r105; and.pred %p56, %p52, %p13; @%p56 bra BB0_48; mul.wide.s32 %rd67, %r37, 4; add.s64 %rd68, %rd2, %rd67; ld.global.nc.f32 %f201, [%rd68]; add.f32 %f202, %f201, %f201; add.s64 %rd69, %rd1, %rd67; ld.global.nc.f32 %f203, [%rd69]; div.rn.f32 %f204, %f203, %f202; mul.f32 %f205, %f204, %f89; fma.rn.f32 %f206, %f2, %f205, %f1; mul.f32 %f207, %f1, %f205; sub.f32 %f208, %f2, %f207; selp.f32 %f209, %f206, %f278, %p52; selp.f32 %f210, %f208, %f279, %p52; selp.f32 %f211, %f3, %f280, %p52; mul.f32 %f212, %f89, %f89; div.rn.f32 %f213, %f202, %f212; sub.f32 %f214, %f209, %f1; sub.f32 %f215, %f210, %f2; sub.f32 %f216, %f211, %f3; fma.rn.f32 %f217, %f214, %f213, %f263; 
fma.rn.f32 %f218, %f215, %f213, %f264; fma.rn.f32 %f265, %f216, %f213, %f265; div.rn.f32 %f219, %f203, %f89; mul.f32 %f220, %f210, %f219; sub.f32 %f263, %f217, %f220; fma.rn.f32 %f264, %f209, %f219, %f218; BB0_48: add.s32 %r38, %r3, 1; @%p48 bra BB0_50; rem.s32 %r110, %r38, %r45; add.s32 %r111, %r110, %r45; rem.s32 %r127, %r111, %r45; bra.uni BB0_51; BB0_50: add.s32 %r112, %r45, -1; min.s32 %r127, %r38, %r112; BB0_51: mad.lo.s32 %r113, %r127, %r44, %r2; mad.lo.s32 %r42, %r113, %r43, %r1; setp.ge.s32 %p59, %r38, %r45; mov.f32 %f284, 0f00000000; and.pred %p61, %p59, %p48; mov.f32 %f285, %f284; mov.f32 %f286, %f284; @%p61 bra BB0_53; mul.wide.s32 %rd70, %r42, 4; add.s64 %rd71, %rd6, %rd70; ld.global.nc.f32 %f286, [%rd71]; add.s64 %rd72, %rd5, %rd70; ld.global.nc.f32 %f285, [%rd72]; add.s64 %rd73, %rd4, %rd70; ld.global.nc.f32 %f284, [%rd73]; BB0_53: mul.f32 %f224, %f286, %f286; fma.rn.f32 %f225, %f285, %f285, %f224; fma.rn.f32 %f75, %f284, %f284, %f225; setp.eq.f32 %p62, %f75, 0f00000000; mov.u16 %rs42, %rs1; @%p62 bra BB0_55; cvt.s64.s32 %rd74, %r42; add.s64 %rd75, %rd3, %rd74; ld.global.nc.u8 %rs42, [%rd75]; BB0_55: setp.gt.u16 %p63, %rs42, %rs1; cvt.u32.u16 %r114, %rs42; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p63; selp.b32 %r117, %r115, %r7, %p63; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd76, %r121, 4; add.s64 %rd7, %rd2, %rd76; add.s64 %rd8, %rd1, %rd76; and.pred %p66, %p62, %p13; @%p66 bra BB0_57; ld.global.nc.f32 %f226, [%rd7]; add.f32 %f227, %f226, %f226; ld.global.nc.f32 %f228, [%rd8]; div.rn.f32 %f229, %f228, %f227; mul.f32 %f230, %f229, %f89; mul.f32 %f231, %f2, %f230; sub.f32 %f232, %f1, %f231; fma.rn.f32 %f233, %f1, %f230, %f2; selp.f32 %f234, %f3, %f284, %p62; selp.f32 %f235, %f233, %f285, %p62; selp.f32 %f236, %f232, %f286, %p62; mul.f32 %f237, %f89, %f89; div.rn.f32 %f238, %f227, %f237; sub.f32 %f239, %f236, %f1; sub.f32 %f240, %f235, %f2; sub.f32 
%f241, %f234, %f3; fma.rn.f32 %f242, %f239, %f238, %f263; fma.rn.f32 %f243, %f240, %f238, %f264; fma.rn.f32 %f265, %f241, %f238, %f265; div.rn.f32 %f244, %f228, %f89; fma.rn.f32 %f263, %f235, %f244, %f242; mul.f32 %f245, %f236, %f244; sub.f32 %f264, %f243, %f245; BB0_57: setp.eq.s64 %p68, %rd12, 0; @%p68 bra BB0_59; cvta.to.global.u64 %rd77, %rd12; add.s64 %rd79, %rd77, %rd19; ld.global.nc.f32 %f246, [%rd79]; mul.f32 %f290, %f246, %f290; BB0_59: setp.eq.f32 %p69, %f290, 0f00000000; mov.f32 %f291, 0f00000000; @%p69 bra BB0_61; rcp.rn.f32 %f291, %f290; BB0_61: cvta.to.global.u64 %rd80, %rd11; cvta.to.global.u64 %rd81, %rd10; cvta.to.global.u64 %rd82, %rd9; add.s64 %rd84, %rd82, %rd19; ld.global.f32 %f248, [%rd84]; fma.rn.f32 %f249, %f263, %f291, %f248; st.global.f32 [%rd84], %f249; add.s64 %rd85, %rd81, %rd19; ld.global.f32 %f250, [%rd85]; fma.rn.f32 %f251, %f264, %f291, %f250; st.global.f32 [%rd85], %f251; add.s64 %rd86, %rd80, %rd19; ld.global.f32 %f252, [%rd86]; fma.rn.f32 %f253, %f265, %f291, %f252; st.global.f32 [%rd86], %f253; BB0_62: ret; } ` adddmibulk_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<70>; .reg .b16 %rs<43>; .reg .f32 %f<292>; .reg .b32 %r<128>; .reg .b64 %rd<87>; ld.param.u64 %rd9, [adddmibulk_param_0]; ld.param.u64 %rd10, [adddmibulk_param_1]; ld.param.u64 %rd11, 
[adddmibulk_param_2]; ld.param.u64 %rd13, [adddmibulk_param_3]; ld.param.u64 %rd14, [adddmibulk_param_4]; ld.param.u64 %rd15, [adddmibulk_param_5]; ld.param.u64 %rd12, [adddmibulk_param_6]; ld.param.f32 %f290, [adddmibulk_param_7]; ld.param.u64 %rd16, [adddmibulk_param_8]; ld.param.u64 %rd17, [adddmibulk_param_9]; ld.param.u64 %rd18, [adddmibulk_param_10]; ld.param.f32 %f87, [adddmibulk_param_11]; ld.param.f32 %f88, [adddmibulk_param_12]; ld.param.f32 %f89, [adddmibulk_param_13]; ld.param.u32 %r43, [adddmibulk_param_14]; ld.param.u32 %r44, [adddmibulk_param_15]; ld.param.u32 %r45, [adddmibulk_param_16]; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; cvta.to.global.u64 %rd1, %rd17; cvta.to.global.u64 %rd2, %rd16; cvta.to.global.u64 %rd3, %rd18; cvta.to.global.u64 %rd4, %rd15; cvta.to.global.u64 %rd5, %rd14; cvta.to.global.u64 %rd6, %rd13; mov.u32 %r46, %ntid.x; mov.u32 %r47, %ctaid.x; mov.u32 %r48, %tid.x; mad.lo.s32 %r1, %r46, %r47, %r48; mov.u32 %r49, %ntid.y; mov.u32 %r50, %ctaid.y; mov.u32 %r51, %tid.y; mad.lo.s32 %r2, %r49, %r50, %r51; mov.u32 %r52, %ntid.z; mov.u32 %r53, %ctaid.z; mov.u32 %r54, %tid.z; mad.lo.s32 %r3, %r52, %r53, %r54; setp.ge.s32 %p1, %r2, %r44; setp.ge.s32 %p2, %r1, %r43; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r45; or.pred %p5, %p3, %p4; @%p5 bra BB0_62; mul.lo.s32 %r4, %r3, %r44; add.s32 %r55, %r4, %r2; mul.lo.s32 %r5, %r55, %r43; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd19, %r6, 4; add.s64 %rd20, %rd6, %rd19; cvt.s64.s32 %rd21, %r6; add.s64 %rd22, %rd5, %rd19; add.s64 %rd23, %rd4, %rd19; add.s64 %rd24, %rd3, %rd21; ld.global.nc.u8 %rs1, [%rd24]; cvt.u32.u16 %r56, %rs1; and.b32 %r7, %r56, 255; ld.global.nc.f32 %f1, [%rd20]; ld.global.nc.f32 %f2, [%rd22]; mul.f32 %f90, %f2, %f2; fma.rn.f32 %f91, %f1, %f1, %f90; ld.global.nc.f32 %f3, [%rd23]; fma.rn.f32 %f92, %f3, %f3, %f91; setp.eq.f32 %p6, %f92, 0f00000000; @%p6 bra BB0_62; and.b16 %rs2, %rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 
bra BB0_4; rem.s32 %r57, %r8, %r43; add.s32 %r58, %r57, %r43; rem.s32 %r122, %r58, %r43; bra.uni BB0_5; BB0_4: mov.u32 %r59, 0; max.s32 %r122, %r8, %r59; BB0_5: add.s32 %r12, %r122, %r5; setp.lt.s32 %p9, %r8, 0; mov.f32 %f254, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f255, %f254; mov.f32 %f256, %f254; @%p10 bra BB0_7; mul.wide.s32 %rd25, %r12, 4; add.s64 %rd26, %rd6, %rd25; ld.global.nc.f32 %f254, [%rd26]; add.s64 %rd27, %rd5, %rd25; ld.global.nc.f32 %f255, [%rd27]; add.s64 %rd28, %rd4, %rd25; ld.global.nc.f32 %f256, [%rd28]; BB0_7: mul.f32 %f96, %f255, %f255; fma.rn.f32 %f97, %f254, %f254, %f96; fma.rn.f32 %f10, %f256, %f256, %f97; setp.eq.f32 %p11, %f10, 0f00000000; mov.u16 %rs37, %rs1; @%p11 bra BB0_9; cvt.s64.s32 %rd29, %r12; add.s64 %rd30, %rd3, %rd29; ld.global.nc.u8 %rs37, [%rd30]; BB0_9: setp.gt.u16 %p12, %rs37, %rs1; cvt.u32.u16 %r60, %rs37; and.b32 %r61, %r60, 255; selp.b32 %r62, %r7, %r61, %p12; selp.b32 %r63, %r61, %r7, %p12; add.s32 %r64, %r63, 1; mul.lo.s32 %r65, %r64, %r63; shr.u32 %r66, %r65, 1; add.s32 %r13, %r66, %r62; setp.ne.s16 %p13, %rs18, 0; mov.f32 %f263, 0f00000000; and.pred %p15, %p11, %p13; mov.f32 %f264, %f263; mov.f32 %f265, %f263; @%p15 bra BB0_11; mul.wide.s32 %rd31, %r13, 4; add.s64 %rd32, %rd2, %rd31; ld.global.nc.f32 %f101, [%rd32]; add.f32 %f102, %f101, %f101; add.s64 %rd33, %rd1, %rd31; ld.global.nc.f32 %f103, [%rd33]; div.rn.f32 %f104, %f103, %f102; mul.f32 %f105, %f104, %f87; fma.rn.f32 %f106, %f3, %f105, %f2; mul.f32 %f107, %f2, %f105; sub.f32 %f108, %f3, %f107; selp.f32 %f109, %f1, %f254, %p11; selp.f32 %f110, %f106, %f255, %p11; selp.f32 %f111, %f108, %f256, %p11; mul.f32 %f112, %f87, %f87; div.rn.f32 %f113, %f102, %f112; sub.f32 %f114, %f109, %f1; sub.f32 %f115, %f110, %f2; sub.f32 %f116, %f111, %f3; fma.rn.f32 %f263, %f114, %f113, 0f00000000; fma.rn.f32 %f117, %f115, %f113, 0f00000000; fma.rn.f32 %f118, %f116, %f113, 0f00000000; div.rn.f32 %f119, %f103, %f87; mul.f32 %f120, %f111, %f119; sub.f32 %f264, %f117, %f120; 
fma.rn.f32 %f265, %f110, %f119, %f118; BB0_11: add.s32 %r14, %r1, 1; @%p7 bra BB0_13; rem.s32 %r67, %r14, %r43; add.s32 %r68, %r67, %r43; rem.s32 %r123, %r68, %r43; bra.uni BB0_14; BB0_13: add.s32 %r69, %r43, -1; min.s32 %r123, %r14, %r69; BB0_14: add.s32 %r18, %r123, %r5; setp.ge.s32 %p18, %r14, %r43; mov.f32 %f260, 0f00000000; and.pred %p20, %p18, %p7; mov.f32 %f261, %f260; mov.f32 %f262, %f260; @%p20 bra BB0_16; mul.wide.s32 %rd34, %r18, 4; add.s64 %rd35, %rd6, %rd34; ld.global.nc.f32 %f260, [%rd35]; add.s64 %rd36, %rd5, %rd34; ld.global.nc.f32 %f261, [%rd36]; add.s64 %rd37, %rd4, %rd34; ld.global.nc.f32 %f262, [%rd37]; BB0_16: mul.f32 %f124, %f261, %f261; fma.rn.f32 %f125, %f260, %f260, %f124; fma.rn.f32 %f23, %f262, %f262, %f125; setp.eq.f32 %p21, %f23, 0f00000000; mov.u16 %rs38, %rs1; @%p21 bra BB0_18; cvt.s64.s32 %rd38, %r18; add.s64 %rd39, %rd3, %rd38; ld.global.nc.u8 %rs38, [%rd39]; BB0_18: setp.gt.u16 %p22, %rs38, %rs1; cvt.u32.u16 %r70, %rs38; and.b32 %r71, %r70, 255; selp.b32 %r72, %r7, %r71, %p22; selp.b32 %r73, %r71, %r7, %p22; add.s32 %r74, %r73, 1; mul.lo.s32 %r75, %r74, %r73; shr.u32 %r76, %r75, 1; add.s32 %r19, %r76, %r72; and.pred %p25, %p21, %p13; @%p25 bra BB0_20; mul.wide.s32 %rd40, %r19, 4; add.s64 %rd41, %rd2, %rd40; ld.global.nc.f32 %f126, [%rd41]; add.f32 %f127, %f126, %f126; add.s64 %rd42, %rd1, %rd40; ld.global.nc.f32 %f128, [%rd42]; div.rn.f32 %f129, %f128, %f127; mul.f32 %f130, %f129, %f87; mul.f32 %f131, %f3, %f130; sub.f32 %f132, %f2, %f131; fma.rn.f32 %f133, %f2, %f130, %f3; selp.f32 %f134, %f1, %f260, %p21; selp.f32 %f135, %f132, %f261, %p21; selp.f32 %f136, %f133, %f262, %p21; mul.f32 %f137, %f87, %f87; div.rn.f32 %f138, %f127, %f137; sub.f32 %f139, %f134, %f1; sub.f32 %f140, %f135, %f2; sub.f32 %f141, %f136, %f3; fma.rn.f32 %f263, %f139, %f138, %f263; fma.rn.f32 %f142, %f140, %f138, %f264; fma.rn.f32 %f143, %f141, %f138, %f265; div.rn.f32 %f144, %f128, %f87; fma.rn.f32 %f264, %f136, %f144, %f142; mul.f32 %f145, %f135, %f144; 
sub.f32 %f265, %f143, %f145; BB0_20: and.b16 %rs7, %rs17, 2; setp.eq.s16 %p27, %rs7, 0; add.s32 %r20, %r2, -1; @%p27 bra BB0_22; rem.s32 %r77, %r20, %r44; add.s32 %r78, %r77, %r44; rem.s32 %r124, %r78, %r44; bra.uni BB0_23; BB0_22: mov.u32 %r79, 0; max.s32 %r124, %r20, %r79; BB0_23: add.s32 %r80, %r124, %r4; mad.lo.s32 %r24, %r80, %r43, %r1; setp.lt.s32 %p29, %r20, 0; mov.f32 %f266, 0f00000000; and.pred %p30, %p29, %p27; mov.f32 %f267, %f266; mov.f32 %f268, %f266; @%p30 bra BB0_25; mul.wide.s32 %rd43, %r24, 4; add.s64 %rd44, %rd6, %rd43; ld.global.nc.f32 %f266, [%rd44]; add.s64 %rd45, %rd5, %rd43; ld.global.nc.f32 %f267, [%rd45]; add.s64 %rd46, %rd4, %rd43; ld.global.nc.f32 %f268, [%rd46]; BB0_25: mul.f32 %f149, %f267, %f267; fma.rn.f32 %f150, %f266, %f266, %f149; fma.rn.f32 %f36, %f268, %f268, %f150; setp.eq.f32 %p31, %f36, 0f00000000; mov.u16 %rs39, %rs1; @%p31 bra BB0_27; cvt.s64.s32 %rd47, %r24; add.s64 %rd48, %rd3, %rd47; ld.global.nc.u8 %rs39, [%rd48]; BB0_27: setp.gt.u16 %p32, %rs39, %rs1; cvt.u32.u16 %r81, %rs39; and.b32 %r82, %r81, 255; selp.b32 %r83, %r7, %r82, %p32; selp.b32 %r84, %r82, %r7, %p32; add.s32 %r85, %r84, 1; mul.lo.s32 %r86, %r85, %r84; shr.u32 %r87, %r86, 1; add.s32 %r25, %r87, %r83; and.pred %p35, %p31, %p13; @%p35 bra BB0_29; mul.wide.s32 %rd49, %r25, 4; add.s64 %rd50, %rd2, %rd49; ld.global.nc.f32 %f151, [%rd50]; add.f32 %f152, %f151, %f151; add.s64 %rd51, %rd1, %rd49; ld.global.nc.f32 %f153, [%rd51]; div.rn.f32 %f154, %f153, %f152; mul.f32 %f155, %f154, %f88; mul.f32 %f156, %f3, %f155; sub.f32 %f157, %f1, %f156; fma.rn.f32 %f158, %f1, %f155, %f3; selp.f32 %f159, %f157, %f266, %p31; selp.f32 %f160, %f2, %f267, %p31; selp.f32 %f161, %f158, %f268, %p31; mul.f32 %f162, %f88, %f88; div.rn.f32 %f163, %f152, %f162; sub.f32 %f164, %f159, %f1; sub.f32 %f165, %f160, %f2; sub.f32 %f166, %f161, %f3; fma.rn.f32 %f167, %f164, %f163, %f263; fma.rn.f32 %f264, %f165, %f163, %f264; fma.rn.f32 %f168, %f166, %f163, %f265; div.rn.f32 %f169, %f153, %f88; 
fma.rn.f32 %f263, %f161, %f169, %f167; mul.f32 %f170, %f159, %f169; sub.f32 %f265, %f168, %f170; BB0_29: add.s32 %r26, %r2, 1; @%p27 bra BB0_31; rem.s32 %r88, %r26, %r44; add.s32 %r89, %r88, %r44; rem.s32 %r125, %r89, %r44; bra.uni BB0_32; BB0_31: add.s32 %r90, %r44, -1; min.s32 %r125, %r26, %r90; BB0_32: add.s32 %r91, %r125, %r4; mad.lo.s32 %r30, %r91, %r43, %r1; setp.ge.s32 %p38, %r26, %r44; mov.f32 %f272, 0f00000000; and.pred %p40, %p38, %p27; mov.f32 %f273, %f272; mov.f32 %f274, %f272; @%p40 bra BB0_34; mul.wide.s32 %rd52, %r30, 4; add.s64 %rd53, %rd6, %rd52; ld.global.nc.f32 %f272, [%rd53]; add.s64 %rd54, %rd5, %rd52; ld.global.nc.f32 %f273, [%rd54]; add.s64 %rd55, %rd4, %rd52; ld.global.nc.f32 %f274, [%rd55]; BB0_34: mul.f32 %f174, %f273, %f273; fma.rn.f32 %f175, %f272, %f272, %f174; fma.rn.f32 %f49, %f274, %f274, %f175; setp.eq.f32 %p41, %f49, 0f00000000; mov.u16 %rs40, %rs1; @%p41 bra BB0_36; cvt.s64.s32 %rd56, %r30; add.s64 %rd57, %rd3, %rd56; ld.global.nc.u8 %rs40, [%rd57]; BB0_36: setp.gt.u16 %p42, %rs40, %rs1; cvt.u32.u16 %r92, %rs40; and.b32 %r93, %r92, 255; selp.b32 %r94, %r7, %r93, %p42; selp.b32 %r95, %r93, %r7, %p42; add.s32 %r96, %r95, 1; mul.lo.s32 %r97, %r96, %r95; shr.u32 %r98, %r97, 1; add.s32 %r31, %r98, %r94; and.pred %p45, %p41, %p13; @%p45 bra BB0_38; mul.wide.s32 %rd58, %r31, 4; add.s64 %rd59, %rd2, %rd58; ld.global.nc.f32 %f176, [%rd59]; add.f32 %f177, %f176, %f176; add.s64 %rd60, %rd1, %rd58; ld.global.nc.f32 %f178, [%rd60]; div.rn.f32 %f179, %f178, %f177; mul.f32 %f180, %f179, %f88; fma.rn.f32 %f181, %f3, %f180, %f1; mul.f32 %f182, %f1, %f180; sub.f32 %f183, %f3, %f182; selp.f32 %f184, %f181, %f272, %p41; selp.f32 %f185, %f2, %f273, %p41; selp.f32 %f186, %f183, %f274, %p41; mul.f32 %f187, %f88, %f88; div.rn.f32 %f188, %f177, %f187; sub.f32 %f189, %f184, %f1; sub.f32 %f190, %f185, %f2; sub.f32 %f191, %f186, %f3; fma.rn.f32 %f192, %f189, %f188, %f263; fma.rn.f32 %f264, %f190, %f188, %f264; fma.rn.f32 %f193, %f191, %f188, %f265; 
div.rn.f32 %f194, %f178, %f88; mul.f32 %f195, %f186, %f194; sub.f32 %f263, %f192, %f195; fma.rn.f32 %f265, %f184, %f194, %f193; BB0_38: setp.eq.s32 %p47, %r45, 1; @%p47 bra BB0_57; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p48, %rs12, 0; add.s32 %r32, %r3, -1; @%p48 bra BB0_41; rem.s32 %r99, %r32, %r45; add.s32 %r100, %r99, %r45; rem.s32 %r126, %r100, %r45; bra.uni BB0_42; BB0_41: mov.u32 %r101, 0; max.s32 %r126, %r32, %r101; BB0_42: mad.lo.s32 %r102, %r126, %r44, %r2; mad.lo.s32 %r36, %r102, %r43, %r1; setp.lt.s32 %p50, %r32, 0; mov.f32 %f278, 0f00000000; and.pred %p51, %p50, %p48; mov.f32 %f279, %f278; mov.f32 %f280, %f278; @%p51 bra BB0_44; mul.wide.s32 %rd61, %r36, 4; add.s64 %rd62, %rd6, %rd61; ld.global.nc.f32 %f278, [%rd62]; add.s64 %rd63, %rd5, %rd61; ld.global.nc.f32 %f279, [%rd63]; add.s64 %rd64, %rd4, %rd61; ld.global.nc.f32 %f280, [%rd64]; BB0_44: mul.f32 %f199, %f279, %f279; fma.rn.f32 %f200, %f278, %f278, %f199; fma.rn.f32 %f62, %f280, %f280, %f200; setp.eq.f32 %p52, %f62, 0f00000000; mov.u16 %rs41, %rs1; @%p52 bra BB0_46; cvt.s64.s32 %rd65, %r36; add.s64 %rd66, %rd3, %rd65; ld.global.nc.u8 %rs41, [%rd66]; BB0_46: setp.gt.u16 %p53, %rs41, %rs1; cvt.u32.u16 %r103, %rs41; and.b32 %r104, %r103, 255; selp.b32 %r105, %r7, %r104, %p53; selp.b32 %r106, %r104, %r7, %p53; add.s32 %r107, %r106, 1; mul.lo.s32 %r108, %r107, %r106; shr.u32 %r109, %r108, 1; add.s32 %r37, %r109, %r105; and.pred %p56, %p52, %p13; @%p56 bra BB0_48; mul.wide.s32 %rd67, %r37, 4; add.s64 %rd68, %rd2, %rd67; ld.global.nc.f32 %f201, [%rd68]; add.f32 %f202, %f201, %f201; add.s64 %rd69, %rd1, %rd67; ld.global.nc.f32 %f203, [%rd69]; div.rn.f32 %f204, %f203, %f202; mul.f32 %f205, %f204, %f89; fma.rn.f32 %f206, %f2, %f205, %f1; mul.f32 %f207, %f1, %f205; sub.f32 %f208, %f2, %f207; selp.f32 %f209, %f206, %f278, %p52; selp.f32 %f210, %f208, %f279, %p52; selp.f32 %f211, %f3, %f280, %p52; mul.f32 %f212, %f89, %f89; div.rn.f32 %f213, %f202, %f212; sub.f32 %f214, %f209, %f1; sub.f32 %f215, %f210, %f2; 
sub.f32 %f216, %f211, %f3; fma.rn.f32 %f217, %f214, %f213, %f263; fma.rn.f32 %f218, %f215, %f213, %f264; fma.rn.f32 %f265, %f216, %f213, %f265; div.rn.f32 %f219, %f203, %f89; mul.f32 %f220, %f210, %f219; sub.f32 %f263, %f217, %f220; fma.rn.f32 %f264, %f209, %f219, %f218; BB0_48: add.s32 %r38, %r3, 1; @%p48 bra BB0_50; rem.s32 %r110, %r38, %r45; add.s32 %r111, %r110, %r45; rem.s32 %r127, %r111, %r45; bra.uni BB0_51; BB0_50: add.s32 %r112, %r45, -1; min.s32 %r127, %r38, %r112; BB0_51: mad.lo.s32 %r113, %r127, %r44, %r2; mad.lo.s32 %r42, %r113, %r43, %r1; setp.ge.s32 %p59, %r38, %r45; mov.f32 %f284, 0f00000000; and.pred %p61, %p59, %p48; mov.f32 %f285, %f284; mov.f32 %f286, %f284; @%p61 bra BB0_53; mul.wide.s32 %rd70, %r42, 4; add.s64 %rd71, %rd6, %rd70; ld.global.nc.f32 %f286, [%rd71]; add.s64 %rd72, %rd5, %rd70; ld.global.nc.f32 %f285, [%rd72]; add.s64 %rd73, %rd4, %rd70; ld.global.nc.f32 %f284, [%rd73]; BB0_53: mul.f32 %f224, %f286, %f286; fma.rn.f32 %f225, %f285, %f285, %f224; fma.rn.f32 %f75, %f284, %f284, %f225; setp.eq.f32 %p62, %f75, 0f00000000; mov.u16 %rs42, %rs1; @%p62 bra BB0_55; cvt.s64.s32 %rd74, %r42; add.s64 %rd75, %rd3, %rd74; ld.global.nc.u8 %rs42, [%rd75]; BB0_55: setp.gt.u16 %p63, %rs42, %rs1; cvt.u32.u16 %r114, %rs42; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p63; selp.b32 %r117, %r115, %r7, %p63; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd76, %r121, 4; add.s64 %rd7, %rd2, %rd76; add.s64 %rd8, %rd1, %rd76; and.pred %p66, %p62, %p13; @%p66 bra BB0_57; ld.global.nc.f32 %f226, [%rd7]; add.f32 %f227, %f226, %f226; ld.global.nc.f32 %f228, [%rd8]; div.rn.f32 %f229, %f228, %f227; mul.f32 %f230, %f229, %f89; mul.f32 %f231, %f2, %f230; sub.f32 %f232, %f1, %f231; fma.rn.f32 %f233, %f1, %f230, %f2; selp.f32 %f234, %f3, %f284, %p62; selp.f32 %f235, %f233, %f285, %p62; selp.f32 %f236, %f232, %f286, %p62; mul.f32 %f237, %f89, %f89; div.rn.f32 %f238, %f227, %f237; 
sub.f32 %f239, %f236, %f1; sub.f32 %f240, %f235, %f2; sub.f32 %f241, %f234, %f3; fma.rn.f32 %f242, %f239, %f238, %f263; fma.rn.f32 %f243, %f240, %f238, %f264; fma.rn.f32 %f265, %f241, %f238, %f265; div.rn.f32 %f244, %f228, %f89; fma.rn.f32 %f263, %f235, %f244, %f242; mul.f32 %f245, %f236, %f244; sub.f32 %f264, %f243, %f245; BB0_57: setp.eq.s64 %p68, %rd12, 0; @%p68 bra BB0_59; cvta.to.global.u64 %rd77, %rd12; add.s64 %rd79, %rd77, %rd19; ld.global.nc.f32 %f246, [%rd79]; mul.f32 %f290, %f246, %f290; BB0_59: setp.eq.f32 %p69, %f290, 0f00000000; mov.f32 %f291, 0f00000000; @%p69 bra BB0_61; rcp.rn.f32 %f291, %f290; BB0_61: cvta.to.global.u64 %rd80, %rd11; cvta.to.global.u64 %rd81, %rd10; cvta.to.global.u64 %rd82, %rd9; add.s64 %rd84, %rd82, %rd19; ld.global.f32 %f248, [%rd84]; fma.rn.f32 %f249, %f263, %f291, %f248; st.global.f32 [%rd84], %f249; add.s64 %rd85, %rd81, %rd19; ld.global.f32 %f250, [%rd85]; fma.rn.f32 %f251, %f264, %f291, %f250; st.global.f32 [%rd85], %f251; add.s64 %rd86, %rd80, %rd19; ld.global.f32 %f252, [%rd86]; fma.rn.f32 %f253, %f265, %f291, %f252; st.global.f32 [%rd86], %f253; BB0_62: ret; } ` adddmibulk_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<70>; .reg .b16 %rs<43>; .reg .f32 %f<292>; .reg .b32 %r<128>; .reg .b64 %rd<87>; ld.param.u64 %rd9, [adddmibulk_param_0]; 
ld.param.u64 %rd10, [adddmibulk_param_1]; ld.param.u64 %rd11, [adddmibulk_param_2]; ld.param.u64 %rd13, [adddmibulk_param_3]; ld.param.u64 %rd14, [adddmibulk_param_4]; ld.param.u64 %rd15, [adddmibulk_param_5]; ld.param.u64 %rd12, [adddmibulk_param_6]; ld.param.f32 %f290, [adddmibulk_param_7]; ld.param.u64 %rd16, [adddmibulk_param_8]; ld.param.u64 %rd17, [adddmibulk_param_9]; ld.param.u64 %rd18, [adddmibulk_param_10]; ld.param.f32 %f87, [adddmibulk_param_11]; ld.param.f32 %f88, [adddmibulk_param_12]; ld.param.f32 %f89, [adddmibulk_param_13]; ld.param.u32 %r43, [adddmibulk_param_14]; ld.param.u32 %r44, [adddmibulk_param_15]; ld.param.u32 %r45, [adddmibulk_param_16]; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; cvta.to.global.u64 %rd1, %rd17; cvta.to.global.u64 %rd2, %rd16; cvta.to.global.u64 %rd3, %rd18; cvta.to.global.u64 %rd4, %rd15; cvta.to.global.u64 %rd5, %rd14; cvta.to.global.u64 %rd6, %rd13; mov.u32 %r46, %ntid.x; mov.u32 %r47, %ctaid.x; mov.u32 %r48, %tid.x; mad.lo.s32 %r1, %r46, %r47, %r48; mov.u32 %r49, %ntid.y; mov.u32 %r50, %ctaid.y; mov.u32 %r51, %tid.y; mad.lo.s32 %r2, %r49, %r50, %r51; mov.u32 %r52, %ntid.z; mov.u32 %r53, %ctaid.z; mov.u32 %r54, %tid.z; mad.lo.s32 %r3, %r52, %r53, %r54; setp.ge.s32 %p1, %r2, %r44; setp.ge.s32 %p2, %r1, %r43; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r45; or.pred %p5, %p3, %p4; @%p5 bra BB0_62; mul.lo.s32 %r4, %r3, %r44; add.s32 %r55, %r4, %r2; mul.lo.s32 %r5, %r55, %r43; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd19, %r6, 4; add.s64 %rd20, %rd6, %rd19; cvt.s64.s32 %rd21, %r6; add.s64 %rd22, %rd5, %rd19; add.s64 %rd23, %rd4, %rd19; add.s64 %rd24, %rd3, %rd21; ld.global.nc.u8 %rs1, [%rd24]; cvt.u32.u16 %r56, %rs1; and.b32 %r7, %r56, 255; ld.global.nc.f32 %f1, [%rd20]; ld.global.nc.f32 %f2, [%rd22]; mul.f32 %f90, %f2, %f2; fma.rn.f32 %f91, %f1, %f1, %f90; ld.global.nc.f32 %f3, [%rd23]; fma.rn.f32 %f92, %f3, %f3, %f91; setp.eq.f32 %p6, %f92, 0f00000000; @%p6 bra BB0_62; and.b16 %rs2, 
%rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r57, %r8, %r43; add.s32 %r58, %r57, %r43; rem.s32 %r122, %r58, %r43; bra.uni BB0_5; BB0_4: mov.u32 %r59, 0; max.s32 %r122, %r8, %r59; BB0_5: add.s32 %r12, %r122, %r5; setp.lt.s32 %p9, %r8, 0; mov.f32 %f254, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f255, %f254; mov.f32 %f256, %f254; @%p10 bra BB0_7; mul.wide.s32 %rd25, %r12, 4; add.s64 %rd26, %rd6, %rd25; ld.global.nc.f32 %f254, [%rd26]; add.s64 %rd27, %rd5, %rd25; ld.global.nc.f32 %f255, [%rd27]; add.s64 %rd28, %rd4, %rd25; ld.global.nc.f32 %f256, [%rd28]; BB0_7: mul.f32 %f96, %f255, %f255; fma.rn.f32 %f97, %f254, %f254, %f96; fma.rn.f32 %f10, %f256, %f256, %f97; setp.eq.f32 %p11, %f10, 0f00000000; mov.u16 %rs37, %rs1; @%p11 bra BB0_9; cvt.s64.s32 %rd29, %r12; add.s64 %rd30, %rd3, %rd29; ld.global.nc.u8 %rs37, [%rd30]; BB0_9: setp.gt.u16 %p12, %rs37, %rs1; cvt.u32.u16 %r60, %rs37; and.b32 %r61, %r60, 255; selp.b32 %r62, %r7, %r61, %p12; selp.b32 %r63, %r61, %r7, %p12; add.s32 %r64, %r63, 1; mul.lo.s32 %r65, %r64, %r63; shr.u32 %r66, %r65, 1; add.s32 %r13, %r66, %r62; setp.ne.s16 %p13, %rs18, 0; mov.f32 %f263, 0f00000000; and.pred %p15, %p11, %p13; mov.f32 %f264, %f263; mov.f32 %f265, %f263; @%p15 bra BB0_11; mul.wide.s32 %rd31, %r13, 4; add.s64 %rd32, %rd2, %rd31; ld.global.nc.f32 %f101, [%rd32]; add.f32 %f102, %f101, %f101; add.s64 %rd33, %rd1, %rd31; ld.global.nc.f32 %f103, [%rd33]; div.rn.f32 %f104, %f103, %f102; mul.f32 %f105, %f104, %f87; fma.rn.f32 %f106, %f3, %f105, %f2; mul.f32 %f107, %f2, %f105; sub.f32 %f108, %f3, %f107; selp.f32 %f109, %f1, %f254, %p11; selp.f32 %f110, %f106, %f255, %p11; selp.f32 %f111, %f108, %f256, %p11; mul.f32 %f112, %f87, %f87; div.rn.f32 %f113, %f102, %f112; sub.f32 %f114, %f109, %f1; sub.f32 %f115, %f110, %f2; sub.f32 %f116, %f111, %f3; fma.rn.f32 %f263, %f114, %f113, 0f00000000; fma.rn.f32 %f117, %f115, %f113, 0f00000000; fma.rn.f32 %f118, %f116, %f113, 0f00000000; div.rn.f32 %f119, %f103, 
%f87; mul.f32 %f120, %f111, %f119; sub.f32 %f264, %f117, %f120; fma.rn.f32 %f265, %f110, %f119, %f118; BB0_11: add.s32 %r14, %r1, 1; @%p7 bra BB0_13; rem.s32 %r67, %r14, %r43; add.s32 %r68, %r67, %r43; rem.s32 %r123, %r68, %r43; bra.uni BB0_14; BB0_13: add.s32 %r69, %r43, -1; min.s32 %r123, %r14, %r69; BB0_14: add.s32 %r18, %r123, %r5; setp.ge.s32 %p18, %r14, %r43; mov.f32 %f260, 0f00000000; and.pred %p20, %p18, %p7; mov.f32 %f261, %f260; mov.f32 %f262, %f260; @%p20 bra BB0_16; mul.wide.s32 %rd34, %r18, 4; add.s64 %rd35, %rd6, %rd34; ld.global.nc.f32 %f260, [%rd35]; add.s64 %rd36, %rd5, %rd34; ld.global.nc.f32 %f261, [%rd36]; add.s64 %rd37, %rd4, %rd34; ld.global.nc.f32 %f262, [%rd37]; BB0_16: mul.f32 %f124, %f261, %f261; fma.rn.f32 %f125, %f260, %f260, %f124; fma.rn.f32 %f23, %f262, %f262, %f125; setp.eq.f32 %p21, %f23, 0f00000000; mov.u16 %rs38, %rs1; @%p21 bra BB0_18; cvt.s64.s32 %rd38, %r18; add.s64 %rd39, %rd3, %rd38; ld.global.nc.u8 %rs38, [%rd39]; BB0_18: setp.gt.u16 %p22, %rs38, %rs1; cvt.u32.u16 %r70, %rs38; and.b32 %r71, %r70, 255; selp.b32 %r72, %r7, %r71, %p22; selp.b32 %r73, %r71, %r7, %p22; add.s32 %r74, %r73, 1; mul.lo.s32 %r75, %r74, %r73; shr.u32 %r76, %r75, 1; add.s32 %r19, %r76, %r72; and.pred %p25, %p21, %p13; @%p25 bra BB0_20; mul.wide.s32 %rd40, %r19, 4; add.s64 %rd41, %rd2, %rd40; ld.global.nc.f32 %f126, [%rd41]; add.f32 %f127, %f126, %f126; add.s64 %rd42, %rd1, %rd40; ld.global.nc.f32 %f128, [%rd42]; div.rn.f32 %f129, %f128, %f127; mul.f32 %f130, %f129, %f87; mul.f32 %f131, %f3, %f130; sub.f32 %f132, %f2, %f131; fma.rn.f32 %f133, %f2, %f130, %f3; selp.f32 %f134, %f1, %f260, %p21; selp.f32 %f135, %f132, %f261, %p21; selp.f32 %f136, %f133, %f262, %p21; mul.f32 %f137, %f87, %f87; div.rn.f32 %f138, %f127, %f137; sub.f32 %f139, %f134, %f1; sub.f32 %f140, %f135, %f2; sub.f32 %f141, %f136, %f3; fma.rn.f32 %f263, %f139, %f138, %f263; fma.rn.f32 %f142, %f140, %f138, %f264; fma.rn.f32 %f143, %f141, %f138, %f265; div.rn.f32 %f144, %f128, %f87; 
fma.rn.f32 %f264, %f136, %f144, %f142; mul.f32 %f145, %f135, %f144; sub.f32 %f265, %f143, %f145; BB0_20: and.b16 %rs7, %rs17, 2; setp.eq.s16 %p27, %rs7, 0; add.s32 %r20, %r2, -1; @%p27 bra BB0_22; rem.s32 %r77, %r20, %r44; add.s32 %r78, %r77, %r44; rem.s32 %r124, %r78, %r44; bra.uni BB0_23; BB0_22: mov.u32 %r79, 0; max.s32 %r124, %r20, %r79; BB0_23: add.s32 %r80, %r124, %r4; mad.lo.s32 %r24, %r80, %r43, %r1; setp.lt.s32 %p29, %r20, 0; mov.f32 %f266, 0f00000000; and.pred %p30, %p29, %p27; mov.f32 %f267, %f266; mov.f32 %f268, %f266; @%p30 bra BB0_25; mul.wide.s32 %rd43, %r24, 4; add.s64 %rd44, %rd6, %rd43; ld.global.nc.f32 %f266, [%rd44]; add.s64 %rd45, %rd5, %rd43; ld.global.nc.f32 %f267, [%rd45]; add.s64 %rd46, %rd4, %rd43; ld.global.nc.f32 %f268, [%rd46]; BB0_25: mul.f32 %f149, %f267, %f267; fma.rn.f32 %f150, %f266, %f266, %f149; fma.rn.f32 %f36, %f268, %f268, %f150; setp.eq.f32 %p31, %f36, 0f00000000; mov.u16 %rs39, %rs1; @%p31 bra BB0_27; cvt.s64.s32 %rd47, %r24; add.s64 %rd48, %rd3, %rd47; ld.global.nc.u8 %rs39, [%rd48]; BB0_27: setp.gt.u16 %p32, %rs39, %rs1; cvt.u32.u16 %r81, %rs39; and.b32 %r82, %r81, 255; selp.b32 %r83, %r7, %r82, %p32; selp.b32 %r84, %r82, %r7, %p32; add.s32 %r85, %r84, 1; mul.lo.s32 %r86, %r85, %r84; shr.u32 %r87, %r86, 1; add.s32 %r25, %r87, %r83; and.pred %p35, %p31, %p13; @%p35 bra BB0_29; mul.wide.s32 %rd49, %r25, 4; add.s64 %rd50, %rd2, %rd49; ld.global.nc.f32 %f151, [%rd50]; add.f32 %f152, %f151, %f151; add.s64 %rd51, %rd1, %rd49; ld.global.nc.f32 %f153, [%rd51]; div.rn.f32 %f154, %f153, %f152; mul.f32 %f155, %f154, %f88; mul.f32 %f156, %f3, %f155; sub.f32 %f157, %f1, %f156; fma.rn.f32 %f158, %f1, %f155, %f3; selp.f32 %f159, %f157, %f266, %p31; selp.f32 %f160, %f2, %f267, %p31; selp.f32 %f161, %f158, %f268, %p31; mul.f32 %f162, %f88, %f88; div.rn.f32 %f163, %f152, %f162; sub.f32 %f164, %f159, %f1; sub.f32 %f165, %f160, %f2; sub.f32 %f166, %f161, %f3; fma.rn.f32 %f167, %f164, %f163, %f263; fma.rn.f32 %f264, %f165, %f163, %f264; 
fma.rn.f32 %f168, %f166, %f163, %f265; div.rn.f32 %f169, %f153, %f88; fma.rn.f32 %f263, %f161, %f169, %f167; mul.f32 %f170, %f159, %f169; sub.f32 %f265, %f168, %f170; BB0_29: add.s32 %r26, %r2, 1; @%p27 bra BB0_31; rem.s32 %r88, %r26, %r44; add.s32 %r89, %r88, %r44; rem.s32 %r125, %r89, %r44; bra.uni BB0_32; BB0_31: add.s32 %r90, %r44, -1; min.s32 %r125, %r26, %r90; BB0_32: add.s32 %r91, %r125, %r4; mad.lo.s32 %r30, %r91, %r43, %r1; setp.ge.s32 %p38, %r26, %r44; mov.f32 %f272, 0f00000000; and.pred %p40, %p38, %p27; mov.f32 %f273, %f272; mov.f32 %f274, %f272; @%p40 bra BB0_34; mul.wide.s32 %rd52, %r30, 4; add.s64 %rd53, %rd6, %rd52; ld.global.nc.f32 %f272, [%rd53]; add.s64 %rd54, %rd5, %rd52; ld.global.nc.f32 %f273, [%rd54]; add.s64 %rd55, %rd4, %rd52; ld.global.nc.f32 %f274, [%rd55]; BB0_34: mul.f32 %f174, %f273, %f273; fma.rn.f32 %f175, %f272, %f272, %f174; fma.rn.f32 %f49, %f274, %f274, %f175; setp.eq.f32 %p41, %f49, 0f00000000; mov.u16 %rs40, %rs1; @%p41 bra BB0_36; cvt.s64.s32 %rd56, %r30; add.s64 %rd57, %rd3, %rd56; ld.global.nc.u8 %rs40, [%rd57]; BB0_36: setp.gt.u16 %p42, %rs40, %rs1; cvt.u32.u16 %r92, %rs40; and.b32 %r93, %r92, 255; selp.b32 %r94, %r7, %r93, %p42; selp.b32 %r95, %r93, %r7, %p42; add.s32 %r96, %r95, 1; mul.lo.s32 %r97, %r96, %r95; shr.u32 %r98, %r97, 1; add.s32 %r31, %r98, %r94; and.pred %p45, %p41, %p13; @%p45 bra BB0_38; mul.wide.s32 %rd58, %r31, 4; add.s64 %rd59, %rd2, %rd58; ld.global.nc.f32 %f176, [%rd59]; add.f32 %f177, %f176, %f176; add.s64 %rd60, %rd1, %rd58; ld.global.nc.f32 %f178, [%rd60]; div.rn.f32 %f179, %f178, %f177; mul.f32 %f180, %f179, %f88; fma.rn.f32 %f181, %f3, %f180, %f1; mul.f32 %f182, %f1, %f180; sub.f32 %f183, %f3, %f182; selp.f32 %f184, %f181, %f272, %p41; selp.f32 %f185, %f2, %f273, %p41; selp.f32 %f186, %f183, %f274, %p41; mul.f32 %f187, %f88, %f88; div.rn.f32 %f188, %f177, %f187; sub.f32 %f189, %f184, %f1; sub.f32 %f190, %f185, %f2; sub.f32 %f191, %f186, %f3; fma.rn.f32 %f192, %f189, %f188, %f263; fma.rn.f32 %f264, 
%f190, %f188, %f264; fma.rn.f32 %f193, %f191, %f188, %f265; div.rn.f32 %f194, %f178, %f88; mul.f32 %f195, %f186, %f194; sub.f32 %f263, %f192, %f195; fma.rn.f32 %f265, %f184, %f194, %f193; BB0_38: setp.eq.s32 %p47, %r45, 1; @%p47 bra BB0_57; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p48, %rs12, 0; add.s32 %r32, %r3, -1; @%p48 bra BB0_41; rem.s32 %r99, %r32, %r45; add.s32 %r100, %r99, %r45; rem.s32 %r126, %r100, %r45; bra.uni BB0_42; BB0_41: mov.u32 %r101, 0; max.s32 %r126, %r32, %r101; BB0_42: mad.lo.s32 %r102, %r126, %r44, %r2; mad.lo.s32 %r36, %r102, %r43, %r1; setp.lt.s32 %p50, %r32, 0; mov.f32 %f278, 0f00000000; and.pred %p51, %p50, %p48; mov.f32 %f279, %f278; mov.f32 %f280, %f278; @%p51 bra BB0_44; mul.wide.s32 %rd61, %r36, 4; add.s64 %rd62, %rd6, %rd61; ld.global.nc.f32 %f278, [%rd62]; add.s64 %rd63, %rd5, %rd61; ld.global.nc.f32 %f279, [%rd63]; add.s64 %rd64, %rd4, %rd61; ld.global.nc.f32 %f280, [%rd64]; BB0_44: mul.f32 %f199, %f279, %f279; fma.rn.f32 %f200, %f278, %f278, %f199; fma.rn.f32 %f62, %f280, %f280, %f200; setp.eq.f32 %p52, %f62, 0f00000000; mov.u16 %rs41, %rs1; @%p52 bra BB0_46; cvt.s64.s32 %rd65, %r36; add.s64 %rd66, %rd3, %rd65; ld.global.nc.u8 %rs41, [%rd66]; BB0_46: setp.gt.u16 %p53, %rs41, %rs1; cvt.u32.u16 %r103, %rs41; and.b32 %r104, %r103, 255; selp.b32 %r105, %r7, %r104, %p53; selp.b32 %r106, %r104, %r7, %p53; add.s32 %r107, %r106, 1; mul.lo.s32 %r108, %r107, %r106; shr.u32 %r109, %r108, 1; add.s32 %r37, %r109, %r105; and.pred %p56, %p52, %p13; @%p56 bra BB0_48; mul.wide.s32 %rd67, %r37, 4; add.s64 %rd68, %rd2, %rd67; ld.global.nc.f32 %f201, [%rd68]; add.f32 %f202, %f201, %f201; add.s64 %rd69, %rd1, %rd67; ld.global.nc.f32 %f203, [%rd69]; div.rn.f32 %f204, %f203, %f202; mul.f32 %f205, %f204, %f89; fma.rn.f32 %f206, %f2, %f205, %f1; mul.f32 %f207, %f1, %f205; sub.f32 %f208, %f2, %f207; selp.f32 %f209, %f206, %f278, %p52; selp.f32 %f210, %f208, %f279, %p52; selp.f32 %f211, %f3, %f280, %p52; mul.f32 %f212, %f89, %f89; div.rn.f32 %f213, %f202, 
%f212; sub.f32 %f214, %f209, %f1; sub.f32 %f215, %f210, %f2; sub.f32 %f216, %f211, %f3; fma.rn.f32 %f217, %f214, %f213, %f263; fma.rn.f32 %f218, %f215, %f213, %f264; fma.rn.f32 %f265, %f216, %f213, %f265; div.rn.f32 %f219, %f203, %f89; mul.f32 %f220, %f210, %f219; sub.f32 %f263, %f217, %f220; fma.rn.f32 %f264, %f209, %f219, %f218; BB0_48: add.s32 %r38, %r3, 1; @%p48 bra BB0_50; rem.s32 %r110, %r38, %r45; add.s32 %r111, %r110, %r45; rem.s32 %r127, %r111, %r45; bra.uni BB0_51; BB0_50: add.s32 %r112, %r45, -1; min.s32 %r127, %r38, %r112; BB0_51: mad.lo.s32 %r113, %r127, %r44, %r2; mad.lo.s32 %r42, %r113, %r43, %r1; setp.ge.s32 %p59, %r38, %r45; mov.f32 %f284, 0f00000000; and.pred %p61, %p59, %p48; mov.f32 %f285, %f284; mov.f32 %f286, %f284; @%p61 bra BB0_53; mul.wide.s32 %rd70, %r42, 4; add.s64 %rd71, %rd6, %rd70; ld.global.nc.f32 %f286, [%rd71]; add.s64 %rd72, %rd5, %rd70; ld.global.nc.f32 %f285, [%rd72]; add.s64 %rd73, %rd4, %rd70; ld.global.nc.f32 %f284, [%rd73]; BB0_53: mul.f32 %f224, %f286, %f286; fma.rn.f32 %f225, %f285, %f285, %f224; fma.rn.f32 %f75, %f284, %f284, %f225; setp.eq.f32 %p62, %f75, 0f00000000; mov.u16 %rs42, %rs1; @%p62 bra BB0_55; cvt.s64.s32 %rd74, %r42; add.s64 %rd75, %rd3, %rd74; ld.global.nc.u8 %rs42, [%rd75]; BB0_55: setp.gt.u16 %p63, %rs42, %rs1; cvt.u32.u16 %r114, %rs42; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p63; selp.b32 %r117, %r115, %r7, %p63; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd76, %r121, 4; add.s64 %rd7, %rd2, %rd76; add.s64 %rd8, %rd1, %rd76; and.pred %p66, %p62, %p13; @%p66 bra BB0_57; ld.global.nc.f32 %f226, [%rd7]; add.f32 %f227, %f226, %f226; ld.global.nc.f32 %f228, [%rd8]; div.rn.f32 %f229, %f228, %f227; mul.f32 %f230, %f229, %f89; mul.f32 %f231, %f2, %f230; sub.f32 %f232, %f1, %f231; fma.rn.f32 %f233, %f1, %f230, %f2; selp.f32 %f234, %f3, %f284, %p62; selp.f32 %f235, %f233, %f285, %p62; selp.f32 %f236, %f232, %f286, %p62; 
mul.f32 %f237, %f89, %f89; div.rn.f32 %f238, %f227, %f237; sub.f32 %f239, %f236, %f1; sub.f32 %f240, %f235, %f2; sub.f32 %f241, %f234, %f3; fma.rn.f32 %f242, %f239, %f238, %f263; fma.rn.f32 %f243, %f240, %f238, %f264; fma.rn.f32 %f265, %f241, %f238, %f265; div.rn.f32 %f244, %f228, %f89; fma.rn.f32 %f263, %f235, %f244, %f242; mul.f32 %f245, %f236, %f244; sub.f32 %f264, %f243, %f245; BB0_57: setp.eq.s64 %p68, %rd12, 0; @%p68 bra BB0_59; cvta.to.global.u64 %rd77, %rd12; add.s64 %rd79, %rd77, %rd19; ld.global.nc.f32 %f246, [%rd79]; mul.f32 %f290, %f246, %f290; BB0_59: setp.eq.f32 %p69, %f290, 0f00000000; mov.f32 %f291, 0f00000000; @%p69 bra BB0_61; rcp.rn.f32 %f291, %f290; BB0_61: cvta.to.global.u64 %rd80, %rd11; cvta.to.global.u64 %rd81, %rd10; cvta.to.global.u64 %rd82, %rd9; add.s64 %rd84, %rd82, %rd19; ld.global.f32 %f248, [%rd84]; fma.rn.f32 %f249, %f263, %f291, %f248; st.global.f32 [%rd84], %f249; add.s64 %rd85, %rd81, %rd19; ld.global.f32 %f250, [%rd85]; fma.rn.f32 %f251, %f264, %f291, %f250; st.global.f32 [%rd85], %f251; add.s64 %rd86, %rd80, %rd19; ld.global.f32 %f252, [%rd86]; fma.rn.f32 %f253, %f265, %f291, %f252; st.global.f32 [%rd86], %f253; BB0_62: ret; } ` adddmibulk_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<70>; .reg .b16 %rs<43>; .reg .f32 %f<292>; .reg .b32 %r<128>; .reg 
.b64 %rd<87>; ld.param.u64 %rd9, [adddmibulk_param_0]; ld.param.u64 %rd10, [adddmibulk_param_1]; ld.param.u64 %rd11, [adddmibulk_param_2]; ld.param.u64 %rd13, [adddmibulk_param_3]; ld.param.u64 %rd14, [adddmibulk_param_4]; ld.param.u64 %rd15, [adddmibulk_param_5]; ld.param.u64 %rd12, [adddmibulk_param_6]; ld.param.f32 %f290, [adddmibulk_param_7]; ld.param.u64 %rd16, [adddmibulk_param_8]; ld.param.u64 %rd17, [adddmibulk_param_9]; ld.param.u64 %rd18, [adddmibulk_param_10]; ld.param.f32 %f87, [adddmibulk_param_11]; ld.param.f32 %f88, [adddmibulk_param_12]; ld.param.f32 %f89, [adddmibulk_param_13]; ld.param.u32 %r43, [adddmibulk_param_14]; ld.param.u32 %r44, [adddmibulk_param_15]; ld.param.u32 %r45, [adddmibulk_param_16]; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; cvta.to.global.u64 %rd1, %rd17; cvta.to.global.u64 %rd2, %rd16; cvta.to.global.u64 %rd3, %rd18; cvta.to.global.u64 %rd4, %rd15; cvta.to.global.u64 %rd5, %rd14; cvta.to.global.u64 %rd6, %rd13; mov.u32 %r46, %ntid.x; mov.u32 %r47, %ctaid.x; mov.u32 %r48, %tid.x; mad.lo.s32 %r1, %r46, %r47, %r48; mov.u32 %r49, %ntid.y; mov.u32 %r50, %ctaid.y; mov.u32 %r51, %tid.y; mad.lo.s32 %r2, %r49, %r50, %r51; mov.u32 %r52, %ntid.z; mov.u32 %r53, %ctaid.z; mov.u32 %r54, %tid.z; mad.lo.s32 %r3, %r52, %r53, %r54; setp.ge.s32 %p1, %r2, %r44; setp.ge.s32 %p2, %r1, %r43; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r45; or.pred %p5, %p3, %p4; @%p5 bra BB0_62; mul.lo.s32 %r4, %r3, %r44; add.s32 %r55, %r4, %r2; mul.lo.s32 %r5, %r55, %r43; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd19, %r6, 4; add.s64 %rd20, %rd6, %rd19; cvt.s64.s32 %rd21, %r6; add.s64 %rd22, %rd5, %rd19; add.s64 %rd23, %rd4, %rd19; add.s64 %rd24, %rd3, %rd21; ld.global.nc.u8 %rs1, [%rd24]; cvt.u32.u16 %r56, %rs1; and.b32 %r7, %r56, 255; ld.global.nc.f32 %f1, [%rd20]; ld.global.nc.f32 %f2, [%rd22]; mul.f32 %f90, %f2, %f2; fma.rn.f32 %f91, %f1, %f1, %f90; ld.global.nc.f32 %f3, [%rd23]; fma.rn.f32 %f92, %f3, %f3, %f91; setp.eq.f32 
%p6, %f92, 0f00000000; @%p6 bra BB0_62; and.b16 %rs2, %rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r57, %r8, %r43; add.s32 %r58, %r57, %r43; rem.s32 %r122, %r58, %r43; bra.uni BB0_5; BB0_4: mov.u32 %r59, 0; max.s32 %r122, %r8, %r59; BB0_5: add.s32 %r12, %r122, %r5; setp.lt.s32 %p9, %r8, 0; mov.f32 %f254, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f255, %f254; mov.f32 %f256, %f254; @%p10 bra BB0_7; mul.wide.s32 %rd25, %r12, 4; add.s64 %rd26, %rd6, %rd25; ld.global.nc.f32 %f254, [%rd26]; add.s64 %rd27, %rd5, %rd25; ld.global.nc.f32 %f255, [%rd27]; add.s64 %rd28, %rd4, %rd25; ld.global.nc.f32 %f256, [%rd28]; BB0_7: mul.f32 %f96, %f255, %f255; fma.rn.f32 %f97, %f254, %f254, %f96; fma.rn.f32 %f10, %f256, %f256, %f97; setp.eq.f32 %p11, %f10, 0f00000000; mov.u16 %rs37, %rs1; @%p11 bra BB0_9; cvt.s64.s32 %rd29, %r12; add.s64 %rd30, %rd3, %rd29; ld.global.nc.u8 %rs37, [%rd30]; BB0_9: setp.gt.u16 %p12, %rs37, %rs1; cvt.u32.u16 %r60, %rs37; and.b32 %r61, %r60, 255; selp.b32 %r62, %r7, %r61, %p12; selp.b32 %r63, %r61, %r7, %p12; add.s32 %r64, %r63, 1; mul.lo.s32 %r65, %r64, %r63; shr.u32 %r66, %r65, 1; add.s32 %r13, %r66, %r62; setp.ne.s16 %p13, %rs18, 0; mov.f32 %f263, 0f00000000; and.pred %p15, %p11, %p13; mov.f32 %f264, %f263; mov.f32 %f265, %f263; @%p15 bra BB0_11; mul.wide.s32 %rd31, %r13, 4; add.s64 %rd32, %rd2, %rd31; ld.global.nc.f32 %f101, [%rd32]; add.f32 %f102, %f101, %f101; add.s64 %rd33, %rd1, %rd31; ld.global.nc.f32 %f103, [%rd33]; div.rn.f32 %f104, %f103, %f102; mul.f32 %f105, %f104, %f87; fma.rn.f32 %f106, %f3, %f105, %f2; mul.f32 %f107, %f2, %f105; sub.f32 %f108, %f3, %f107; selp.f32 %f109, %f1, %f254, %p11; selp.f32 %f110, %f106, %f255, %p11; selp.f32 %f111, %f108, %f256, %p11; mul.f32 %f112, %f87, %f87; div.rn.f32 %f113, %f102, %f112; sub.f32 %f114, %f109, %f1; sub.f32 %f115, %f110, %f2; sub.f32 %f116, %f111, %f3; fma.rn.f32 %f263, %f114, %f113, 0f00000000; fma.rn.f32 %f117, %f115, %f113, 0f00000000; fma.rn.f32 %f118, 
%f116, %f113, 0f00000000; div.rn.f32 %f119, %f103, %f87; mul.f32 %f120, %f111, %f119; sub.f32 %f264, %f117, %f120; fma.rn.f32 %f265, %f110, %f119, %f118; BB0_11: add.s32 %r14, %r1, 1; @%p7 bra BB0_13; rem.s32 %r67, %r14, %r43; add.s32 %r68, %r67, %r43; rem.s32 %r123, %r68, %r43; bra.uni BB0_14; BB0_13: add.s32 %r69, %r43, -1; min.s32 %r123, %r14, %r69; BB0_14: add.s32 %r18, %r123, %r5; setp.ge.s32 %p18, %r14, %r43; mov.f32 %f260, 0f00000000; and.pred %p20, %p18, %p7; mov.f32 %f261, %f260; mov.f32 %f262, %f260; @%p20 bra BB0_16; mul.wide.s32 %rd34, %r18, 4; add.s64 %rd35, %rd6, %rd34; ld.global.nc.f32 %f260, [%rd35]; add.s64 %rd36, %rd5, %rd34; ld.global.nc.f32 %f261, [%rd36]; add.s64 %rd37, %rd4, %rd34; ld.global.nc.f32 %f262, [%rd37]; BB0_16: mul.f32 %f124, %f261, %f261; fma.rn.f32 %f125, %f260, %f260, %f124; fma.rn.f32 %f23, %f262, %f262, %f125; setp.eq.f32 %p21, %f23, 0f00000000; mov.u16 %rs38, %rs1; @%p21 bra BB0_18; cvt.s64.s32 %rd38, %r18; add.s64 %rd39, %rd3, %rd38; ld.global.nc.u8 %rs38, [%rd39]; BB0_18: setp.gt.u16 %p22, %rs38, %rs1; cvt.u32.u16 %r70, %rs38; and.b32 %r71, %r70, 255; selp.b32 %r72, %r7, %r71, %p22; selp.b32 %r73, %r71, %r7, %p22; add.s32 %r74, %r73, 1; mul.lo.s32 %r75, %r74, %r73; shr.u32 %r76, %r75, 1; add.s32 %r19, %r76, %r72; and.pred %p25, %p21, %p13; @%p25 bra BB0_20; mul.wide.s32 %rd40, %r19, 4; add.s64 %rd41, %rd2, %rd40; ld.global.nc.f32 %f126, [%rd41]; add.f32 %f127, %f126, %f126; add.s64 %rd42, %rd1, %rd40; ld.global.nc.f32 %f128, [%rd42]; div.rn.f32 %f129, %f128, %f127; mul.f32 %f130, %f129, %f87; mul.f32 %f131, %f3, %f130; sub.f32 %f132, %f2, %f131; fma.rn.f32 %f133, %f2, %f130, %f3; selp.f32 %f134, %f1, %f260, %p21; selp.f32 %f135, %f132, %f261, %p21; selp.f32 %f136, %f133, %f262, %p21; mul.f32 %f137, %f87, %f87; div.rn.f32 %f138, %f127, %f137; sub.f32 %f139, %f134, %f1; sub.f32 %f140, %f135, %f2; sub.f32 %f141, %f136, %f3; fma.rn.f32 %f263, %f139, %f138, %f263; fma.rn.f32 %f142, %f140, %f138, %f264; fma.rn.f32 %f143, %f141, 
%f138, %f265; div.rn.f32 %f144, %f128, %f87; fma.rn.f32 %f264, %f136, %f144, %f142; mul.f32 %f145, %f135, %f144; sub.f32 %f265, %f143, %f145; BB0_20: and.b16 %rs7, %rs17, 2; setp.eq.s16 %p27, %rs7, 0; add.s32 %r20, %r2, -1; @%p27 bra BB0_22; rem.s32 %r77, %r20, %r44; add.s32 %r78, %r77, %r44; rem.s32 %r124, %r78, %r44; bra.uni BB0_23; BB0_22: mov.u32 %r79, 0; max.s32 %r124, %r20, %r79; BB0_23: add.s32 %r80, %r124, %r4; mad.lo.s32 %r24, %r80, %r43, %r1; setp.lt.s32 %p29, %r20, 0; mov.f32 %f266, 0f00000000; and.pred %p30, %p29, %p27; mov.f32 %f267, %f266; mov.f32 %f268, %f266; @%p30 bra BB0_25; mul.wide.s32 %rd43, %r24, 4; add.s64 %rd44, %rd6, %rd43; ld.global.nc.f32 %f266, [%rd44]; add.s64 %rd45, %rd5, %rd43; ld.global.nc.f32 %f267, [%rd45]; add.s64 %rd46, %rd4, %rd43; ld.global.nc.f32 %f268, [%rd46]; BB0_25: mul.f32 %f149, %f267, %f267; fma.rn.f32 %f150, %f266, %f266, %f149; fma.rn.f32 %f36, %f268, %f268, %f150; setp.eq.f32 %p31, %f36, 0f00000000; mov.u16 %rs39, %rs1; @%p31 bra BB0_27; cvt.s64.s32 %rd47, %r24; add.s64 %rd48, %rd3, %rd47; ld.global.nc.u8 %rs39, [%rd48]; BB0_27: setp.gt.u16 %p32, %rs39, %rs1; cvt.u32.u16 %r81, %rs39; and.b32 %r82, %r81, 255; selp.b32 %r83, %r7, %r82, %p32; selp.b32 %r84, %r82, %r7, %p32; add.s32 %r85, %r84, 1; mul.lo.s32 %r86, %r85, %r84; shr.u32 %r87, %r86, 1; add.s32 %r25, %r87, %r83; and.pred %p35, %p31, %p13; @%p35 bra BB0_29; mul.wide.s32 %rd49, %r25, 4; add.s64 %rd50, %rd2, %rd49; ld.global.nc.f32 %f151, [%rd50]; add.f32 %f152, %f151, %f151; add.s64 %rd51, %rd1, %rd49; ld.global.nc.f32 %f153, [%rd51]; div.rn.f32 %f154, %f153, %f152; mul.f32 %f155, %f154, %f88; mul.f32 %f156, %f3, %f155; sub.f32 %f157, %f1, %f156; fma.rn.f32 %f158, %f1, %f155, %f3; selp.f32 %f159, %f157, %f266, %p31; selp.f32 %f160, %f2, %f267, %p31; selp.f32 %f161, %f158, %f268, %p31; mul.f32 %f162, %f88, %f88; div.rn.f32 %f163, %f152, %f162; sub.f32 %f164, %f159, %f1; sub.f32 %f165, %f160, %f2; sub.f32 %f166, %f161, %f3; fma.rn.f32 %f167, %f164, %f163, %f263; 
fma.rn.f32 %f264, %f165, %f163, %f264; fma.rn.f32 %f168, %f166, %f163, %f265; div.rn.f32 %f169, %f153, %f88; fma.rn.f32 %f263, %f161, %f169, %f167; mul.f32 %f170, %f159, %f169; sub.f32 %f265, %f168, %f170; BB0_29: add.s32 %r26, %r2, 1; @%p27 bra BB0_31; rem.s32 %r88, %r26, %r44; add.s32 %r89, %r88, %r44; rem.s32 %r125, %r89, %r44; bra.uni BB0_32; BB0_31: add.s32 %r90, %r44, -1; min.s32 %r125, %r26, %r90; BB0_32: add.s32 %r91, %r125, %r4; mad.lo.s32 %r30, %r91, %r43, %r1; setp.ge.s32 %p38, %r26, %r44; mov.f32 %f272, 0f00000000; and.pred %p40, %p38, %p27; mov.f32 %f273, %f272; mov.f32 %f274, %f272; @%p40 bra BB0_34; mul.wide.s32 %rd52, %r30, 4; add.s64 %rd53, %rd6, %rd52; ld.global.nc.f32 %f272, [%rd53]; add.s64 %rd54, %rd5, %rd52; ld.global.nc.f32 %f273, [%rd54]; add.s64 %rd55, %rd4, %rd52; ld.global.nc.f32 %f274, [%rd55]; BB0_34: mul.f32 %f174, %f273, %f273; fma.rn.f32 %f175, %f272, %f272, %f174; fma.rn.f32 %f49, %f274, %f274, %f175; setp.eq.f32 %p41, %f49, 0f00000000; mov.u16 %rs40, %rs1; @%p41 bra BB0_36; cvt.s64.s32 %rd56, %r30; add.s64 %rd57, %rd3, %rd56; ld.global.nc.u8 %rs40, [%rd57]; BB0_36: setp.gt.u16 %p42, %rs40, %rs1; cvt.u32.u16 %r92, %rs40; and.b32 %r93, %r92, 255; selp.b32 %r94, %r7, %r93, %p42; selp.b32 %r95, %r93, %r7, %p42; add.s32 %r96, %r95, 1; mul.lo.s32 %r97, %r96, %r95; shr.u32 %r98, %r97, 1; add.s32 %r31, %r98, %r94; and.pred %p45, %p41, %p13; @%p45 bra BB0_38; mul.wide.s32 %rd58, %r31, 4; add.s64 %rd59, %rd2, %rd58; ld.global.nc.f32 %f176, [%rd59]; add.f32 %f177, %f176, %f176; add.s64 %rd60, %rd1, %rd58; ld.global.nc.f32 %f178, [%rd60]; div.rn.f32 %f179, %f178, %f177; mul.f32 %f180, %f179, %f88; fma.rn.f32 %f181, %f3, %f180, %f1; mul.f32 %f182, %f1, %f180; sub.f32 %f183, %f3, %f182; selp.f32 %f184, %f181, %f272, %p41; selp.f32 %f185, %f2, %f273, %p41; selp.f32 %f186, %f183, %f274, %p41; mul.f32 %f187, %f88, %f88; div.rn.f32 %f188, %f177, %f187; sub.f32 %f189, %f184, %f1; sub.f32 %f190, %f185, %f2; sub.f32 %f191, %f186, %f3; fma.rn.f32 %f192, 
%f189, %f188, %f263; fma.rn.f32 %f264, %f190, %f188, %f264; fma.rn.f32 %f193, %f191, %f188, %f265; div.rn.f32 %f194, %f178, %f88; mul.f32 %f195, %f186, %f194; sub.f32 %f263, %f192, %f195; fma.rn.f32 %f265, %f184, %f194, %f193; BB0_38: setp.eq.s32 %p47, %r45, 1; @%p47 bra BB0_57; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p48, %rs12, 0; add.s32 %r32, %r3, -1; @%p48 bra BB0_41; rem.s32 %r99, %r32, %r45; add.s32 %r100, %r99, %r45; rem.s32 %r126, %r100, %r45; bra.uni BB0_42; BB0_41: mov.u32 %r101, 0; max.s32 %r126, %r32, %r101; BB0_42: mad.lo.s32 %r102, %r126, %r44, %r2; mad.lo.s32 %r36, %r102, %r43, %r1; setp.lt.s32 %p50, %r32, 0; mov.f32 %f278, 0f00000000; and.pred %p51, %p50, %p48; mov.f32 %f279, %f278; mov.f32 %f280, %f278; @%p51 bra BB0_44; mul.wide.s32 %rd61, %r36, 4; add.s64 %rd62, %rd6, %rd61; ld.global.nc.f32 %f278, [%rd62]; add.s64 %rd63, %rd5, %rd61; ld.global.nc.f32 %f279, [%rd63]; add.s64 %rd64, %rd4, %rd61; ld.global.nc.f32 %f280, [%rd64]; BB0_44: mul.f32 %f199, %f279, %f279; fma.rn.f32 %f200, %f278, %f278, %f199; fma.rn.f32 %f62, %f280, %f280, %f200; setp.eq.f32 %p52, %f62, 0f00000000; mov.u16 %rs41, %rs1; @%p52 bra BB0_46; cvt.s64.s32 %rd65, %r36; add.s64 %rd66, %rd3, %rd65; ld.global.nc.u8 %rs41, [%rd66]; BB0_46: setp.gt.u16 %p53, %rs41, %rs1; cvt.u32.u16 %r103, %rs41; and.b32 %r104, %r103, 255; selp.b32 %r105, %r7, %r104, %p53; selp.b32 %r106, %r104, %r7, %p53; add.s32 %r107, %r106, 1; mul.lo.s32 %r108, %r107, %r106; shr.u32 %r109, %r108, 1; add.s32 %r37, %r109, %r105; and.pred %p56, %p52, %p13; @%p56 bra BB0_48; mul.wide.s32 %rd67, %r37, 4; add.s64 %rd68, %rd2, %rd67; ld.global.nc.f32 %f201, [%rd68]; add.f32 %f202, %f201, %f201; add.s64 %rd69, %rd1, %rd67; ld.global.nc.f32 %f203, [%rd69]; div.rn.f32 %f204, %f203, %f202; mul.f32 %f205, %f204, %f89; fma.rn.f32 %f206, %f2, %f205, %f1; mul.f32 %f207, %f1, %f205; sub.f32 %f208, %f2, %f207; selp.f32 %f209, %f206, %f278, %p52; selp.f32 %f210, %f208, %f279, %p52; selp.f32 %f211, %f3, %f280, %p52; mul.f32 %f212, 
%f89, %f89; div.rn.f32 %f213, %f202, %f212; sub.f32 %f214, %f209, %f1; sub.f32 %f215, %f210, %f2; sub.f32 %f216, %f211, %f3; fma.rn.f32 %f217, %f214, %f213, %f263; fma.rn.f32 %f218, %f215, %f213, %f264; fma.rn.f32 %f265, %f216, %f213, %f265; div.rn.f32 %f219, %f203, %f89; mul.f32 %f220, %f210, %f219; sub.f32 %f263, %f217, %f220; fma.rn.f32 %f264, %f209, %f219, %f218; BB0_48: add.s32 %r38, %r3, 1; @%p48 bra BB0_50; rem.s32 %r110, %r38, %r45; add.s32 %r111, %r110, %r45; rem.s32 %r127, %r111, %r45; bra.uni BB0_51; BB0_50: add.s32 %r112, %r45, -1; min.s32 %r127, %r38, %r112; BB0_51: mad.lo.s32 %r113, %r127, %r44, %r2; mad.lo.s32 %r42, %r113, %r43, %r1; setp.ge.s32 %p59, %r38, %r45; mov.f32 %f284, 0f00000000; and.pred %p61, %p59, %p48; mov.f32 %f285, %f284; mov.f32 %f286, %f284; @%p61 bra BB0_53; mul.wide.s32 %rd70, %r42, 4; add.s64 %rd71, %rd6, %rd70; ld.global.nc.f32 %f286, [%rd71]; add.s64 %rd72, %rd5, %rd70; ld.global.nc.f32 %f285, [%rd72]; add.s64 %rd73, %rd4, %rd70; ld.global.nc.f32 %f284, [%rd73]; BB0_53: mul.f32 %f224, %f286, %f286; fma.rn.f32 %f225, %f285, %f285, %f224; fma.rn.f32 %f75, %f284, %f284, %f225; setp.eq.f32 %p62, %f75, 0f00000000; mov.u16 %rs42, %rs1; @%p62 bra BB0_55; cvt.s64.s32 %rd74, %r42; add.s64 %rd75, %rd3, %rd74; ld.global.nc.u8 %rs42, [%rd75]; BB0_55: setp.gt.u16 %p63, %rs42, %rs1; cvt.u32.u16 %r114, %rs42; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p63; selp.b32 %r117, %r115, %r7, %p63; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd76, %r121, 4; add.s64 %rd7, %rd2, %rd76; add.s64 %rd8, %rd1, %rd76; and.pred %p66, %p62, %p13; @%p66 bra BB0_57; ld.global.nc.f32 %f226, [%rd7]; add.f32 %f227, %f226, %f226; ld.global.nc.f32 %f228, [%rd8]; div.rn.f32 %f229, %f228, %f227; mul.f32 %f230, %f229, %f89; mul.f32 %f231, %f2, %f230; sub.f32 %f232, %f1, %f231; fma.rn.f32 %f233, %f1, %f230, %f2; selp.f32 %f234, %f3, %f284, %p62; selp.f32 %f235, %f233, %f285, 
%p62; selp.f32 %f236, %f232, %f286, %p62; mul.f32 %f237, %f89, %f89; div.rn.f32 %f238, %f227, %f237; sub.f32 %f239, %f236, %f1; sub.f32 %f240, %f235, %f2; sub.f32 %f241, %f234, %f3; fma.rn.f32 %f242, %f239, %f238, %f263; fma.rn.f32 %f243, %f240, %f238, %f264; fma.rn.f32 %f265, %f241, %f238, %f265; div.rn.f32 %f244, %f228, %f89; fma.rn.f32 %f263, %f235, %f244, %f242; mul.f32 %f245, %f236, %f244; sub.f32 %f264, %f243, %f245; BB0_57: setp.eq.s64 %p68, %rd12, 0; @%p68 bra BB0_59; cvta.to.global.u64 %rd77, %rd12; add.s64 %rd79, %rd77, %rd19; ld.global.nc.f32 %f246, [%rd79]; mul.f32 %f290, %f246, %f290; BB0_59: setp.eq.f32 %p69, %f290, 0f00000000; mov.f32 %f291, 0f00000000; @%p69 bra BB0_61; rcp.rn.f32 %f291, %f290; BB0_61: cvta.to.global.u64 %rd80, %rd11; cvta.to.global.u64 %rd81, %rd10; cvta.to.global.u64 %rd82, %rd9; add.s64 %rd84, %rd82, %rd19; ld.global.f32 %f248, [%rd84]; fma.rn.f32 %f249, %f263, %f291, %f248; st.global.f32 [%rd84], %f249; add.s64 %rd85, %rd81, %rd19; ld.global.f32 %f250, [%rd85]; fma.rn.f32 %f251, %f264, %f291, %f250; st.global.f32 [%rd85], %f251; add.s64 %rd86, %rd80, %rd19; ld.global.f32 %f252, [%rd86]; fma.rn.f32 %f253, %f265, %f291, %f252; st.global.f32 [%rd86], %f253; BB0_62: ret; } ` adddmibulk_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<70>; .reg .b16 %rs<43>; 
.reg .f32 %f<292>; .reg .b32 %r<128>; .reg .b64 %rd<87>; ld.param.u64 %rd9, [adddmibulk_param_0]; ld.param.u64 %rd10, [adddmibulk_param_1]; ld.param.u64 %rd11, [adddmibulk_param_2]; ld.param.u64 %rd13, [adddmibulk_param_3]; ld.param.u64 %rd14, [adddmibulk_param_4]; ld.param.u64 %rd15, [adddmibulk_param_5]; ld.param.u64 %rd12, [adddmibulk_param_6]; ld.param.f32 %f290, [adddmibulk_param_7]; ld.param.u64 %rd16, [adddmibulk_param_8]; ld.param.u64 %rd17, [adddmibulk_param_9]; ld.param.u64 %rd18, [adddmibulk_param_10]; ld.param.f32 %f87, [adddmibulk_param_11]; ld.param.f32 %f88, [adddmibulk_param_12]; ld.param.f32 %f89, [adddmibulk_param_13]; ld.param.u32 %r43, [adddmibulk_param_14]; ld.param.u32 %r44, [adddmibulk_param_15]; ld.param.u32 %r45, [adddmibulk_param_16]; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; cvta.to.global.u64 %rd1, %rd17; cvta.to.global.u64 %rd2, %rd16; cvta.to.global.u64 %rd3, %rd18; cvta.to.global.u64 %rd4, %rd15; cvta.to.global.u64 %rd5, %rd14; cvta.to.global.u64 %rd6, %rd13; mov.u32 %r46, %ntid.x; mov.u32 %r47, %ctaid.x; mov.u32 %r48, %tid.x; mad.lo.s32 %r1, %r46, %r47, %r48; mov.u32 %r49, %ntid.y; mov.u32 %r50, %ctaid.y; mov.u32 %r51, %tid.y; mad.lo.s32 %r2, %r49, %r50, %r51; mov.u32 %r52, %ntid.z; mov.u32 %r53, %ctaid.z; mov.u32 %r54, %tid.z; mad.lo.s32 %r3, %r52, %r53, %r54; setp.ge.s32 %p1, %r2, %r44; setp.ge.s32 %p2, %r1, %r43; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r45; or.pred %p5, %p3, %p4; @%p5 bra BB0_62; mul.lo.s32 %r4, %r3, %r44; add.s32 %r55, %r4, %r2; mul.lo.s32 %r5, %r55, %r43; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd19, %r6, 4; add.s64 %rd20, %rd6, %rd19; cvt.s64.s32 %rd21, %r6; add.s64 %rd22, %rd5, %rd19; add.s64 %rd23, %rd4, %rd19; add.s64 %rd24, %rd3, %rd21; ld.global.nc.u8 %rs1, [%rd24]; cvt.u32.u16 %r56, %rs1; and.b32 %r7, %r56, 255; ld.global.nc.f32 %f1, [%rd20]; ld.global.nc.f32 %f2, [%rd22]; mul.f32 %f90, %f2, %f2; fma.rn.f32 %f91, %f1, %f1, %f90; ld.global.nc.f32 %f3, [%rd23]; 
fma.rn.f32 %f92, %f3, %f3, %f91; setp.eq.f32 %p6, %f92, 0f00000000; @%p6 bra BB0_62; and.b16 %rs2, %rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r57, %r8, %r43; add.s32 %r58, %r57, %r43; rem.s32 %r122, %r58, %r43; bra.uni BB0_5; BB0_4: mov.u32 %r59, 0; max.s32 %r122, %r8, %r59; BB0_5: add.s32 %r12, %r122, %r5; setp.lt.s32 %p9, %r8, 0; mov.f32 %f254, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f255, %f254; mov.f32 %f256, %f254; @%p10 bra BB0_7; mul.wide.s32 %rd25, %r12, 4; add.s64 %rd26, %rd6, %rd25; ld.global.nc.f32 %f254, [%rd26]; add.s64 %rd27, %rd5, %rd25; ld.global.nc.f32 %f255, [%rd27]; add.s64 %rd28, %rd4, %rd25; ld.global.nc.f32 %f256, [%rd28]; BB0_7: mul.f32 %f96, %f255, %f255; fma.rn.f32 %f97, %f254, %f254, %f96; fma.rn.f32 %f10, %f256, %f256, %f97; setp.eq.f32 %p11, %f10, 0f00000000; mov.u16 %rs37, %rs1; @%p11 bra BB0_9; cvt.s64.s32 %rd29, %r12; add.s64 %rd30, %rd3, %rd29; ld.global.nc.u8 %rs37, [%rd30]; BB0_9: setp.gt.u16 %p12, %rs37, %rs1; cvt.u32.u16 %r60, %rs37; and.b32 %r61, %r60, 255; selp.b32 %r62, %r7, %r61, %p12; selp.b32 %r63, %r61, %r7, %p12; add.s32 %r64, %r63, 1; mul.lo.s32 %r65, %r64, %r63; shr.u32 %r66, %r65, 1; add.s32 %r13, %r66, %r62; setp.ne.s16 %p13, %rs18, 0; mov.f32 %f263, 0f00000000; and.pred %p15, %p11, %p13; mov.f32 %f264, %f263; mov.f32 %f265, %f263; @%p15 bra BB0_11; mul.wide.s32 %rd31, %r13, 4; add.s64 %rd32, %rd2, %rd31; ld.global.nc.f32 %f101, [%rd32]; add.f32 %f102, %f101, %f101; add.s64 %rd33, %rd1, %rd31; ld.global.nc.f32 %f103, [%rd33]; div.rn.f32 %f104, %f103, %f102; mul.f32 %f105, %f104, %f87; fma.rn.f32 %f106, %f3, %f105, %f2; mul.f32 %f107, %f2, %f105; sub.f32 %f108, %f3, %f107; selp.f32 %f109, %f1, %f254, %p11; selp.f32 %f110, %f106, %f255, %p11; selp.f32 %f111, %f108, %f256, %p11; mul.f32 %f112, %f87, %f87; div.rn.f32 %f113, %f102, %f112; sub.f32 %f114, %f109, %f1; sub.f32 %f115, %f110, %f2; sub.f32 %f116, %f111, %f3; fma.rn.f32 %f263, %f114, %f113, 0f00000000; fma.rn.f32 
%f117, %f115, %f113, 0f00000000; fma.rn.f32 %f118, %f116, %f113, 0f00000000; div.rn.f32 %f119, %f103, %f87; mul.f32 %f120, %f111, %f119; sub.f32 %f264, %f117, %f120; fma.rn.f32 %f265, %f110, %f119, %f118; BB0_11: add.s32 %r14, %r1, 1; @%p7 bra BB0_13; rem.s32 %r67, %r14, %r43; add.s32 %r68, %r67, %r43; rem.s32 %r123, %r68, %r43; bra.uni BB0_14; BB0_13: add.s32 %r69, %r43, -1; min.s32 %r123, %r14, %r69; BB0_14: add.s32 %r18, %r123, %r5; setp.ge.s32 %p18, %r14, %r43; mov.f32 %f260, 0f00000000; and.pred %p20, %p18, %p7; mov.f32 %f261, %f260; mov.f32 %f262, %f260; @%p20 bra BB0_16; mul.wide.s32 %rd34, %r18, 4; add.s64 %rd35, %rd6, %rd34; ld.global.nc.f32 %f260, [%rd35]; add.s64 %rd36, %rd5, %rd34; ld.global.nc.f32 %f261, [%rd36]; add.s64 %rd37, %rd4, %rd34; ld.global.nc.f32 %f262, [%rd37]; BB0_16: mul.f32 %f124, %f261, %f261; fma.rn.f32 %f125, %f260, %f260, %f124; fma.rn.f32 %f23, %f262, %f262, %f125; setp.eq.f32 %p21, %f23, 0f00000000; mov.u16 %rs38, %rs1; @%p21 bra BB0_18; cvt.s64.s32 %rd38, %r18; add.s64 %rd39, %rd3, %rd38; ld.global.nc.u8 %rs38, [%rd39]; BB0_18: setp.gt.u16 %p22, %rs38, %rs1; cvt.u32.u16 %r70, %rs38; and.b32 %r71, %r70, 255; selp.b32 %r72, %r7, %r71, %p22; selp.b32 %r73, %r71, %r7, %p22; add.s32 %r74, %r73, 1; mul.lo.s32 %r75, %r74, %r73; shr.u32 %r76, %r75, 1; add.s32 %r19, %r76, %r72; and.pred %p25, %p21, %p13; @%p25 bra BB0_20; mul.wide.s32 %rd40, %r19, 4; add.s64 %rd41, %rd2, %rd40; ld.global.nc.f32 %f126, [%rd41]; add.f32 %f127, %f126, %f126; add.s64 %rd42, %rd1, %rd40; ld.global.nc.f32 %f128, [%rd42]; div.rn.f32 %f129, %f128, %f127; mul.f32 %f130, %f129, %f87; mul.f32 %f131, %f3, %f130; sub.f32 %f132, %f2, %f131; fma.rn.f32 %f133, %f2, %f130, %f3; selp.f32 %f134, %f1, %f260, %p21; selp.f32 %f135, %f132, %f261, %p21; selp.f32 %f136, %f133, %f262, %p21; mul.f32 %f137, %f87, %f87; div.rn.f32 %f138, %f127, %f137; sub.f32 %f139, %f134, %f1; sub.f32 %f140, %f135, %f2; sub.f32 %f141, %f136, %f3; fma.rn.f32 %f263, %f139, %f138, %f263; fma.rn.f32 
%f142, %f140, %f138, %f264; fma.rn.f32 %f143, %f141, %f138, %f265; div.rn.f32 %f144, %f128, %f87; fma.rn.f32 %f264, %f136, %f144, %f142; mul.f32 %f145, %f135, %f144; sub.f32 %f265, %f143, %f145; BB0_20: and.b16 %rs7, %rs17, 2; setp.eq.s16 %p27, %rs7, 0; add.s32 %r20, %r2, -1; @%p27 bra BB0_22; rem.s32 %r77, %r20, %r44; add.s32 %r78, %r77, %r44; rem.s32 %r124, %r78, %r44; bra.uni BB0_23; BB0_22: mov.u32 %r79, 0; max.s32 %r124, %r20, %r79; BB0_23: add.s32 %r80, %r124, %r4; mad.lo.s32 %r24, %r80, %r43, %r1; setp.lt.s32 %p29, %r20, 0; mov.f32 %f266, 0f00000000; and.pred %p30, %p29, %p27; mov.f32 %f267, %f266; mov.f32 %f268, %f266; @%p30 bra BB0_25; mul.wide.s32 %rd43, %r24, 4; add.s64 %rd44, %rd6, %rd43; ld.global.nc.f32 %f266, [%rd44]; add.s64 %rd45, %rd5, %rd43; ld.global.nc.f32 %f267, [%rd45]; add.s64 %rd46, %rd4, %rd43; ld.global.nc.f32 %f268, [%rd46]; BB0_25: mul.f32 %f149, %f267, %f267; fma.rn.f32 %f150, %f266, %f266, %f149; fma.rn.f32 %f36, %f268, %f268, %f150; setp.eq.f32 %p31, %f36, 0f00000000; mov.u16 %rs39, %rs1; @%p31 bra BB0_27; cvt.s64.s32 %rd47, %r24; add.s64 %rd48, %rd3, %rd47; ld.global.nc.u8 %rs39, [%rd48]; BB0_27: setp.gt.u16 %p32, %rs39, %rs1; cvt.u32.u16 %r81, %rs39; and.b32 %r82, %r81, 255; selp.b32 %r83, %r7, %r82, %p32; selp.b32 %r84, %r82, %r7, %p32; add.s32 %r85, %r84, 1; mul.lo.s32 %r86, %r85, %r84; shr.u32 %r87, %r86, 1; add.s32 %r25, %r87, %r83; and.pred %p35, %p31, %p13; @%p35 bra BB0_29; mul.wide.s32 %rd49, %r25, 4; add.s64 %rd50, %rd2, %rd49; ld.global.nc.f32 %f151, [%rd50]; add.f32 %f152, %f151, %f151; add.s64 %rd51, %rd1, %rd49; ld.global.nc.f32 %f153, [%rd51]; div.rn.f32 %f154, %f153, %f152; mul.f32 %f155, %f154, %f88; mul.f32 %f156, %f3, %f155; sub.f32 %f157, %f1, %f156; fma.rn.f32 %f158, %f1, %f155, %f3; selp.f32 %f159, %f157, %f266, %p31; selp.f32 %f160, %f2, %f267, %p31; selp.f32 %f161, %f158, %f268, %p31; mul.f32 %f162, %f88, %f88; div.rn.f32 %f163, %f152, %f162; sub.f32 %f164, %f159, %f1; sub.f32 %f165, %f160, %f2; sub.f32 
%f166, %f161, %f3; fma.rn.f32 %f167, %f164, %f163, %f263; fma.rn.f32 %f264, %f165, %f163, %f264; fma.rn.f32 %f168, %f166, %f163, %f265; div.rn.f32 %f169, %f153, %f88; fma.rn.f32 %f263, %f161, %f169, %f167; mul.f32 %f170, %f159, %f169; sub.f32 %f265, %f168, %f170; BB0_29: add.s32 %r26, %r2, 1; @%p27 bra BB0_31; rem.s32 %r88, %r26, %r44; add.s32 %r89, %r88, %r44; rem.s32 %r125, %r89, %r44; bra.uni BB0_32; BB0_31: add.s32 %r90, %r44, -1; min.s32 %r125, %r26, %r90; BB0_32: add.s32 %r91, %r125, %r4; mad.lo.s32 %r30, %r91, %r43, %r1; setp.ge.s32 %p38, %r26, %r44; mov.f32 %f272, 0f00000000; and.pred %p40, %p38, %p27; mov.f32 %f273, %f272; mov.f32 %f274, %f272; @%p40 bra BB0_34; mul.wide.s32 %rd52, %r30, 4; add.s64 %rd53, %rd6, %rd52; ld.global.nc.f32 %f272, [%rd53]; add.s64 %rd54, %rd5, %rd52; ld.global.nc.f32 %f273, [%rd54]; add.s64 %rd55, %rd4, %rd52; ld.global.nc.f32 %f274, [%rd55]; BB0_34: mul.f32 %f174, %f273, %f273; fma.rn.f32 %f175, %f272, %f272, %f174; fma.rn.f32 %f49, %f274, %f274, %f175; setp.eq.f32 %p41, %f49, 0f00000000; mov.u16 %rs40, %rs1; @%p41 bra BB0_36; cvt.s64.s32 %rd56, %r30; add.s64 %rd57, %rd3, %rd56; ld.global.nc.u8 %rs40, [%rd57]; BB0_36: setp.gt.u16 %p42, %rs40, %rs1; cvt.u32.u16 %r92, %rs40; and.b32 %r93, %r92, 255; selp.b32 %r94, %r7, %r93, %p42; selp.b32 %r95, %r93, %r7, %p42; add.s32 %r96, %r95, 1; mul.lo.s32 %r97, %r96, %r95; shr.u32 %r98, %r97, 1; add.s32 %r31, %r98, %r94; and.pred %p45, %p41, %p13; @%p45 bra BB0_38; mul.wide.s32 %rd58, %r31, 4; add.s64 %rd59, %rd2, %rd58; ld.global.nc.f32 %f176, [%rd59]; add.f32 %f177, %f176, %f176; add.s64 %rd60, %rd1, %rd58; ld.global.nc.f32 %f178, [%rd60]; div.rn.f32 %f179, %f178, %f177; mul.f32 %f180, %f179, %f88; fma.rn.f32 %f181, %f3, %f180, %f1; mul.f32 %f182, %f1, %f180; sub.f32 %f183, %f3, %f182; selp.f32 %f184, %f181, %f272, %p41; selp.f32 %f185, %f2, %f273, %p41; selp.f32 %f186, %f183, %f274, %p41; mul.f32 %f187, %f88, %f88; div.rn.f32 %f188, %f177, %f187; sub.f32 %f189, %f184, %f1; sub.f32 
%f190, %f185, %f2; sub.f32 %f191, %f186, %f3; fma.rn.f32 %f192, %f189, %f188, %f263; fma.rn.f32 %f264, %f190, %f188, %f264; fma.rn.f32 %f193, %f191, %f188, %f265; div.rn.f32 %f194, %f178, %f88; mul.f32 %f195, %f186, %f194; sub.f32 %f263, %f192, %f195; fma.rn.f32 %f265, %f184, %f194, %f193; BB0_38: setp.eq.s32 %p47, %r45, 1; @%p47 bra BB0_57; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p48, %rs12, 0; add.s32 %r32, %r3, -1; @%p48 bra BB0_41; rem.s32 %r99, %r32, %r45; add.s32 %r100, %r99, %r45; rem.s32 %r126, %r100, %r45; bra.uni BB0_42; BB0_41: mov.u32 %r101, 0; max.s32 %r126, %r32, %r101; BB0_42: mad.lo.s32 %r102, %r126, %r44, %r2; mad.lo.s32 %r36, %r102, %r43, %r1; setp.lt.s32 %p50, %r32, 0; mov.f32 %f278, 0f00000000; and.pred %p51, %p50, %p48; mov.f32 %f279, %f278; mov.f32 %f280, %f278; @%p51 bra BB0_44; mul.wide.s32 %rd61, %r36, 4; add.s64 %rd62, %rd6, %rd61; ld.global.nc.f32 %f278, [%rd62]; add.s64 %rd63, %rd5, %rd61; ld.global.nc.f32 %f279, [%rd63]; add.s64 %rd64, %rd4, %rd61; ld.global.nc.f32 %f280, [%rd64]; BB0_44: mul.f32 %f199, %f279, %f279; fma.rn.f32 %f200, %f278, %f278, %f199; fma.rn.f32 %f62, %f280, %f280, %f200; setp.eq.f32 %p52, %f62, 0f00000000; mov.u16 %rs41, %rs1; @%p52 bra BB0_46; cvt.s64.s32 %rd65, %r36; add.s64 %rd66, %rd3, %rd65; ld.global.nc.u8 %rs41, [%rd66]; BB0_46: setp.gt.u16 %p53, %rs41, %rs1; cvt.u32.u16 %r103, %rs41; and.b32 %r104, %r103, 255; selp.b32 %r105, %r7, %r104, %p53; selp.b32 %r106, %r104, %r7, %p53; add.s32 %r107, %r106, 1; mul.lo.s32 %r108, %r107, %r106; shr.u32 %r109, %r108, 1; add.s32 %r37, %r109, %r105; and.pred %p56, %p52, %p13; @%p56 bra BB0_48; mul.wide.s32 %rd67, %r37, 4; add.s64 %rd68, %rd2, %rd67; ld.global.nc.f32 %f201, [%rd68]; add.f32 %f202, %f201, %f201; add.s64 %rd69, %rd1, %rd67; ld.global.nc.f32 %f203, [%rd69]; div.rn.f32 %f204, %f203, %f202; mul.f32 %f205, %f204, %f89; fma.rn.f32 %f206, %f2, %f205, %f1; mul.f32 %f207, %f1, %f205; sub.f32 %f208, %f2, %f207; selp.f32 %f209, %f206, %f278, %p52; selp.f32 %f210, %f208, 
%f279, %p52; selp.f32 %f211, %f3, %f280, %p52; mul.f32 %f212, %f89, %f89; div.rn.f32 %f213, %f202, %f212; sub.f32 %f214, %f209, %f1; sub.f32 %f215, %f210, %f2; sub.f32 %f216, %f211, %f3; fma.rn.f32 %f217, %f214, %f213, %f263; fma.rn.f32 %f218, %f215, %f213, %f264; fma.rn.f32 %f265, %f216, %f213, %f265; div.rn.f32 %f219, %f203, %f89; mul.f32 %f220, %f210, %f219; sub.f32 %f263, %f217, %f220; fma.rn.f32 %f264, %f209, %f219, %f218; BB0_48: add.s32 %r38, %r3, 1; @%p48 bra BB0_50; rem.s32 %r110, %r38, %r45; add.s32 %r111, %r110, %r45; rem.s32 %r127, %r111, %r45; bra.uni BB0_51; BB0_50: add.s32 %r112, %r45, -1; min.s32 %r127, %r38, %r112; BB0_51: mad.lo.s32 %r113, %r127, %r44, %r2; mad.lo.s32 %r42, %r113, %r43, %r1; setp.ge.s32 %p59, %r38, %r45; mov.f32 %f284, 0f00000000; and.pred %p61, %p59, %p48; mov.f32 %f285, %f284; mov.f32 %f286, %f284; @%p61 bra BB0_53; mul.wide.s32 %rd70, %r42, 4; add.s64 %rd71, %rd6, %rd70; ld.global.nc.f32 %f286, [%rd71]; add.s64 %rd72, %rd5, %rd70; ld.global.nc.f32 %f285, [%rd72]; add.s64 %rd73, %rd4, %rd70; ld.global.nc.f32 %f284, [%rd73]; BB0_53: mul.f32 %f224, %f286, %f286; fma.rn.f32 %f225, %f285, %f285, %f224; fma.rn.f32 %f75, %f284, %f284, %f225; setp.eq.f32 %p62, %f75, 0f00000000; mov.u16 %rs42, %rs1; @%p62 bra BB0_55; cvt.s64.s32 %rd74, %r42; add.s64 %rd75, %rd3, %rd74; ld.global.nc.u8 %rs42, [%rd75]; BB0_55: setp.gt.u16 %p63, %rs42, %rs1; cvt.u32.u16 %r114, %rs42; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p63; selp.b32 %r117, %r115, %r7, %p63; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd76, %r121, 4; add.s64 %rd7, %rd2, %rd76; add.s64 %rd8, %rd1, %rd76; and.pred %p66, %p62, %p13; @%p66 bra BB0_57; ld.global.nc.f32 %f226, [%rd7]; add.f32 %f227, %f226, %f226; ld.global.nc.f32 %f228, [%rd8]; div.rn.f32 %f229, %f228, %f227; mul.f32 %f230, %f229, %f89; mul.f32 %f231, %f2, %f230; sub.f32 %f232, %f1, %f231; fma.rn.f32 %f233, %f1, %f230, %f2; 
selp.f32 %f234, %f3, %f284, %p62; selp.f32 %f235, %f233, %f285, %p62; selp.f32 %f236, %f232, %f286, %p62; mul.f32 %f237, %f89, %f89; div.rn.f32 %f238, %f227, %f237; sub.f32 %f239, %f236, %f1; sub.f32 %f240, %f235, %f2; sub.f32 %f241, %f234, %f3; fma.rn.f32 %f242, %f239, %f238, %f263; fma.rn.f32 %f243, %f240, %f238, %f264; fma.rn.f32 %f265, %f241, %f238, %f265; div.rn.f32 %f244, %f228, %f89; fma.rn.f32 %f263, %f235, %f244, %f242; mul.f32 %f245, %f236, %f244; sub.f32 %f264, %f243, %f245; BB0_57: setp.eq.s64 %p68, %rd12, 0; @%p68 bra BB0_59; cvta.to.global.u64 %rd77, %rd12; add.s64 %rd79, %rd77, %rd19; ld.global.nc.f32 %f246, [%rd79]; mul.f32 %f290, %f246, %f290; BB0_59: setp.eq.f32 %p69, %f290, 0f00000000; mov.f32 %f291, 0f00000000; @%p69 bra BB0_61; rcp.rn.f32 %f291, %f290; BB0_61: cvta.to.global.u64 %rd80, %rd11; cvta.to.global.u64 %rd81, %rd10; cvta.to.global.u64 %rd82, %rd9; add.s64 %rd84, %rd82, %rd19; ld.global.f32 %f248, [%rd84]; fma.rn.f32 %f249, %f263, %f291, %f248; st.global.f32 [%rd84], %f249; add.s64 %rd85, %rd81, %rd19; ld.global.f32 %f250, [%rd85]; fma.rn.f32 %f251, %f264, %f291, %f250; st.global.f32 [%rd85], %f251; add.s64 %rd86, %rd80, %rd19; ld.global.f32 %f252, [%rd86]; fma.rn.f32 %f253, %f265, %f291, %f252; st.global.f32 [%rd86], %f253; BB0_62: ret; } ` adddmibulk_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 
adddmibulk_param_18 ) { .reg .pred %p<70>; .reg .b16 %rs<43>; .reg .f32 %f<292>; .reg .b32 %r<128>; .reg .b64 %rd<87>; ld.param.u64 %rd9, [adddmibulk_param_0]; ld.param.u64 %rd10, [adddmibulk_param_1]; ld.param.u64 %rd11, [adddmibulk_param_2]; ld.param.u64 %rd13, [adddmibulk_param_3]; ld.param.u64 %rd14, [adddmibulk_param_4]; ld.param.u64 %rd15, [adddmibulk_param_5]; ld.param.u64 %rd12, [adddmibulk_param_6]; ld.param.f32 %f290, [adddmibulk_param_7]; ld.param.u64 %rd16, [adddmibulk_param_8]; ld.param.u64 %rd17, [adddmibulk_param_9]; ld.param.u64 %rd18, [adddmibulk_param_10]; ld.param.f32 %f87, [adddmibulk_param_11]; ld.param.f32 %f88, [adddmibulk_param_12]; ld.param.f32 %f89, [adddmibulk_param_13]; ld.param.u32 %r43, [adddmibulk_param_14]; ld.param.u32 %r44, [adddmibulk_param_15]; ld.param.u32 %r45, [adddmibulk_param_16]; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; cvta.to.global.u64 %rd1, %rd17; cvta.to.global.u64 %rd2, %rd16; cvta.to.global.u64 %rd3, %rd18; cvta.to.global.u64 %rd4, %rd15; cvta.to.global.u64 %rd5, %rd14; cvta.to.global.u64 %rd6, %rd13; mov.u32 %r46, %ntid.x; mov.u32 %r47, %ctaid.x; mov.u32 %r48, %tid.x; mad.lo.s32 %r1, %r46, %r47, %r48; mov.u32 %r49, %ntid.y; mov.u32 %r50, %ctaid.y; mov.u32 %r51, %tid.y; mad.lo.s32 %r2, %r49, %r50, %r51; mov.u32 %r52, %ntid.z; mov.u32 %r53, %ctaid.z; mov.u32 %r54, %tid.z; mad.lo.s32 %r3, %r52, %r53, %r54; setp.ge.s32 %p1, %r2, %r44; setp.ge.s32 %p2, %r1, %r43; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r45; or.pred %p5, %p3, %p4; @%p5 bra BB0_62; mul.lo.s32 %r4, %r3, %r44; add.s32 %r55, %r4, %r2; mul.lo.s32 %r5, %r55, %r43; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd19, %r6, 4; add.s64 %rd20, %rd6, %rd19; cvt.s64.s32 %rd21, %r6; add.s64 %rd22, %rd5, %rd19; add.s64 %rd23, %rd4, %rd19; add.s64 %rd24, %rd3, %rd21; ld.global.nc.u8 %rs1, [%rd24]; cvt.u32.u16 %r56, %rs1; and.b32 %r7, %r56, 255; ld.global.nc.f32 %f1, [%rd20]; ld.global.nc.f32 %f2, [%rd22]; mul.f32 %f90, %f2, %f2; 
fma.rn.f32 %f91, %f1, %f1, %f90; ld.global.nc.f32 %f3, [%rd23]; fma.rn.f32 %f92, %f3, %f3, %f91; setp.eq.f32 %p6, %f92, 0f00000000; @%p6 bra BB0_62; and.b16 %rs2, %rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r57, %r8, %r43; add.s32 %r58, %r57, %r43; rem.s32 %r122, %r58, %r43; bra.uni BB0_5; BB0_4: mov.u32 %r59, 0; max.s32 %r122, %r8, %r59; BB0_5: add.s32 %r12, %r122, %r5; setp.lt.s32 %p9, %r8, 0; mov.f32 %f254, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f255, %f254; mov.f32 %f256, %f254; @%p10 bra BB0_7; mul.wide.s32 %rd25, %r12, 4; add.s64 %rd26, %rd6, %rd25; ld.global.nc.f32 %f254, [%rd26]; add.s64 %rd27, %rd5, %rd25; ld.global.nc.f32 %f255, [%rd27]; add.s64 %rd28, %rd4, %rd25; ld.global.nc.f32 %f256, [%rd28]; BB0_7: mul.f32 %f96, %f255, %f255; fma.rn.f32 %f97, %f254, %f254, %f96; fma.rn.f32 %f10, %f256, %f256, %f97; setp.eq.f32 %p11, %f10, 0f00000000; mov.u16 %rs37, %rs1; @%p11 bra BB0_9; cvt.s64.s32 %rd29, %r12; add.s64 %rd30, %rd3, %rd29; ld.global.nc.u8 %rs37, [%rd30]; BB0_9: setp.gt.u16 %p12, %rs37, %rs1; cvt.u32.u16 %r60, %rs37; and.b32 %r61, %r60, 255; selp.b32 %r62, %r7, %r61, %p12; selp.b32 %r63, %r61, %r7, %p12; add.s32 %r64, %r63, 1; mul.lo.s32 %r65, %r64, %r63; shr.u32 %r66, %r65, 1; add.s32 %r13, %r66, %r62; setp.ne.s16 %p13, %rs18, 0; mov.f32 %f263, 0f00000000; and.pred %p15, %p11, %p13; mov.f32 %f264, %f263; mov.f32 %f265, %f263; @%p15 bra BB0_11; mul.wide.s32 %rd31, %r13, 4; add.s64 %rd32, %rd2, %rd31; ld.global.nc.f32 %f101, [%rd32]; add.f32 %f102, %f101, %f101; add.s64 %rd33, %rd1, %rd31; ld.global.nc.f32 %f103, [%rd33]; div.rn.f32 %f104, %f103, %f102; mul.f32 %f105, %f104, %f87; fma.rn.f32 %f106, %f3, %f105, %f2; mul.f32 %f107, %f2, %f105; sub.f32 %f108, %f3, %f107; selp.f32 %f109, %f1, %f254, %p11; selp.f32 %f110, %f106, %f255, %p11; selp.f32 %f111, %f108, %f256, %p11; mul.f32 %f112, %f87, %f87; div.rn.f32 %f113, %f102, %f112; sub.f32 %f114, %f109, %f1; sub.f32 %f115, %f110, %f2; sub.f32 %f116, %f111, 
%f3; fma.rn.f32 %f263, %f114, %f113, 0f00000000; fma.rn.f32 %f117, %f115, %f113, 0f00000000; fma.rn.f32 %f118, %f116, %f113, 0f00000000; div.rn.f32 %f119, %f103, %f87; mul.f32 %f120, %f111, %f119; sub.f32 %f264, %f117, %f120; fma.rn.f32 %f265, %f110, %f119, %f118; BB0_11: add.s32 %r14, %r1, 1; @%p7 bra BB0_13; rem.s32 %r67, %r14, %r43; add.s32 %r68, %r67, %r43; rem.s32 %r123, %r68, %r43; bra.uni BB0_14; BB0_13: add.s32 %r69, %r43, -1; min.s32 %r123, %r14, %r69; BB0_14: add.s32 %r18, %r123, %r5; setp.ge.s32 %p18, %r14, %r43; mov.f32 %f260, 0f00000000; and.pred %p20, %p18, %p7; mov.f32 %f261, %f260; mov.f32 %f262, %f260; @%p20 bra BB0_16; mul.wide.s32 %rd34, %r18, 4; add.s64 %rd35, %rd6, %rd34; ld.global.nc.f32 %f260, [%rd35]; add.s64 %rd36, %rd5, %rd34; ld.global.nc.f32 %f261, [%rd36]; add.s64 %rd37, %rd4, %rd34; ld.global.nc.f32 %f262, [%rd37]; BB0_16: mul.f32 %f124, %f261, %f261; fma.rn.f32 %f125, %f260, %f260, %f124; fma.rn.f32 %f23, %f262, %f262, %f125; setp.eq.f32 %p21, %f23, 0f00000000; mov.u16 %rs38, %rs1; @%p21 bra BB0_18; cvt.s64.s32 %rd38, %r18; add.s64 %rd39, %rd3, %rd38; ld.global.nc.u8 %rs38, [%rd39]; BB0_18: setp.gt.u16 %p22, %rs38, %rs1; cvt.u32.u16 %r70, %rs38; and.b32 %r71, %r70, 255; selp.b32 %r72, %r7, %r71, %p22; selp.b32 %r73, %r71, %r7, %p22; add.s32 %r74, %r73, 1; mul.lo.s32 %r75, %r74, %r73; shr.u32 %r76, %r75, 1; add.s32 %r19, %r76, %r72; and.pred %p25, %p21, %p13; @%p25 bra BB0_20; mul.wide.s32 %rd40, %r19, 4; add.s64 %rd41, %rd2, %rd40; ld.global.nc.f32 %f126, [%rd41]; add.f32 %f127, %f126, %f126; add.s64 %rd42, %rd1, %rd40; ld.global.nc.f32 %f128, [%rd42]; div.rn.f32 %f129, %f128, %f127; mul.f32 %f130, %f129, %f87; mul.f32 %f131, %f3, %f130; sub.f32 %f132, %f2, %f131; fma.rn.f32 %f133, %f2, %f130, %f3; selp.f32 %f134, %f1, %f260, %p21; selp.f32 %f135, %f132, %f261, %p21; selp.f32 %f136, %f133, %f262, %p21; mul.f32 %f137, %f87, %f87; div.rn.f32 %f138, %f127, %f137; sub.f32 %f139, %f134, %f1; sub.f32 %f140, %f135, %f2; sub.f32 %f141, %f136, 
%f3; fma.rn.f32 %f263, %f139, %f138, %f263; fma.rn.f32 %f142, %f140, %f138, %f264; fma.rn.f32 %f143, %f141, %f138, %f265; div.rn.f32 %f144, %f128, %f87; fma.rn.f32 %f264, %f136, %f144, %f142; mul.f32 %f145, %f135, %f144; sub.f32 %f265, %f143, %f145; BB0_20: and.b16 %rs7, %rs17, 2; setp.eq.s16 %p27, %rs7, 0; add.s32 %r20, %r2, -1; @%p27 bra BB0_22; rem.s32 %r77, %r20, %r44; add.s32 %r78, %r77, %r44; rem.s32 %r124, %r78, %r44; bra.uni BB0_23; BB0_22: mov.u32 %r79, 0; max.s32 %r124, %r20, %r79; BB0_23: add.s32 %r80, %r124, %r4; mad.lo.s32 %r24, %r80, %r43, %r1; setp.lt.s32 %p29, %r20, 0; mov.f32 %f266, 0f00000000; and.pred %p30, %p29, %p27; mov.f32 %f267, %f266; mov.f32 %f268, %f266; @%p30 bra BB0_25; mul.wide.s32 %rd43, %r24, 4; add.s64 %rd44, %rd6, %rd43; ld.global.nc.f32 %f266, [%rd44]; add.s64 %rd45, %rd5, %rd43; ld.global.nc.f32 %f267, [%rd45]; add.s64 %rd46, %rd4, %rd43; ld.global.nc.f32 %f268, [%rd46]; BB0_25: mul.f32 %f149, %f267, %f267; fma.rn.f32 %f150, %f266, %f266, %f149; fma.rn.f32 %f36, %f268, %f268, %f150; setp.eq.f32 %p31, %f36, 0f00000000; mov.u16 %rs39, %rs1; @%p31 bra BB0_27; cvt.s64.s32 %rd47, %r24; add.s64 %rd48, %rd3, %rd47; ld.global.nc.u8 %rs39, [%rd48]; BB0_27: setp.gt.u16 %p32, %rs39, %rs1; cvt.u32.u16 %r81, %rs39; and.b32 %r82, %r81, 255; selp.b32 %r83, %r7, %r82, %p32; selp.b32 %r84, %r82, %r7, %p32; add.s32 %r85, %r84, 1; mul.lo.s32 %r86, %r85, %r84; shr.u32 %r87, %r86, 1; add.s32 %r25, %r87, %r83; and.pred %p35, %p31, %p13; @%p35 bra BB0_29; mul.wide.s32 %rd49, %r25, 4; add.s64 %rd50, %rd2, %rd49; ld.global.nc.f32 %f151, [%rd50]; add.f32 %f152, %f151, %f151; add.s64 %rd51, %rd1, %rd49; ld.global.nc.f32 %f153, [%rd51]; div.rn.f32 %f154, %f153, %f152; mul.f32 %f155, %f154, %f88; mul.f32 %f156, %f3, %f155; sub.f32 %f157, %f1, %f156; fma.rn.f32 %f158, %f1, %f155, %f3; selp.f32 %f159, %f157, %f266, %p31; selp.f32 %f160, %f2, %f267, %p31; selp.f32 %f161, %f158, %f268, %p31; mul.f32 %f162, %f88, %f88; div.rn.f32 %f163, %f152, %f162; sub.f32 
%f164, %f159, %f1; sub.f32 %f165, %f160, %f2; sub.f32 %f166, %f161, %f3; fma.rn.f32 %f167, %f164, %f163, %f263; fma.rn.f32 %f264, %f165, %f163, %f264; fma.rn.f32 %f168, %f166, %f163, %f265; div.rn.f32 %f169, %f153, %f88; fma.rn.f32 %f263, %f161, %f169, %f167; mul.f32 %f170, %f159, %f169; sub.f32 %f265, %f168, %f170; BB0_29: add.s32 %r26, %r2, 1; @%p27 bra BB0_31; rem.s32 %r88, %r26, %r44; add.s32 %r89, %r88, %r44; rem.s32 %r125, %r89, %r44; bra.uni BB0_32; BB0_31: add.s32 %r90, %r44, -1; min.s32 %r125, %r26, %r90; BB0_32: add.s32 %r91, %r125, %r4; mad.lo.s32 %r30, %r91, %r43, %r1; setp.ge.s32 %p38, %r26, %r44; mov.f32 %f272, 0f00000000; and.pred %p40, %p38, %p27; mov.f32 %f273, %f272; mov.f32 %f274, %f272; @%p40 bra BB0_34; mul.wide.s32 %rd52, %r30, 4; add.s64 %rd53, %rd6, %rd52; ld.global.nc.f32 %f272, [%rd53]; add.s64 %rd54, %rd5, %rd52; ld.global.nc.f32 %f273, [%rd54]; add.s64 %rd55, %rd4, %rd52; ld.global.nc.f32 %f274, [%rd55]; BB0_34: mul.f32 %f174, %f273, %f273; fma.rn.f32 %f175, %f272, %f272, %f174; fma.rn.f32 %f49, %f274, %f274, %f175; setp.eq.f32 %p41, %f49, 0f00000000; mov.u16 %rs40, %rs1; @%p41 bra BB0_36; cvt.s64.s32 %rd56, %r30; add.s64 %rd57, %rd3, %rd56; ld.global.nc.u8 %rs40, [%rd57]; BB0_36: setp.gt.u16 %p42, %rs40, %rs1; cvt.u32.u16 %r92, %rs40; and.b32 %r93, %r92, 255; selp.b32 %r94, %r7, %r93, %p42; selp.b32 %r95, %r93, %r7, %p42; add.s32 %r96, %r95, 1; mul.lo.s32 %r97, %r96, %r95; shr.u32 %r98, %r97, 1; add.s32 %r31, %r98, %r94; and.pred %p45, %p41, %p13; @%p45 bra BB0_38; mul.wide.s32 %rd58, %r31, 4; add.s64 %rd59, %rd2, %rd58; ld.global.nc.f32 %f176, [%rd59]; add.f32 %f177, %f176, %f176; add.s64 %rd60, %rd1, %rd58; ld.global.nc.f32 %f178, [%rd60]; div.rn.f32 %f179, %f178, %f177; mul.f32 %f180, %f179, %f88; fma.rn.f32 %f181, %f3, %f180, %f1; mul.f32 %f182, %f1, %f180; sub.f32 %f183, %f3, %f182; selp.f32 %f184, %f181, %f272, %p41; selp.f32 %f185, %f2, %f273, %p41; selp.f32 %f186, %f183, %f274, %p41; mul.f32 %f187, %f88, %f88; div.rn.f32 %f188, 
%f177, %f187; sub.f32 %f189, %f184, %f1; sub.f32 %f190, %f185, %f2; sub.f32 %f191, %f186, %f3; fma.rn.f32 %f192, %f189, %f188, %f263; fma.rn.f32 %f264, %f190, %f188, %f264; fma.rn.f32 %f193, %f191, %f188, %f265; div.rn.f32 %f194, %f178, %f88; mul.f32 %f195, %f186, %f194; sub.f32 %f263, %f192, %f195; fma.rn.f32 %f265, %f184, %f194, %f193; BB0_38: setp.eq.s32 %p47, %r45, 1; @%p47 bra BB0_57; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p48, %rs12, 0; add.s32 %r32, %r3, -1; @%p48 bra BB0_41; rem.s32 %r99, %r32, %r45; add.s32 %r100, %r99, %r45; rem.s32 %r126, %r100, %r45; bra.uni BB0_42; BB0_41: mov.u32 %r101, 0; max.s32 %r126, %r32, %r101; BB0_42: mad.lo.s32 %r102, %r126, %r44, %r2; mad.lo.s32 %r36, %r102, %r43, %r1; setp.lt.s32 %p50, %r32, 0; mov.f32 %f278, 0f00000000; and.pred %p51, %p50, %p48; mov.f32 %f279, %f278; mov.f32 %f280, %f278; @%p51 bra BB0_44; mul.wide.s32 %rd61, %r36, 4; add.s64 %rd62, %rd6, %rd61; ld.global.nc.f32 %f278, [%rd62]; add.s64 %rd63, %rd5, %rd61; ld.global.nc.f32 %f279, [%rd63]; add.s64 %rd64, %rd4, %rd61; ld.global.nc.f32 %f280, [%rd64]; BB0_44: mul.f32 %f199, %f279, %f279; fma.rn.f32 %f200, %f278, %f278, %f199; fma.rn.f32 %f62, %f280, %f280, %f200; setp.eq.f32 %p52, %f62, 0f00000000; mov.u16 %rs41, %rs1; @%p52 bra BB0_46; cvt.s64.s32 %rd65, %r36; add.s64 %rd66, %rd3, %rd65; ld.global.nc.u8 %rs41, [%rd66]; BB0_46: setp.gt.u16 %p53, %rs41, %rs1; cvt.u32.u16 %r103, %rs41; and.b32 %r104, %r103, 255; selp.b32 %r105, %r7, %r104, %p53; selp.b32 %r106, %r104, %r7, %p53; add.s32 %r107, %r106, 1; mul.lo.s32 %r108, %r107, %r106; shr.u32 %r109, %r108, 1; add.s32 %r37, %r109, %r105; and.pred %p56, %p52, %p13; @%p56 bra BB0_48; mul.wide.s32 %rd67, %r37, 4; add.s64 %rd68, %rd2, %rd67; ld.global.nc.f32 %f201, [%rd68]; add.f32 %f202, %f201, %f201; add.s64 %rd69, %rd1, %rd67; ld.global.nc.f32 %f203, [%rd69]; div.rn.f32 %f204, %f203, %f202; mul.f32 %f205, %f204, %f89; fma.rn.f32 %f206, %f2, %f205, %f1; mul.f32 %f207, %f1, %f205; sub.f32 %f208, %f2, %f207; selp.f32 
%f209, %f206, %f278, %p52; selp.f32 %f210, %f208, %f279, %p52; selp.f32 %f211, %f3, %f280, %p52; mul.f32 %f212, %f89, %f89; div.rn.f32 %f213, %f202, %f212; sub.f32 %f214, %f209, %f1; sub.f32 %f215, %f210, %f2; sub.f32 %f216, %f211, %f3; fma.rn.f32 %f217, %f214, %f213, %f263; fma.rn.f32 %f218, %f215, %f213, %f264; fma.rn.f32 %f265, %f216, %f213, %f265; div.rn.f32 %f219, %f203, %f89; mul.f32 %f220, %f210, %f219; sub.f32 %f263, %f217, %f220; fma.rn.f32 %f264, %f209, %f219, %f218; BB0_48: add.s32 %r38, %r3, 1; @%p48 bra BB0_50; rem.s32 %r110, %r38, %r45; add.s32 %r111, %r110, %r45; rem.s32 %r127, %r111, %r45; bra.uni BB0_51; BB0_50: add.s32 %r112, %r45, -1; min.s32 %r127, %r38, %r112; BB0_51: mad.lo.s32 %r113, %r127, %r44, %r2; mad.lo.s32 %r42, %r113, %r43, %r1; setp.ge.s32 %p59, %r38, %r45; mov.f32 %f284, 0f00000000; and.pred %p61, %p59, %p48; mov.f32 %f285, %f284; mov.f32 %f286, %f284; @%p61 bra BB0_53; mul.wide.s32 %rd70, %r42, 4; add.s64 %rd71, %rd6, %rd70; ld.global.nc.f32 %f286, [%rd71]; add.s64 %rd72, %rd5, %rd70; ld.global.nc.f32 %f285, [%rd72]; add.s64 %rd73, %rd4, %rd70; ld.global.nc.f32 %f284, [%rd73]; BB0_53: mul.f32 %f224, %f286, %f286; fma.rn.f32 %f225, %f285, %f285, %f224; fma.rn.f32 %f75, %f284, %f284, %f225; setp.eq.f32 %p62, %f75, 0f00000000; mov.u16 %rs42, %rs1; @%p62 bra BB0_55; cvt.s64.s32 %rd74, %r42; add.s64 %rd75, %rd3, %rd74; ld.global.nc.u8 %rs42, [%rd75]; BB0_55: setp.gt.u16 %p63, %rs42, %rs1; cvt.u32.u16 %r114, %rs42; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p63; selp.b32 %r117, %r115, %r7, %p63; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd76, %r121, 4; add.s64 %rd7, %rd2, %rd76; add.s64 %rd8, %rd1, %rd76; and.pred %p66, %p62, %p13; @%p66 bra BB0_57; ld.global.nc.f32 %f226, [%rd7]; add.f32 %f227, %f226, %f226; ld.global.nc.f32 %f228, [%rd8]; div.rn.f32 %f229, %f228, %f227; mul.f32 %f230, %f229, %f89; mul.f32 %f231, %f2, %f230; sub.f32 %f232, 
%f1, %f231; fma.rn.f32 %f233, %f1, %f230, %f2; selp.f32 %f234, %f3, %f284, %p62; selp.f32 %f235, %f233, %f285, %p62; selp.f32 %f236, %f232, %f286, %p62; mul.f32 %f237, %f89, %f89; div.rn.f32 %f238, %f227, %f237; sub.f32 %f239, %f236, %f1; sub.f32 %f240, %f235, %f2; sub.f32 %f241, %f234, %f3; fma.rn.f32 %f242, %f239, %f238, %f263; fma.rn.f32 %f243, %f240, %f238, %f264; fma.rn.f32 %f265, %f241, %f238, %f265; div.rn.f32 %f244, %f228, %f89; fma.rn.f32 %f263, %f235, %f244, %f242; mul.f32 %f245, %f236, %f244; sub.f32 %f264, %f243, %f245; BB0_57: setp.eq.s64 %p68, %rd12, 0; @%p68 bra BB0_59; cvta.to.global.u64 %rd77, %rd12; add.s64 %rd79, %rd77, %rd19; ld.global.nc.f32 %f246, [%rd79]; mul.f32 %f290, %f246, %f290; BB0_59: setp.eq.f32 %p69, %f290, 0f00000000; mov.f32 %f291, 0f00000000; @%p69 bra BB0_61; rcp.rn.f32 %f291, %f290; BB0_61: cvta.to.global.u64 %rd80, %rd11; cvta.to.global.u64 %rd81, %rd10; cvta.to.global.u64 %rd82, %rd9; add.s64 %rd84, %rd82, %rd19; ld.global.f32 %f248, [%rd84]; fma.rn.f32 %f249, %f263, %f291, %f248; st.global.f32 [%rd84], %f249; add.s64 %rd85, %rd81, %rd19; ld.global.f32 %f250, [%rd85]; fma.rn.f32 %f251, %f264, %f291, %f250; st.global.f32 [%rd85], %f251; add.s64 %rd86, %rd80, %rd19; ld.global.f32 %f252, [%rd86]; fma.rn.f32 %f253, %f265, %f291, %f252; st.global.f32 [%rd86], %f253; BB0_62: ret; } ` adddmibulk_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 
adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<70>; .reg .b16 %rs<43>; .reg .f32 %f<292>; .reg .b32 %r<128>; .reg .b64 %rd<87>; ld.param.u64 %rd9, [adddmibulk_param_0]; ld.param.u64 %rd10, [adddmibulk_param_1]; ld.param.u64 %rd11, [adddmibulk_param_2]; ld.param.u64 %rd13, [adddmibulk_param_3]; ld.param.u64 %rd14, [adddmibulk_param_4]; ld.param.u64 %rd15, [adddmibulk_param_5]; ld.param.u64 %rd12, [adddmibulk_param_6]; ld.param.f32 %f290, [adddmibulk_param_7]; ld.param.u64 %rd16, [adddmibulk_param_8]; ld.param.u64 %rd17, [adddmibulk_param_9]; ld.param.u64 %rd18, [adddmibulk_param_10]; ld.param.f32 %f87, [adddmibulk_param_11]; ld.param.f32 %f88, [adddmibulk_param_12]; ld.param.f32 %f89, [adddmibulk_param_13]; ld.param.u32 %r43, [adddmibulk_param_14]; ld.param.u32 %r44, [adddmibulk_param_15]; ld.param.u32 %r45, [adddmibulk_param_16]; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; cvta.to.global.u64 %rd1, %rd17; cvta.to.global.u64 %rd2, %rd16; cvta.to.global.u64 %rd3, %rd18; cvta.to.global.u64 %rd4, %rd15; cvta.to.global.u64 %rd5, %rd14; cvta.to.global.u64 %rd6, %rd13; mov.u32 %r46, %ntid.x; mov.u32 %r47, %ctaid.x; mov.u32 %r48, %tid.x; mad.lo.s32 %r1, %r46, %r47, %r48; mov.u32 %r49, %ntid.y; mov.u32 %r50, %ctaid.y; mov.u32 %r51, %tid.y; mad.lo.s32 %r2, %r49, %r50, %r51; mov.u32 %r52, %ntid.z; mov.u32 %r53, %ctaid.z; mov.u32 %r54, %tid.z; mad.lo.s32 %r3, %r52, %r53, %r54; setp.ge.s32 %p1, %r2, %r44; setp.ge.s32 %p2, %r1, %r43; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r45; or.pred %p5, %p3, %p4; @%p5 bra BB0_62; mul.lo.s32 %r4, %r3, %r44; add.s32 %r55, %r4, %r2; mul.lo.s32 %r5, %r55, %r43; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd19, %r6, 4; add.s64 %rd20, %rd6, %rd19; cvt.s64.s32 %rd21, %r6; add.s64 %rd22, %rd5, %rd19; add.s64 %rd23, %rd4, %rd19; add.s64 %rd24, %rd3, %rd21; ld.global.nc.u8 %rs1, [%rd24]; cvt.u32.u16 %r56, %rs1; and.b32 %r7, %r56, 255; ld.global.nc.f32 %f1, 
[%rd20]; ld.global.nc.f32 %f2, [%rd22]; mul.f32 %f90, %f2, %f2; fma.rn.f32 %f91, %f1, %f1, %f90; ld.global.nc.f32 %f3, [%rd23]; fma.rn.f32 %f92, %f3, %f3, %f91; setp.eq.f32 %p6, %f92, 0f00000000; @%p6 bra BB0_62; and.b16 %rs2, %rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r57, %r8, %r43; add.s32 %r58, %r57, %r43; rem.s32 %r122, %r58, %r43; bra.uni BB0_5; BB0_4: mov.u32 %r59, 0; max.s32 %r122, %r8, %r59; BB0_5: add.s32 %r12, %r122, %r5; setp.lt.s32 %p9, %r8, 0; mov.f32 %f254, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f255, %f254; mov.f32 %f256, %f254; @%p10 bra BB0_7; mul.wide.s32 %rd25, %r12, 4; add.s64 %rd26, %rd6, %rd25; ld.global.nc.f32 %f254, [%rd26]; add.s64 %rd27, %rd5, %rd25; ld.global.nc.f32 %f255, [%rd27]; add.s64 %rd28, %rd4, %rd25; ld.global.nc.f32 %f256, [%rd28]; BB0_7: mul.f32 %f96, %f255, %f255; fma.rn.f32 %f97, %f254, %f254, %f96; fma.rn.f32 %f10, %f256, %f256, %f97; setp.eq.f32 %p11, %f10, 0f00000000; mov.u16 %rs37, %rs1; @%p11 bra BB0_9; cvt.s64.s32 %rd29, %r12; add.s64 %rd30, %rd3, %rd29; ld.global.nc.u8 %rs37, [%rd30]; BB0_9: setp.gt.u16 %p12, %rs37, %rs1; cvt.u32.u16 %r60, %rs37; and.b32 %r61, %r60, 255; selp.b32 %r62, %r7, %r61, %p12; selp.b32 %r63, %r61, %r7, %p12; add.s32 %r64, %r63, 1; mul.lo.s32 %r65, %r64, %r63; shr.u32 %r66, %r65, 1; add.s32 %r13, %r66, %r62; setp.ne.s16 %p13, %rs18, 0; mov.f32 %f263, 0f00000000; and.pred %p15, %p11, %p13; mov.f32 %f264, %f263; mov.f32 %f265, %f263; @%p15 bra BB0_11; mul.wide.s32 %rd31, %r13, 4; add.s64 %rd32, %rd2, %rd31; ld.global.nc.f32 %f101, [%rd32]; add.f32 %f102, %f101, %f101; add.s64 %rd33, %rd1, %rd31; ld.global.nc.f32 %f103, [%rd33]; div.rn.f32 %f104, %f103, %f102; mul.f32 %f105, %f104, %f87; fma.rn.f32 %f106, %f3, %f105, %f2; mul.f32 %f107, %f2, %f105; sub.f32 %f108, %f3, %f107; selp.f32 %f109, %f1, %f254, %p11; selp.f32 %f110, %f106, %f255, %p11; selp.f32 %f111, %f108, %f256, %p11; mul.f32 %f112, %f87, %f87; div.rn.f32 %f113, %f102, %f112; sub.f32 
%f114, %f109, %f1; sub.f32 %f115, %f110, %f2; sub.f32 %f116, %f111, %f3; fma.rn.f32 %f263, %f114, %f113, 0f00000000; fma.rn.f32 %f117, %f115, %f113, 0f00000000; fma.rn.f32 %f118, %f116, %f113, 0f00000000; div.rn.f32 %f119, %f103, %f87; mul.f32 %f120, %f111, %f119; sub.f32 %f264, %f117, %f120; fma.rn.f32 %f265, %f110, %f119, %f118; BB0_11: add.s32 %r14, %r1, 1; @%p7 bra BB0_13; rem.s32 %r67, %r14, %r43; add.s32 %r68, %r67, %r43; rem.s32 %r123, %r68, %r43; bra.uni BB0_14; BB0_13: add.s32 %r69, %r43, -1; min.s32 %r123, %r14, %r69; BB0_14: add.s32 %r18, %r123, %r5; setp.ge.s32 %p18, %r14, %r43; mov.f32 %f260, 0f00000000; and.pred %p20, %p18, %p7; mov.f32 %f261, %f260; mov.f32 %f262, %f260; @%p20 bra BB0_16; mul.wide.s32 %rd34, %r18, 4; add.s64 %rd35, %rd6, %rd34; ld.global.nc.f32 %f260, [%rd35]; add.s64 %rd36, %rd5, %rd34; ld.global.nc.f32 %f261, [%rd36]; add.s64 %rd37, %rd4, %rd34; ld.global.nc.f32 %f262, [%rd37]; BB0_16: mul.f32 %f124, %f261, %f261; fma.rn.f32 %f125, %f260, %f260, %f124; fma.rn.f32 %f23, %f262, %f262, %f125; setp.eq.f32 %p21, %f23, 0f00000000; mov.u16 %rs38, %rs1; @%p21 bra BB0_18; cvt.s64.s32 %rd38, %r18; add.s64 %rd39, %rd3, %rd38; ld.global.nc.u8 %rs38, [%rd39]; BB0_18: setp.gt.u16 %p22, %rs38, %rs1; cvt.u32.u16 %r70, %rs38; and.b32 %r71, %r70, 255; selp.b32 %r72, %r7, %r71, %p22; selp.b32 %r73, %r71, %r7, %p22; add.s32 %r74, %r73, 1; mul.lo.s32 %r75, %r74, %r73; shr.u32 %r76, %r75, 1; add.s32 %r19, %r76, %r72; and.pred %p25, %p21, %p13; @%p25 bra BB0_20; mul.wide.s32 %rd40, %r19, 4; add.s64 %rd41, %rd2, %rd40; ld.global.nc.f32 %f126, [%rd41]; add.f32 %f127, %f126, %f126; add.s64 %rd42, %rd1, %rd40; ld.global.nc.f32 %f128, [%rd42]; div.rn.f32 %f129, %f128, %f127; mul.f32 %f130, %f129, %f87; mul.f32 %f131, %f3, %f130; sub.f32 %f132, %f2, %f131; fma.rn.f32 %f133, %f2, %f130, %f3; selp.f32 %f134, %f1, %f260, %p21; selp.f32 %f135, %f132, %f261, %p21; selp.f32 %f136, %f133, %f262, %p21; mul.f32 %f137, %f87, %f87; div.rn.f32 %f138, %f127, %f137; sub.f32 
%f139, %f134, %f1; sub.f32 %f140, %f135, %f2; sub.f32 %f141, %f136, %f3; fma.rn.f32 %f263, %f139, %f138, %f263; fma.rn.f32 %f142, %f140, %f138, %f264; fma.rn.f32 %f143, %f141, %f138, %f265; div.rn.f32 %f144, %f128, %f87; fma.rn.f32 %f264, %f136, %f144, %f142; mul.f32 %f145, %f135, %f144; sub.f32 %f265, %f143, %f145; BB0_20: and.b16 %rs7, %rs17, 2; setp.eq.s16 %p27, %rs7, 0; add.s32 %r20, %r2, -1; @%p27 bra BB0_22; rem.s32 %r77, %r20, %r44; add.s32 %r78, %r77, %r44; rem.s32 %r124, %r78, %r44; bra.uni BB0_23; BB0_22: mov.u32 %r79, 0; max.s32 %r124, %r20, %r79; BB0_23: add.s32 %r80, %r124, %r4; mad.lo.s32 %r24, %r80, %r43, %r1; setp.lt.s32 %p29, %r20, 0; mov.f32 %f266, 0f00000000; and.pred %p30, %p29, %p27; mov.f32 %f267, %f266; mov.f32 %f268, %f266; @%p30 bra BB0_25; mul.wide.s32 %rd43, %r24, 4; add.s64 %rd44, %rd6, %rd43; ld.global.nc.f32 %f266, [%rd44]; add.s64 %rd45, %rd5, %rd43; ld.global.nc.f32 %f267, [%rd45]; add.s64 %rd46, %rd4, %rd43; ld.global.nc.f32 %f268, [%rd46]; BB0_25: mul.f32 %f149, %f267, %f267; fma.rn.f32 %f150, %f266, %f266, %f149; fma.rn.f32 %f36, %f268, %f268, %f150; setp.eq.f32 %p31, %f36, 0f00000000; mov.u16 %rs39, %rs1; @%p31 bra BB0_27; cvt.s64.s32 %rd47, %r24; add.s64 %rd48, %rd3, %rd47; ld.global.nc.u8 %rs39, [%rd48]; BB0_27: setp.gt.u16 %p32, %rs39, %rs1; cvt.u32.u16 %r81, %rs39; and.b32 %r82, %r81, 255; selp.b32 %r83, %r7, %r82, %p32; selp.b32 %r84, %r82, %r7, %p32; add.s32 %r85, %r84, 1; mul.lo.s32 %r86, %r85, %r84; shr.u32 %r87, %r86, 1; add.s32 %r25, %r87, %r83; and.pred %p35, %p31, %p13; @%p35 bra BB0_29; mul.wide.s32 %rd49, %r25, 4; add.s64 %rd50, %rd2, %rd49; ld.global.nc.f32 %f151, [%rd50]; add.f32 %f152, %f151, %f151; add.s64 %rd51, %rd1, %rd49; ld.global.nc.f32 %f153, [%rd51]; div.rn.f32 %f154, %f153, %f152; mul.f32 %f155, %f154, %f88; mul.f32 %f156, %f3, %f155; sub.f32 %f157, %f1, %f156; fma.rn.f32 %f158, %f1, %f155, %f3; selp.f32 %f159, %f157, %f266, %p31; selp.f32 %f160, %f2, %f267, %p31; selp.f32 %f161, %f158, %f268, %p31; 
mul.f32 %f162, %f88, %f88; div.rn.f32 %f163, %f152, %f162; sub.f32 %f164, %f159, %f1; sub.f32 %f165, %f160, %f2; sub.f32 %f166, %f161, %f3; fma.rn.f32 %f167, %f164, %f163, %f263; fma.rn.f32 %f264, %f165, %f163, %f264; fma.rn.f32 %f168, %f166, %f163, %f265; div.rn.f32 %f169, %f153, %f88; fma.rn.f32 %f263, %f161, %f169, %f167; mul.f32 %f170, %f159, %f169; sub.f32 %f265, %f168, %f170; BB0_29: add.s32 %r26, %r2, 1; @%p27 bra BB0_31; rem.s32 %r88, %r26, %r44; add.s32 %r89, %r88, %r44; rem.s32 %r125, %r89, %r44; bra.uni BB0_32; BB0_31: add.s32 %r90, %r44, -1; min.s32 %r125, %r26, %r90; BB0_32: add.s32 %r91, %r125, %r4; mad.lo.s32 %r30, %r91, %r43, %r1; setp.ge.s32 %p38, %r26, %r44; mov.f32 %f272, 0f00000000; and.pred %p40, %p38, %p27; mov.f32 %f273, %f272; mov.f32 %f274, %f272; @%p40 bra BB0_34; mul.wide.s32 %rd52, %r30, 4; add.s64 %rd53, %rd6, %rd52; ld.global.nc.f32 %f272, [%rd53]; add.s64 %rd54, %rd5, %rd52; ld.global.nc.f32 %f273, [%rd54]; add.s64 %rd55, %rd4, %rd52; ld.global.nc.f32 %f274, [%rd55]; BB0_34: mul.f32 %f174, %f273, %f273; fma.rn.f32 %f175, %f272, %f272, %f174; fma.rn.f32 %f49, %f274, %f274, %f175; setp.eq.f32 %p41, %f49, 0f00000000; mov.u16 %rs40, %rs1; @%p41 bra BB0_36; cvt.s64.s32 %rd56, %r30; add.s64 %rd57, %rd3, %rd56; ld.global.nc.u8 %rs40, [%rd57]; BB0_36: setp.gt.u16 %p42, %rs40, %rs1; cvt.u32.u16 %r92, %rs40; and.b32 %r93, %r92, 255; selp.b32 %r94, %r7, %r93, %p42; selp.b32 %r95, %r93, %r7, %p42; add.s32 %r96, %r95, 1; mul.lo.s32 %r97, %r96, %r95; shr.u32 %r98, %r97, 1; add.s32 %r31, %r98, %r94; and.pred %p45, %p41, %p13; @%p45 bra BB0_38; mul.wide.s32 %rd58, %r31, 4; add.s64 %rd59, %rd2, %rd58; ld.global.nc.f32 %f176, [%rd59]; add.f32 %f177, %f176, %f176; add.s64 %rd60, %rd1, %rd58; ld.global.nc.f32 %f178, [%rd60]; div.rn.f32 %f179, %f178, %f177; mul.f32 %f180, %f179, %f88; fma.rn.f32 %f181, %f3, %f180, %f1; mul.f32 %f182, %f1, %f180; sub.f32 %f183, %f3, %f182; selp.f32 %f184, %f181, %f272, %p41; selp.f32 %f185, %f2, %f273, %p41; selp.f32 
%f186, %f183, %f274, %p41; mul.f32 %f187, %f88, %f88; div.rn.f32 %f188, %f177, %f187; sub.f32 %f189, %f184, %f1; sub.f32 %f190, %f185, %f2; sub.f32 %f191, %f186, %f3; fma.rn.f32 %f192, %f189, %f188, %f263; fma.rn.f32 %f264, %f190, %f188, %f264; fma.rn.f32 %f193, %f191, %f188, %f265; div.rn.f32 %f194, %f178, %f88; mul.f32 %f195, %f186, %f194; sub.f32 %f263, %f192, %f195; fma.rn.f32 %f265, %f184, %f194, %f193; BB0_38: setp.eq.s32 %p47, %r45, 1; @%p47 bra BB0_57; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p48, %rs12, 0; add.s32 %r32, %r3, -1; @%p48 bra BB0_41; rem.s32 %r99, %r32, %r45; add.s32 %r100, %r99, %r45; rem.s32 %r126, %r100, %r45; bra.uni BB0_42; BB0_41: mov.u32 %r101, 0; max.s32 %r126, %r32, %r101; BB0_42: mad.lo.s32 %r102, %r126, %r44, %r2; mad.lo.s32 %r36, %r102, %r43, %r1; setp.lt.s32 %p50, %r32, 0; mov.f32 %f278, 0f00000000; and.pred %p51, %p50, %p48; mov.f32 %f279, %f278; mov.f32 %f280, %f278; @%p51 bra BB0_44; mul.wide.s32 %rd61, %r36, 4; add.s64 %rd62, %rd6, %rd61; ld.global.nc.f32 %f278, [%rd62]; add.s64 %rd63, %rd5, %rd61; ld.global.nc.f32 %f279, [%rd63]; add.s64 %rd64, %rd4, %rd61; ld.global.nc.f32 %f280, [%rd64]; BB0_44: mul.f32 %f199, %f279, %f279; fma.rn.f32 %f200, %f278, %f278, %f199; fma.rn.f32 %f62, %f280, %f280, %f200; setp.eq.f32 %p52, %f62, 0f00000000; mov.u16 %rs41, %rs1; @%p52 bra BB0_46; cvt.s64.s32 %rd65, %r36; add.s64 %rd66, %rd3, %rd65; ld.global.nc.u8 %rs41, [%rd66]; BB0_46: setp.gt.u16 %p53, %rs41, %rs1; cvt.u32.u16 %r103, %rs41; and.b32 %r104, %r103, 255; selp.b32 %r105, %r7, %r104, %p53; selp.b32 %r106, %r104, %r7, %p53; add.s32 %r107, %r106, 1; mul.lo.s32 %r108, %r107, %r106; shr.u32 %r109, %r108, 1; add.s32 %r37, %r109, %r105; and.pred %p56, %p52, %p13; @%p56 bra BB0_48; mul.wide.s32 %rd67, %r37, 4; add.s64 %rd68, %rd2, %rd67; ld.global.nc.f32 %f201, [%rd68]; add.f32 %f202, %f201, %f201; add.s64 %rd69, %rd1, %rd67; ld.global.nc.f32 %f203, [%rd69]; div.rn.f32 %f204, %f203, %f202; mul.f32 %f205, %f204, %f89; fma.rn.f32 %f206, %f2, 
%f205, %f1; mul.f32 %f207, %f1, %f205; sub.f32 %f208, %f2, %f207; selp.f32 %f209, %f206, %f278, %p52; selp.f32 %f210, %f208, %f279, %p52; selp.f32 %f211, %f3, %f280, %p52; mul.f32 %f212, %f89, %f89; div.rn.f32 %f213, %f202, %f212; sub.f32 %f214, %f209, %f1; sub.f32 %f215, %f210, %f2; sub.f32 %f216, %f211, %f3; fma.rn.f32 %f217, %f214, %f213, %f263; fma.rn.f32 %f218, %f215, %f213, %f264; fma.rn.f32 %f265, %f216, %f213, %f265; div.rn.f32 %f219, %f203, %f89; mul.f32 %f220, %f210, %f219; sub.f32 %f263, %f217, %f220; fma.rn.f32 %f264, %f209, %f219, %f218; BB0_48: add.s32 %r38, %r3, 1; @%p48 bra BB0_50; rem.s32 %r110, %r38, %r45; add.s32 %r111, %r110, %r45; rem.s32 %r127, %r111, %r45; bra.uni BB0_51; BB0_50: add.s32 %r112, %r45, -1; min.s32 %r127, %r38, %r112; BB0_51: mad.lo.s32 %r113, %r127, %r44, %r2; mad.lo.s32 %r42, %r113, %r43, %r1; setp.ge.s32 %p59, %r38, %r45; mov.f32 %f284, 0f00000000; and.pred %p61, %p59, %p48; mov.f32 %f285, %f284; mov.f32 %f286, %f284; @%p61 bra BB0_53; mul.wide.s32 %rd70, %r42, 4; add.s64 %rd71, %rd6, %rd70; ld.global.nc.f32 %f286, [%rd71]; add.s64 %rd72, %rd5, %rd70; ld.global.nc.f32 %f285, [%rd72]; add.s64 %rd73, %rd4, %rd70; ld.global.nc.f32 %f284, [%rd73]; BB0_53: mul.f32 %f224, %f286, %f286; fma.rn.f32 %f225, %f285, %f285, %f224; fma.rn.f32 %f75, %f284, %f284, %f225; setp.eq.f32 %p62, %f75, 0f00000000; mov.u16 %rs42, %rs1; @%p62 bra BB0_55; cvt.s64.s32 %rd74, %r42; add.s64 %rd75, %rd3, %rd74; ld.global.nc.u8 %rs42, [%rd75]; BB0_55: setp.gt.u16 %p63, %rs42, %rs1; cvt.u32.u16 %r114, %rs42; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p63; selp.b32 %r117, %r115, %r7, %p63; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd76, %r121, 4; add.s64 %rd7, %rd2, %rd76; add.s64 %rd8, %rd1, %rd76; and.pred %p66, %p62, %p13; @%p66 bra BB0_57; ld.global.nc.f32 %f226, [%rd7]; add.f32 %f227, %f226, %f226; ld.global.nc.f32 %f228, [%rd8]; div.rn.f32 %f229, %f228, 
%f227; mul.f32 %f230, %f229, %f89; mul.f32 %f231, %f2, %f230; sub.f32 %f232, %f1, %f231; fma.rn.f32 %f233, %f1, %f230, %f2; selp.f32 %f234, %f3, %f284, %p62; selp.f32 %f235, %f233, %f285, %p62; selp.f32 %f236, %f232, %f286, %p62; mul.f32 %f237, %f89, %f89; div.rn.f32 %f238, %f227, %f237; sub.f32 %f239, %f236, %f1; sub.f32 %f240, %f235, %f2; sub.f32 %f241, %f234, %f3; fma.rn.f32 %f242, %f239, %f238, %f263; fma.rn.f32 %f243, %f240, %f238, %f264; fma.rn.f32 %f265, %f241, %f238, %f265; div.rn.f32 %f244, %f228, %f89; fma.rn.f32 %f263, %f235, %f244, %f242; mul.f32 %f245, %f236, %f244; sub.f32 %f264, %f243, %f245; BB0_57: setp.eq.s64 %p68, %rd12, 0; @%p68 bra BB0_59; cvta.to.global.u64 %rd77, %rd12; add.s64 %rd79, %rd77, %rd19; ld.global.nc.f32 %f246, [%rd79]; mul.f32 %f290, %f246, %f290; BB0_59: setp.eq.f32 %p69, %f290, 0f00000000; mov.f32 %f291, 0f00000000; @%p69 bra BB0_61; rcp.rn.f32 %f291, %f290; BB0_61: cvta.to.global.u64 %rd80, %rd11; cvta.to.global.u64 %rd81, %rd10; cvta.to.global.u64 %rd82, %rd9; add.s64 %rd84, %rd82, %rd19; ld.global.f32 %f248, [%rd84]; fma.rn.f32 %f249, %f263, %f291, %f248; st.global.f32 [%rd84], %f249; add.s64 %rd85, %rd81, %rd19; ld.global.f32 %f250, [%rd85]; fma.rn.f32 %f251, %f264, %f291, %f250; st.global.f32 [%rd85], %f251; add.s64 %rd86, %rd80, %rd19; ld.global.f32 %f252, [%rd86]; fma.rn.f32 %f253, %f265, %f291, %f252; st.global.f32 [%rd86], %f253; BB0_62: ret; } ` adddmibulk_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl adddmibulk .visible .entry adddmibulk( .param .u64 adddmibulk_param_0, .param .u64 adddmibulk_param_1, .param .u64 adddmibulk_param_2, .param .u64 adddmibulk_param_3, .param .u64 adddmibulk_param_4, .param .u64 adddmibulk_param_5, .param .u64 adddmibulk_param_6, .param .f32 adddmibulk_param_7, .param .u64 adddmibulk_param_8, .param .u64 adddmibulk_param_9, .param .u64 adddmibulk_param_10, .param .f32 adddmibulk_param_11, .param .f32 adddmibulk_param_12, .param .f32 adddmibulk_param_13, .param .u32 
adddmibulk_param_14, .param .u32 adddmibulk_param_15, .param .u32 adddmibulk_param_16, .param .u8 adddmibulk_param_17, .param .u8 adddmibulk_param_18 ) { .reg .pred %p<70>; .reg .b16 %rs<43>; .reg .f32 %f<292>; .reg .b32 %r<128>; .reg .b64 %rd<87>; ld.param.u64 %rd9, [adddmibulk_param_0]; ld.param.u64 %rd10, [adddmibulk_param_1]; ld.param.u64 %rd11, [adddmibulk_param_2]; ld.param.u64 %rd13, [adddmibulk_param_3]; ld.param.u64 %rd14, [adddmibulk_param_4]; ld.param.u64 %rd15, [adddmibulk_param_5]; ld.param.u64 %rd12, [adddmibulk_param_6]; ld.param.f32 %f290, [adddmibulk_param_7]; ld.param.u64 %rd16, [adddmibulk_param_8]; ld.param.u64 %rd17, [adddmibulk_param_9]; ld.param.u64 %rd18, [adddmibulk_param_10]; ld.param.f32 %f87, [adddmibulk_param_11]; ld.param.f32 %f88, [adddmibulk_param_12]; ld.param.f32 %f89, [adddmibulk_param_13]; ld.param.u32 %r43, [adddmibulk_param_14]; ld.param.u32 %r44, [adddmibulk_param_15]; ld.param.u32 %r45, [adddmibulk_param_16]; ld.param.u8 %rs18, [adddmibulk_param_18]; ld.param.u8 %rs17, [adddmibulk_param_17]; cvta.to.global.u64 %rd1, %rd17; cvta.to.global.u64 %rd2, %rd16; cvta.to.global.u64 %rd3, %rd18; cvta.to.global.u64 %rd4, %rd15; cvta.to.global.u64 %rd5, %rd14; cvta.to.global.u64 %rd6, %rd13; mov.u32 %r46, %ntid.x; mov.u32 %r47, %ctaid.x; mov.u32 %r48, %tid.x; mad.lo.s32 %r1, %r46, %r47, %r48; mov.u32 %r49, %ntid.y; mov.u32 %r50, %ctaid.y; mov.u32 %r51, %tid.y; mad.lo.s32 %r2, %r49, %r50, %r51; mov.u32 %r52, %ntid.z; mov.u32 %r53, %ctaid.z; mov.u32 %r54, %tid.z; mad.lo.s32 %r3, %r52, %r53, %r54; setp.ge.s32 %p1, %r2, %r44; setp.ge.s32 %p2, %r1, %r43; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r45; or.pred %p5, %p3, %p4; @%p5 bra BB0_62; mul.lo.s32 %r4, %r3, %r44; add.s32 %r55, %r4, %r2; mul.lo.s32 %r5, %r55, %r43; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd19, %r6, 4; add.s64 %rd20, %rd6, %rd19; cvt.s64.s32 %rd21, %r6; add.s64 %rd22, %rd5, %rd19; add.s64 %rd23, %rd4, %rd19; add.s64 %rd24, %rd3, %rd21; ld.global.nc.u8 %rs1, [%rd24]; 
cvt.u32.u16 %r56, %rs1; and.b32 %r7, %r56, 255; ld.global.nc.f32 %f1, [%rd20]; ld.global.nc.f32 %f2, [%rd22]; mul.f32 %f90, %f2, %f2; fma.rn.f32 %f91, %f1, %f1, %f90; ld.global.nc.f32 %f3, [%rd23]; fma.rn.f32 %f92, %f3, %f3, %f91; setp.eq.f32 %p6, %f92, 0f00000000; @%p6 bra BB0_62; and.b16 %rs2, %rs17, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r57, %r8, %r43; add.s32 %r58, %r57, %r43; rem.s32 %r122, %r58, %r43; bra.uni BB0_5; BB0_4: mov.u32 %r59, 0; max.s32 %r122, %r8, %r59; BB0_5: add.s32 %r12, %r122, %r5; setp.lt.s32 %p9, %r8, 0; mov.f32 %f254, 0f00000000; and.pred %p10, %p9, %p7; mov.f32 %f255, %f254; mov.f32 %f256, %f254; @%p10 bra BB0_7; mul.wide.s32 %rd25, %r12, 4; add.s64 %rd26, %rd6, %rd25; ld.global.nc.f32 %f254, [%rd26]; add.s64 %rd27, %rd5, %rd25; ld.global.nc.f32 %f255, [%rd27]; add.s64 %rd28, %rd4, %rd25; ld.global.nc.f32 %f256, [%rd28]; BB0_7: mul.f32 %f96, %f255, %f255; fma.rn.f32 %f97, %f254, %f254, %f96; fma.rn.f32 %f10, %f256, %f256, %f97; setp.eq.f32 %p11, %f10, 0f00000000; mov.u16 %rs37, %rs1; @%p11 bra BB0_9; cvt.s64.s32 %rd29, %r12; add.s64 %rd30, %rd3, %rd29; ld.global.nc.u8 %rs37, [%rd30]; BB0_9: setp.gt.u16 %p12, %rs37, %rs1; cvt.u32.u16 %r60, %rs37; and.b32 %r61, %r60, 255; selp.b32 %r62, %r7, %r61, %p12; selp.b32 %r63, %r61, %r7, %p12; add.s32 %r64, %r63, 1; mul.lo.s32 %r65, %r64, %r63; shr.u32 %r66, %r65, 1; add.s32 %r13, %r66, %r62; setp.ne.s16 %p13, %rs18, 0; mov.f32 %f263, 0f00000000; and.pred %p15, %p11, %p13; mov.f32 %f264, %f263; mov.f32 %f265, %f263; @%p15 bra BB0_11; mul.wide.s32 %rd31, %r13, 4; add.s64 %rd32, %rd2, %rd31; ld.global.nc.f32 %f101, [%rd32]; add.f32 %f102, %f101, %f101; add.s64 %rd33, %rd1, %rd31; ld.global.nc.f32 %f103, [%rd33]; div.rn.f32 %f104, %f103, %f102; mul.f32 %f105, %f104, %f87; fma.rn.f32 %f106, %f3, %f105, %f2; mul.f32 %f107, %f2, %f105; sub.f32 %f108, %f3, %f107; selp.f32 %f109, %f1, %f254, %p11; selp.f32 %f110, %f106, %f255, %p11; selp.f32 %f111, %f108, %f256, %p11; 
mul.f32 %f112, %f87, %f87; div.rn.f32 %f113, %f102, %f112; sub.f32 %f114, %f109, %f1; sub.f32 %f115, %f110, %f2; sub.f32 %f116, %f111, %f3; fma.rn.f32 %f263, %f114, %f113, 0f00000000; fma.rn.f32 %f117, %f115, %f113, 0f00000000; fma.rn.f32 %f118, %f116, %f113, 0f00000000; div.rn.f32 %f119, %f103, %f87; mul.f32 %f120, %f111, %f119; sub.f32 %f264, %f117, %f120; fma.rn.f32 %f265, %f110, %f119, %f118; BB0_11: add.s32 %r14, %r1, 1; @%p7 bra BB0_13; rem.s32 %r67, %r14, %r43; add.s32 %r68, %r67, %r43; rem.s32 %r123, %r68, %r43; bra.uni BB0_14; BB0_13: add.s32 %r69, %r43, -1; min.s32 %r123, %r14, %r69; BB0_14: add.s32 %r18, %r123, %r5; setp.ge.s32 %p18, %r14, %r43; mov.f32 %f260, 0f00000000; and.pred %p20, %p18, %p7; mov.f32 %f261, %f260; mov.f32 %f262, %f260; @%p20 bra BB0_16; mul.wide.s32 %rd34, %r18, 4; add.s64 %rd35, %rd6, %rd34; ld.global.nc.f32 %f260, [%rd35]; add.s64 %rd36, %rd5, %rd34; ld.global.nc.f32 %f261, [%rd36]; add.s64 %rd37, %rd4, %rd34; ld.global.nc.f32 %f262, [%rd37]; BB0_16: mul.f32 %f124, %f261, %f261; fma.rn.f32 %f125, %f260, %f260, %f124; fma.rn.f32 %f23, %f262, %f262, %f125; setp.eq.f32 %p21, %f23, 0f00000000; mov.u16 %rs38, %rs1; @%p21 bra BB0_18; cvt.s64.s32 %rd38, %r18; add.s64 %rd39, %rd3, %rd38; ld.global.nc.u8 %rs38, [%rd39]; BB0_18: setp.gt.u16 %p22, %rs38, %rs1; cvt.u32.u16 %r70, %rs38; and.b32 %r71, %r70, 255; selp.b32 %r72, %r7, %r71, %p22; selp.b32 %r73, %r71, %r7, %p22; add.s32 %r74, %r73, 1; mul.lo.s32 %r75, %r74, %r73; shr.u32 %r76, %r75, 1; add.s32 %r19, %r76, %r72; and.pred %p25, %p21, %p13; @%p25 bra BB0_20; mul.wide.s32 %rd40, %r19, 4; add.s64 %rd41, %rd2, %rd40; ld.global.nc.f32 %f126, [%rd41]; add.f32 %f127, %f126, %f126; add.s64 %rd42, %rd1, %rd40; ld.global.nc.f32 %f128, [%rd42]; div.rn.f32 %f129, %f128, %f127; mul.f32 %f130, %f129, %f87; mul.f32 %f131, %f3, %f130; sub.f32 %f132, %f2, %f131; fma.rn.f32 %f133, %f2, %f130, %f3; selp.f32 %f134, %f1, %f260, %p21; selp.f32 %f135, %f132, %f261, %p21; selp.f32 %f136, %f133, %f262, %p21; 
mul.f32 %f137, %f87, %f87; div.rn.f32 %f138, %f127, %f137; sub.f32 %f139, %f134, %f1; sub.f32 %f140, %f135, %f2; sub.f32 %f141, %f136, %f3; fma.rn.f32 %f263, %f139, %f138, %f263; fma.rn.f32 %f142, %f140, %f138, %f264; fma.rn.f32 %f143, %f141, %f138, %f265; div.rn.f32 %f144, %f128, %f87; fma.rn.f32 %f264, %f136, %f144, %f142; mul.f32 %f145, %f135, %f144; sub.f32 %f265, %f143, %f145; BB0_20: and.b16 %rs7, %rs17, 2; setp.eq.s16 %p27, %rs7, 0; add.s32 %r20, %r2, -1; @%p27 bra BB0_22; rem.s32 %r77, %r20, %r44; add.s32 %r78, %r77, %r44; rem.s32 %r124, %r78, %r44; bra.uni BB0_23; BB0_22: mov.u32 %r79, 0; max.s32 %r124, %r20, %r79; BB0_23: add.s32 %r80, %r124, %r4; mad.lo.s32 %r24, %r80, %r43, %r1; setp.lt.s32 %p29, %r20, 0; mov.f32 %f266, 0f00000000; and.pred %p30, %p29, %p27; mov.f32 %f267, %f266; mov.f32 %f268, %f266; @%p30 bra BB0_25; mul.wide.s32 %rd43, %r24, 4; add.s64 %rd44, %rd6, %rd43; ld.global.nc.f32 %f266, [%rd44]; add.s64 %rd45, %rd5, %rd43; ld.global.nc.f32 %f267, [%rd45]; add.s64 %rd46, %rd4, %rd43; ld.global.nc.f32 %f268, [%rd46]; BB0_25: mul.f32 %f149, %f267, %f267; fma.rn.f32 %f150, %f266, %f266, %f149; fma.rn.f32 %f36, %f268, %f268, %f150; setp.eq.f32 %p31, %f36, 0f00000000; mov.u16 %rs39, %rs1; @%p31 bra BB0_27; cvt.s64.s32 %rd47, %r24; add.s64 %rd48, %rd3, %rd47; ld.global.nc.u8 %rs39, [%rd48]; BB0_27: setp.gt.u16 %p32, %rs39, %rs1; cvt.u32.u16 %r81, %rs39; and.b32 %r82, %r81, 255; selp.b32 %r83, %r7, %r82, %p32; selp.b32 %r84, %r82, %r7, %p32; add.s32 %r85, %r84, 1; mul.lo.s32 %r86, %r85, %r84; shr.u32 %r87, %r86, 1; add.s32 %r25, %r87, %r83; and.pred %p35, %p31, %p13; @%p35 bra BB0_29; mul.wide.s32 %rd49, %r25, 4; add.s64 %rd50, %rd2, %rd49; ld.global.nc.f32 %f151, [%rd50]; add.f32 %f152, %f151, %f151; add.s64 %rd51, %rd1, %rd49; ld.global.nc.f32 %f153, [%rd51]; div.rn.f32 %f154, %f153, %f152; mul.f32 %f155, %f154, %f88; mul.f32 %f156, %f3, %f155; sub.f32 %f157, %f1, %f156; fma.rn.f32 %f158, %f1, %f155, %f3; selp.f32 %f159, %f157, %f266, %p31; 
selp.f32 %f160, %f2, %f267, %p31; selp.f32 %f161, %f158, %f268, %p31; mul.f32 %f162, %f88, %f88; div.rn.f32 %f163, %f152, %f162; sub.f32 %f164, %f159, %f1; sub.f32 %f165, %f160, %f2; sub.f32 %f166, %f161, %f3; fma.rn.f32 %f167, %f164, %f163, %f263; fma.rn.f32 %f264, %f165, %f163, %f264; fma.rn.f32 %f168, %f166, %f163, %f265; div.rn.f32 %f169, %f153, %f88; fma.rn.f32 %f263, %f161, %f169, %f167; mul.f32 %f170, %f159, %f169; sub.f32 %f265, %f168, %f170; BB0_29: add.s32 %r26, %r2, 1; @%p27 bra BB0_31; rem.s32 %r88, %r26, %r44; add.s32 %r89, %r88, %r44; rem.s32 %r125, %r89, %r44; bra.uni BB0_32; BB0_31: add.s32 %r90, %r44, -1; min.s32 %r125, %r26, %r90; BB0_32: add.s32 %r91, %r125, %r4; mad.lo.s32 %r30, %r91, %r43, %r1; setp.ge.s32 %p38, %r26, %r44; mov.f32 %f272, 0f00000000; and.pred %p40, %p38, %p27; mov.f32 %f273, %f272; mov.f32 %f274, %f272; @%p40 bra BB0_34; mul.wide.s32 %rd52, %r30, 4; add.s64 %rd53, %rd6, %rd52; ld.global.nc.f32 %f272, [%rd53]; add.s64 %rd54, %rd5, %rd52; ld.global.nc.f32 %f273, [%rd54]; add.s64 %rd55, %rd4, %rd52; ld.global.nc.f32 %f274, [%rd55]; BB0_34: mul.f32 %f174, %f273, %f273; fma.rn.f32 %f175, %f272, %f272, %f174; fma.rn.f32 %f49, %f274, %f274, %f175; setp.eq.f32 %p41, %f49, 0f00000000; mov.u16 %rs40, %rs1; @%p41 bra BB0_36; cvt.s64.s32 %rd56, %r30; add.s64 %rd57, %rd3, %rd56; ld.global.nc.u8 %rs40, [%rd57]; BB0_36: setp.gt.u16 %p42, %rs40, %rs1; cvt.u32.u16 %r92, %rs40; and.b32 %r93, %r92, 255; selp.b32 %r94, %r7, %r93, %p42; selp.b32 %r95, %r93, %r7, %p42; add.s32 %r96, %r95, 1; mul.lo.s32 %r97, %r96, %r95; shr.u32 %r98, %r97, 1; add.s32 %r31, %r98, %r94; and.pred %p45, %p41, %p13; @%p45 bra BB0_38; mul.wide.s32 %rd58, %r31, 4; add.s64 %rd59, %rd2, %rd58; ld.global.nc.f32 %f176, [%rd59]; add.f32 %f177, %f176, %f176; add.s64 %rd60, %rd1, %rd58; ld.global.nc.f32 %f178, [%rd60]; div.rn.f32 %f179, %f178, %f177; mul.f32 %f180, %f179, %f88; fma.rn.f32 %f181, %f3, %f180, %f1; mul.f32 %f182, %f1, %f180; sub.f32 %f183, %f3, %f182; selp.f32 
%f184, %f181, %f272, %p41; selp.f32 %f185, %f2, %f273, %p41; selp.f32 %f186, %f183, %f274, %p41; mul.f32 %f187, %f88, %f88; div.rn.f32 %f188, %f177, %f187; sub.f32 %f189, %f184, %f1; sub.f32 %f190, %f185, %f2; sub.f32 %f191, %f186, %f3; fma.rn.f32 %f192, %f189, %f188, %f263; fma.rn.f32 %f264, %f190, %f188, %f264; fma.rn.f32 %f193, %f191, %f188, %f265; div.rn.f32 %f194, %f178, %f88; mul.f32 %f195, %f186, %f194; sub.f32 %f263, %f192, %f195; fma.rn.f32 %f265, %f184, %f194, %f193; BB0_38: setp.eq.s32 %p47, %r45, 1; @%p47 bra BB0_57; and.b16 %rs12, %rs17, 4; setp.eq.s16 %p48, %rs12, 0; add.s32 %r32, %r3, -1; @%p48 bra BB0_41; rem.s32 %r99, %r32, %r45; add.s32 %r100, %r99, %r45; rem.s32 %r126, %r100, %r45; bra.uni BB0_42; BB0_41: mov.u32 %r101, 0; max.s32 %r126, %r32, %r101; BB0_42: mad.lo.s32 %r102, %r126, %r44, %r2; mad.lo.s32 %r36, %r102, %r43, %r1; setp.lt.s32 %p50, %r32, 0; mov.f32 %f278, 0f00000000; and.pred %p51, %p50, %p48; mov.f32 %f279, %f278; mov.f32 %f280, %f278; @%p51 bra BB0_44; mul.wide.s32 %rd61, %r36, 4; add.s64 %rd62, %rd6, %rd61; ld.global.nc.f32 %f278, [%rd62]; add.s64 %rd63, %rd5, %rd61; ld.global.nc.f32 %f279, [%rd63]; add.s64 %rd64, %rd4, %rd61; ld.global.nc.f32 %f280, [%rd64]; BB0_44: mul.f32 %f199, %f279, %f279; fma.rn.f32 %f200, %f278, %f278, %f199; fma.rn.f32 %f62, %f280, %f280, %f200; setp.eq.f32 %p52, %f62, 0f00000000; mov.u16 %rs41, %rs1; @%p52 bra BB0_46; cvt.s64.s32 %rd65, %r36; add.s64 %rd66, %rd3, %rd65; ld.global.nc.u8 %rs41, [%rd66]; BB0_46: setp.gt.u16 %p53, %rs41, %rs1; cvt.u32.u16 %r103, %rs41; and.b32 %r104, %r103, 255; selp.b32 %r105, %r7, %r104, %p53; selp.b32 %r106, %r104, %r7, %p53; add.s32 %r107, %r106, 1; mul.lo.s32 %r108, %r107, %r106; shr.u32 %r109, %r108, 1; add.s32 %r37, %r109, %r105; and.pred %p56, %p52, %p13; @%p56 bra BB0_48; mul.wide.s32 %rd67, %r37, 4; add.s64 %rd68, %rd2, %rd67; ld.global.nc.f32 %f201, [%rd68]; add.f32 %f202, %f201, %f201; add.s64 %rd69, %rd1, %rd67; ld.global.nc.f32 %f203, [%rd69]; div.rn.f32 
%f204, %f203, %f202; mul.f32 %f205, %f204, %f89; fma.rn.f32 %f206, %f2, %f205, %f1; mul.f32 %f207, %f1, %f205; sub.f32 %f208, %f2, %f207; selp.f32 %f209, %f206, %f278, %p52; selp.f32 %f210, %f208, %f279, %p52; selp.f32 %f211, %f3, %f280, %p52; mul.f32 %f212, %f89, %f89; div.rn.f32 %f213, %f202, %f212; sub.f32 %f214, %f209, %f1; sub.f32 %f215, %f210, %f2; sub.f32 %f216, %f211, %f3; fma.rn.f32 %f217, %f214, %f213, %f263; fma.rn.f32 %f218, %f215, %f213, %f264; fma.rn.f32 %f265, %f216, %f213, %f265; div.rn.f32 %f219, %f203, %f89; mul.f32 %f220, %f210, %f219; sub.f32 %f263, %f217, %f220; fma.rn.f32 %f264, %f209, %f219, %f218; BB0_48: add.s32 %r38, %r3, 1; @%p48 bra BB0_50; rem.s32 %r110, %r38, %r45; add.s32 %r111, %r110, %r45; rem.s32 %r127, %r111, %r45; bra.uni BB0_51; BB0_50: add.s32 %r112, %r45, -1; min.s32 %r127, %r38, %r112; BB0_51: mad.lo.s32 %r113, %r127, %r44, %r2; mad.lo.s32 %r42, %r113, %r43, %r1; setp.ge.s32 %p59, %r38, %r45; mov.f32 %f284, 0f00000000; and.pred %p61, %p59, %p48; mov.f32 %f285, %f284; mov.f32 %f286, %f284; @%p61 bra BB0_53; mul.wide.s32 %rd70, %r42, 4; add.s64 %rd71, %rd6, %rd70; ld.global.nc.f32 %f286, [%rd71]; add.s64 %rd72, %rd5, %rd70; ld.global.nc.f32 %f285, [%rd72]; add.s64 %rd73, %rd4, %rd70; ld.global.nc.f32 %f284, [%rd73]; BB0_53: mul.f32 %f224, %f286, %f286; fma.rn.f32 %f225, %f285, %f285, %f224; fma.rn.f32 %f75, %f284, %f284, %f225; setp.eq.f32 %p62, %f75, 0f00000000; mov.u16 %rs42, %rs1; @%p62 bra BB0_55; cvt.s64.s32 %rd74, %r42; add.s64 %rd75, %rd3, %rd74; ld.global.nc.u8 %rs42, [%rd75]; BB0_55: setp.gt.u16 %p63, %rs42, %rs1; cvt.u32.u16 %r114, %rs42; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p63; selp.b32 %r117, %r115, %r7, %p63; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd76, %r121, 4; add.s64 %rd7, %rd2, %rd76; add.s64 %rd8, %rd1, %rd76; and.pred %p66, %p62, %p13; @%p66 bra BB0_57; ld.global.nc.f32 %f226, [%rd7]; add.f32 %f227, 
%f226, %f226; ld.global.nc.f32 %f228, [%rd8]; div.rn.f32 %f229, %f228, %f227; mul.f32 %f230, %f229, %f89; mul.f32 %f231, %f2, %f230; sub.f32 %f232, %f1, %f231; fma.rn.f32 %f233, %f1, %f230, %f2; selp.f32 %f234, %f3, %f284, %p62; selp.f32 %f235, %f233, %f285, %p62; selp.f32 %f236, %f232, %f286, %p62; mul.f32 %f237, %f89, %f89; div.rn.f32 %f238, %f227, %f237; sub.f32 %f239, %f236, %f1; sub.f32 %f240, %f235, %f2; sub.f32 %f241, %f234, %f3; fma.rn.f32 %f242, %f239, %f238, %f263; fma.rn.f32 %f243, %f240, %f238, %f264; fma.rn.f32 %f265, %f241, %f238, %f265; div.rn.f32 %f244, %f228, %f89; fma.rn.f32 %f263, %f235, %f244, %f242; mul.f32 %f245, %f236, %f244; sub.f32 %f264, %f243, %f245; BB0_57: setp.eq.s64 %p68, %rd12, 0; @%p68 bra BB0_59; cvta.to.global.u64 %rd77, %rd12; add.s64 %rd79, %rd77, %rd19; ld.global.nc.f32 %f246, [%rd79]; mul.f32 %f290, %f246, %f290; BB0_59: setp.eq.f32 %p69, %f290, 0f00000000; mov.f32 %f291, 0f00000000; @%p69 bra BB0_61; rcp.rn.f32 %f291, %f290; BB0_61: cvta.to.global.u64 %rd80, %rd11; cvta.to.global.u64 %rd81, %rd10; cvta.to.global.u64 %rd82, %rd9; add.s64 %rd84, %rd82, %rd19; ld.global.f32 %f248, [%rd84]; fma.rn.f32 %f249, %f263, %f291, %f248; st.global.f32 [%rd84], %f249; add.s64 %rd85, %rd81, %rd19; ld.global.f32 %f250, [%rd85]; fma.rn.f32 %f251, %f264, %f291, %f250; st.global.f32 [%rd85], %f251; add.s64 %rd86, %rd80, %rd19; ld.global.f32 %f252, [%rd86]; fma.rn.f32 %f253, %f265, %f291, %f252; st.global.f32 [%rd86], %f253; BB0_62: ret; } ` ) mumax3-3.10/cuda/dotproduct.cu000066400000000000000000000010461371432437400163140ustar00rootroot00000000000000 #include "float3.h" // dst += prefactor * dot(a,b) extern "C" __global__ void dotproduct(float* __restrict__ dst, float prefactor, float* __restrict__ ax, float* __restrict__ ay, float* __restrict__ az, float* __restrict__ bx, float* __restrict__ by, float* __restrict__ bz, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { float3 A = {ax[i], ay[i], 
az[i]}; float3 B = {bx[i], by[i], bz[i]}; dst[i] += prefactor * dot(A, B); } } mumax3-3.10/cuda/dotproduct.go000066400000000000000000000010141371432437400163050ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/data" "github.com/mumax/3/util" ) // dst += prefactor * dot(a, b), as used for energy density func AddDotProduct(dst *data.Slice, prefactor float32, a, b *data.Slice) { util.Argument(dst.NComp() == 1 && a.NComp() == 3 && b.NComp() == 3) util.Argument(dst.Len() == a.Len() && dst.Len() == b.Len()) N := dst.Len() cfg := make1DConf(N) k_dotproduct_async(dst.DevPtr(0), prefactor, a.DevPtr(X), a.DevPtr(Y), a.DevPtr(Z), b.DevPtr(X), b.DevPtr(Y), b.DevPtr(Z), N, cfg) } mumax3-3.10/cuda/dotproduct_wrapper.go000066400000000000000000000704321371432437400200570ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for dotproduct kernel var dotproduct_code cu.Function // Stores the arguments for dotproduct kernel invocation type dotproduct_args_t struct { arg_dst unsafe.Pointer arg_prefactor float32 arg_ax unsafe.Pointer arg_ay unsafe.Pointer arg_az unsafe.Pointer arg_bx unsafe.Pointer arg_by unsafe.Pointer arg_bz unsafe.Pointer arg_N int argptr [9]unsafe.Pointer sync.Mutex } // Stores the arguments for dotproduct kernel invocation var dotproduct_args dotproduct_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
dotproduct_args.argptr[0] = unsafe.Pointer(&dotproduct_args.arg_dst) dotproduct_args.argptr[1] = unsafe.Pointer(&dotproduct_args.arg_prefactor) dotproduct_args.argptr[2] = unsafe.Pointer(&dotproduct_args.arg_ax) dotproduct_args.argptr[3] = unsafe.Pointer(&dotproduct_args.arg_ay) dotproduct_args.argptr[4] = unsafe.Pointer(&dotproduct_args.arg_az) dotproduct_args.argptr[5] = unsafe.Pointer(&dotproduct_args.arg_bx) dotproduct_args.argptr[6] = unsafe.Pointer(&dotproduct_args.arg_by) dotproduct_args.argptr[7] = unsafe.Pointer(&dotproduct_args.arg_bz) dotproduct_args.argptr[8] = unsafe.Pointer(&dotproduct_args.arg_N) } // Wrapper for dotproduct CUDA kernel, asynchronous. func k_dotproduct_async(dst unsafe.Pointer, prefactor float32, ax unsafe.Pointer, ay unsafe.Pointer, az unsafe.Pointer, bx unsafe.Pointer, by unsafe.Pointer, bz unsafe.Pointer, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("dotproduct") } dotproduct_args.Lock() defer dotproduct_args.Unlock() if dotproduct_code == 0 { dotproduct_code = fatbinLoad(dotproduct_map, "dotproduct") } dotproduct_args.arg_dst = dst dotproduct_args.arg_prefactor = prefactor dotproduct_args.arg_ax = ax dotproduct_args.arg_ay = ay dotproduct_args.arg_az = az dotproduct_args.arg_bx = bx dotproduct_args.arg_by = by dotproduct_args.arg_bz = bz dotproduct_args.arg_N = N args := dotproduct_args.argptr[:] cu.LaunchKernel(dotproduct_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("dotproduct") } } // maps compute capability on PTX code for dotproduct kernel. 
var dotproduct_map = map[int]string{0: "", 30: dotproduct_ptx_30, 32: dotproduct_ptx_32, 35: dotproduct_ptx_35, 37: dotproduct_ptx_37, 50: dotproduct_ptx_50, 52: dotproduct_ptx_52, 53: dotproduct_ptx_53, 60: dotproduct_ptx_60, 61: dotproduct_ptx_61, 62: dotproduct_ptx_62, 70: dotproduct_ptx_70, 72: dotproduct_ptx_72, 75: dotproduct_ptx_75} // dotproduct PTX code for various compute capabilities. const ( dotproduct_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.f32 %f2, [%rd16]; ld.global.f32 %f3, [%rd10]; ld.global.f32 %f4, [%rd18]; ld.global.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; 
fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.f32 %f8, [%rd20]; ld.global.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; BB0_2: ret; } ` dotproduct_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, 
%f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; BB0_2: ret; } ` dotproduct_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 
%f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; BB0_2: ret; } ` dotproduct_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; BB0_2: ret; } ` dotproduct_ptx_50 = ` .version 6.5 .target 
sm_50 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; BB0_2: ret; } ` dotproduct_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param 
.f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; BB0_2: ret; } ` dotproduct_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 
dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; BB0_2: ret; } ` dotproduct_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 
dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; BB0_2: ret; } ` dotproduct_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; 
.reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; BB0_2: ret; } ` dotproduct_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; 
ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; BB0_2: ret; } ` dotproduct_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, 
[dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; BB0_2: ret; } ` dotproduct_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; 
ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; BB0_2: ret; } ` dotproduct_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl dotproduct .visible .entry dotproduct( .param .u64 dotproduct_param_0, .param .f32 dotproduct_param_1, .param .u64 dotproduct_param_2, .param .u64 dotproduct_param_3, .param .u64 dotproduct_param_4, .param .u64 dotproduct_param_5, .param .u64 dotproduct_param_6, .param .u64 dotproduct_param_7, .param .u32 dotproduct_param_8 ) { .reg .pred %p<2>; .reg .f32 %f<13>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [dotproduct_param_0]; ld.param.f32 %f1, [dotproduct_param_1]; ld.param.u64 %rd2, [dotproduct_param_2]; ld.param.u64 %rd3, [dotproduct_param_3]; ld.param.u64 %rd4, [dotproduct_param_4]; ld.param.u64 %rd5, [dotproduct_param_5]; ld.param.u64 %rd6, [dotproduct_param_6]; ld.param.u64 %rd7, [dotproduct_param_7]; ld.param.u32 %r2, [dotproduct_param_8]; mov.u32 %r3, %ctaid.y; mov.u32 
%r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f2, [%rd16]; ld.global.nc.f32 %f3, [%rd10]; ld.global.nc.f32 %f4, [%rd18]; ld.global.nc.f32 %f5, [%rd12]; mul.f32 %f6, %f5, %f4; fma.rn.f32 %f7, %f3, %f2, %f6; ld.global.nc.f32 %f8, [%rd20]; ld.global.nc.f32 %f9, [%rd14]; fma.rn.f32 %f10, %f9, %f8, %f7; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; ld.global.f32 %f11, [%rd22]; fma.rn.f32 %f12, %f10, %f1, %f11; st.global.f32 [%rd22], %f12; BB0_2: ret; } ` ) mumax3-3.10/cuda/exchange.cu000066400000000000000000000052371371432437400157150ustar00rootroot00000000000000#include #include "exchange.h" #include "float3.h" #include "stencil.h" #include "amul.h" // See exchange.go for more details. 
// Adds the exchange field of each cell to (Bx, By, Bz).
//
// One thread handles one cell (ix, iy, iz); threads outside the Nx*Ny*Nz grid
// and cells with zero magnetization return without writing.
//
//   Bx, By, Bz    field components, accumulated into (read-modify-write)
//   mx, my, mz    magnetization components
//   Ms_, Ms_mul   saturation magnetization, passed on to inv_Msat (amul.h)
//   aLUT2d        exchange stiffness per region pair, addressed through
//                 symidx(regionA, regionB)
//   regions       region number of each cell
//   wx, wy, wz    weight applied to neighbors along x, y and z
//   Nx, Ny, Nz    grid size; also read implicitly by the stencil.h macros
//   PBC           boundary-condition code, read implicitly by the
//                 lclamp*/hclamp* macros
//
// idx, lclamp*, hclamp* (stencil.h), is0 (float3.h) and inv_Msat (amul.h) are
// defined outside this file. Judging by the compiled PTX that ships with the
// wrapper: idx is (iz*Ny + iy)*Nx + ix, is0 tests dot(v, v) == 0, the clamps
// wrap an index modulo N when the axis bit of PBC is set (1: x, 2: y, 4: z)
// and clamp it to [0, N-1] otherwise, and inv_Msat yields 0 when the
// saturation magnetization is 0.
//
// NOTE: the parameter names Nx, Ny, Nz and PBC must not be renamed, the
// macros expand to expressions that use them.
extern "C" __global__ void
addexchange(float* __restrict__ Bx, float* __restrict__ By, float* __restrict__ Bz,
            float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz,
            float* __restrict__ Ms_, float Ms_mul,
            float* __restrict__ aLUT2d, uint8_t* __restrict__ regions,
            float wx, float wy, float wz, int Nx, int Ny, int Nz, uint8_t PBC) {

    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    int iz = blockIdx.z * blockDim.z + threadIdx.z;

    if (ix >= Nx || iy >= Ny || iz >= Nz) {
        return;
    }

    // central cell
    int I = idx(ix, iy, iz);
    float3 m0 = make_float3(mx[I], my[I], mz[I]);

    // empty cell: nothing to add
    if (is0(m0)) {
        return;
    }

    uint8_t r0 = regions[I];
    float3 B  = make_float3(0.0,0.0,0.0);

    int i_;    // neighbor index
    float3 m_; // neighbor mag
    float a__; // inter-cell exchange stiffness

    // left neighbor
    i_  = idx(lclampx(ix-1), iy, iz);           // clamps or wraps index according to PBC
    m_  = make_float3(mx[i_], my[i_], mz[i_]);  // load m
    m_  = ( is0(m_)? m0: m_ );                  // replace missing non-boundary neighbor
    a__ = aLUT2d[symidx(r0, regions[i_])];
    B += wx * a__ *(m_ - m0);

    // right neighbor
    i_  = idx(hclampx(ix+1), iy, iz);
    m_  = make_float3(mx[i_], my[i_], mz[i_]);
    m_  = ( is0(m_)? m0: m_ );
    a__ = aLUT2d[symidx(r0, regions[i_])];
    B += wx * a__ *(m_ - m0);

    // back neighbor
    i_  = idx(ix, lclampy(iy-1), iz);
    m_  = make_float3(mx[i_], my[i_], mz[i_]);
    m_  = ( is0(m_)? m0: m_ );
    a__ = aLUT2d[symidx(r0, regions[i_])];
    B += wy * a__ *(m_ - m0);

    // front neighbor
    i_  = idx(ix, hclampy(iy+1), iz);
    m_  = make_float3(mx[i_], my[i_], mz[i_]);
    m_  = ( is0(m_)? m0: m_ );
    a__ = aLUT2d[symidx(r0, regions[i_])];
    B += wy * a__ *(m_ - m0);

    // only take vertical derivative for 3D sim
    if (Nz != 1) {
        // bottom neighbor
        i_  = idx(ix, iy, lclampz(iz-1));
        m_  = make_float3(mx[i_], my[i_], mz[i_]);
        m_  = ( is0(m_)? m0: m_ );
        a__ = aLUT2d[symidx(r0, regions[i_])];
        B += wz * a__ *(m_ - m0);

        // top neighbor
        i_  = idx(ix, iy, hclampz(iz+1));
        m_  = make_float3(mx[i_], my[i_], mz[i_]);
        m_  = ( is0(m_)? m0: m_ );
        a__ = aLUT2d[symidx(r0, regions[i_])];
        B += wz * a__ *(m_ - m0);
    }

    // scale the accumulated sum by 1/Msat and add it to the field
    float invMs = inv_Msat(Ms_, Ms_mul, I);
    Bx[I] += B.x*invMs;
    By[I] += B.y*invMs;
    Bz[I] += B.z*invMs;
}

mumax3-3.10/cuda/exchange.go000066400000000000000000000022431371432437400157050ustar00rootroot00000000000000
package cuda

import (
	"unsafe"

	"github.com/mumax/3/data"
)

// Add exchange field to Beff.
// 	m: normalized magnetization
// 	B: effective field in Tesla
// 	Aex_red: Aex / (Msat * 1e18 m2)
// The per-axis weight handed to the kernel is 2/cellsize², computed in
// float64 and then narrowed to float32.
// see exchange.cu
func AddExchange(B, m *data.Slice, Aex_red SymmLUT, Msat MSlice, regions *Bytes, mesh *data.Mesh) {
	c := mesh.CellSize()
	wx := float32(2 / (c[X] * c[X]))
	wy := float32(2 / (c[Y] * c[Y]))
	wz := float32(2 / (c[Z] * c[Z]))
	N := mesh.Size()
	pbc := mesh.PBC_code()
	cfg := make3DConf(N)
	k_addexchange_async(B.DevPtr(X), B.DevPtr(Y), B.DevPtr(Z),
		m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z),
		Msat.DevPtr(0), Msat.Mul(0),
		unsafe.Pointer(Aex_red), regions.Ptr,
		wx, wy, wz, N[X], N[Y], N[Z], pbc, cfg)
}

// Finds the average exchange strength around each cell, for debugging.
// The result is written to component 0 of dst; the weights and launch
// configuration are derived from mesh exactly as in AddExchange.
func ExchangeDecode(dst *data.Slice, Aex_red SymmLUT, regions *Bytes, mesh *data.Mesh) {
	c := mesh.CellSize()
	wx := float32(2 / (c[X] * c[X]))
	wy := float32(2 / (c[Y] * c[Y]))
	wz := float32(2 / (c[Z] * c[Z]))
	N := mesh.Size()
	pbc := mesh.PBC_code()
	cfg := make3DConf(N)
	k_exchangedecode_async(dst.DevPtr(0), unsafe.Pointer(Aex_red), regions.Ptr, wx, wy, wz, N[X], N[Y], N[Z], pbc, cfg)
}

mumax3-3.10/cuda/exchange.h000066400000000000000000000002601371432437400155240ustar00rootroot00000000000000
#ifndef _EXCHANGE_H_
#define _EXCHANGE_H_

// indexing in symmetric matrix:
// the pair (i, j) and the pair (j, i) map to the same slot,
// max*(max+1)/2 + min, i.e. a lower triangle stored row by row.
// Every use of an argument is parenthesized so that argument expressions
// containing low-precedence operators still compare and multiply as a whole.
// Both arguments are evaluated more than once: do not pass expressions
// with side effects.
#define symidx(i, j) ( ((j)<=(i))? ( (((i)*((i)+1)) /2 )+(j) ) : ( (((j)*((j)+1)) /2 )+(i) ) )

#endif

mumax3-3.10/cuda/exchange_wrapper.go000066400000000000000000005027401371432437400174540ustar00rootroot00000000000000
package cuda

/*
 THIS FILE IS AUTO-GENERATED BY CUDA2GO.
 EDITING IS FUTILE.
*/

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// addexchange_code is the CUDA handle of the addexchange kernel.
// It is 0 until k_addexchange_async loads the kernel on first use.
var addexchange_code cu.Function

// addexchange_args_t holds one field per addexchange kernel parameter, in
// parameter order, plus argptr, the table of pointers to those fields that is
// handed to the kernel launch. The embedded mutex serializes use of the single
// shared instance.
//
// NOTE(review): Nx, Ny and Nz are Go ints while the PTX declares the matching
// parameters as .u32; presumably only the low 4 bytes are read at launch,
// which is correct on little-endian hosts only -- confirm.
type addexchange_args_t struct {
	arg_Bx      unsafe.Pointer
	arg_By      unsafe.Pointer
	arg_Bz      unsafe.Pointer
	arg_mx      unsafe.Pointer
	arg_my      unsafe.Pointer
	arg_mz      unsafe.Pointer
	arg_Ms_     unsafe.Pointer
	arg_Ms_mul  float32
	arg_aLUT2d  unsafe.Pointer
	arg_regions unsafe.Pointer
	arg_wx      float32
	arg_wy      float32
	arg_wz      float32
	arg_Nx      int
	arg_Ny      int
	arg_Nz      int
	arg_PBC     byte
	argptr      [17]unsafe.Pointer
	sync.Mutex
}

// addexchange_args is the single, shared argument record for addexchange launches.
var addexchange_args addexchange_args_t

// init fills the pointer table once: the launch call wants pointers to the
// arguments, and the fields they point at never move.
func init() {
	a := &addexchange_args
	a.argptr = [17]unsafe.Pointer{
		unsafe.Pointer(&a.arg_Bx),
		unsafe.Pointer(&a.arg_By),
		unsafe.Pointer(&a.arg_Bz),
		unsafe.Pointer(&a.arg_mx),
		unsafe.Pointer(&a.arg_my),
		unsafe.Pointer(&a.arg_mz),
		unsafe.Pointer(&a.arg_Ms_),
		unsafe.Pointer(&a.arg_Ms_mul),
		unsafe.Pointer(&a.arg_aLUT2d),
		unsafe.Pointer(&a.arg_regions),
		unsafe.Pointer(&a.arg_wx),
		unsafe.Pointer(&a.arg_wy),
		unsafe.Pointer(&a.arg_wz),
		unsafe.Pointer(&a.arg_Nx),
		unsafe.Pointer(&a.arg_Ny),
		unsafe.Pointer(&a.arg_Nz),
		unsafe.Pointer(&a.arg_PBC),
	}
}

// k_addexchange_async launches the addexchange CUDA kernel on stream0 with
// the grid and block dimensions taken from cfg.
//
// The kernel is loaded on first use. The arguments are copied into the shared
// addexchange_args record under its lock, which is held until the function
// returns. When Synchronous is set (debugging), the call is bracketed by
// Sync() and timed under the name "addexchange".
func k_addexchange_async(Bx unsafe.Pointer, By unsafe.Pointer, Bz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, aLUT2d unsafe.Pointer, regions unsafe.Pointer, wx float32, wy float32, wz float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("addexchange")
	}

	a := &addexchange_args
	a.Lock()
	defer a.Unlock()

	// lazy load of the compiled kernel
	if addexchange_code == 0 {
		addexchange_code = fatbinLoad(addexchange_map, "addexchange")
	}

	a.arg_Bx, a.arg_By, a.arg_Bz = Bx, By, Bz
	a.arg_mx, a.arg_my, a.arg_mz = mx, my, mz
	a.arg_Ms_, a.arg_Ms_mul = Ms_, Ms_mul
	a.arg_aLUT2d = aLUT2d
	a.arg_regions = regions
	a.arg_wx, a.arg_wy, a.arg_wz = wx, wy, wz
	a.arg_Nx, a.arg_Ny, a.arg_Nz = Nx, Ny, Nz
	a.arg_PBC = PBC

	cu.LaunchKernel(addexchange_code,
		cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z,
		cfg.Block.X, cfg.Block.Y, cfg.Block.Z,
		0, stream0, a.argptr[:])

	if Synchronous { // debug
		Sync()
		timer.Stop("addexchange")
	}
}

// addexchange_map associates each supported compute capability with the PTX
// source of the addexchange kernel built for it; key 0 maps to an empty string.
var addexchange_map = map[int]string{
	0:  "",
	30: addexchange_ptx_30,
	32: addexchange_ptx_32,
	35: addexchange_ptx_35,
	37: addexchange_ptx_37,
	50: addexchange_ptx_50,
	52: addexchange_ptx_52,
	53: addexchange_ptx_53,
	60: addexchange_ptx_60,
	61: addexchange_ptx_61,
	62: addexchange_ptx_62,
	70: addexchange_ptx_70,
	72: addexchange_ptx_72,
	75: addexchange_ptx_75,
}

// addexchange PTX code for various compute capabilities.
const ( addexchange_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred %p<28>; .reg .b16 %rs<21>; .reg .f32 %f<133>; .reg .b32 %r<192>; .reg .b64 %rd<108>; ld.param.u64 %rd1, [addexchange_param_0]; ld.param.u64 %rd2, [addexchange_param_1]; ld.param.u64 %rd3, [addexchange_param_2]; ld.param.u64 %rd4, [addexchange_param_3]; ld.param.u64 %rd5, [addexchange_param_4]; ld.param.u64 %rd6, [addexchange_param_5]; ld.param.u64 %rd7, [addexchange_param_6]; ld.param.f32 %f131, [addexchange_param_7]; ld.param.u64 %rd8, [addexchange_param_8]; ld.param.u64 %rd9, [addexchange_param_9]; ld.param.f32 %f30, [addexchange_param_10]; ld.param.f32 %f31, [addexchange_param_11]; ld.param.f32 %f32, [addexchange_param_12]; ld.param.u32 %r29, [addexchange_param_13]; ld.param.u32 %r30, [addexchange_param_14]; ld.param.u32 %r31, [addexchange_param_15]; ld.param.u8 %rs4, [addexchange_param_16]; mov.u32 %r32, %ntid.x; mov.u32 %r33, %ctaid.x; mov.u32 %r34, %tid.x; mad.lo.s32 %r1, %r32, %r33, %r34; mov.u32 %r35, %ntid.y; mov.u32 %r36, %ctaid.y; mov.u32 %r37, %tid.y; mad.lo.s32 %r2, %r35, %r36, %r37; mov.u32 %r38, %ntid.z; mov.u32 %r39, %ctaid.z; mov.u32 %r40, %tid.z; mad.lo.s32 %r3, %r38, %r39, %r40; setp.ge.s32 %p1, %r2, %r30; setp.ge.s32 %p2, %r1, %r29; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r31; or.pred %p5, %p3, %p4; @%p5 bra BB0_27; cvta.to.global.u64 %rd10, 
%rd6; cvta.to.global.u64 %rd11, %rd5; cvta.to.global.u64 %rd12, %rd4; mad.lo.s32 %r41, %r3, %r30, %r2; mul.lo.s32 %r4, %r41, %r29; add.s32 %r5, %r4, %r1; mul.wide.s32 %rd13, %r5, 4; add.s64 %rd14, %rd12, %rd13; add.s64 %rd15, %rd11, %rd13; add.s64 %rd16, %rd10, %rd13; ld.global.f32 %f1, [%rd14]; ld.global.f32 %f2, [%rd15]; mul.f32 %f33, %f2, %f2; fma.rn.f32 %f34, %f1, %f1, %f33; ld.global.f32 %f3, [%rd16]; fma.rn.f32 %f35, %f3, %f3, %f34; setp.eq.f32 %p6, %f35, 0f00000000; @%p6 bra BB0_27; cvta.to.global.u64 %rd17, %rd9; cvt.s64.s32 %rd18, %r5; add.s64 %rd19, %rd17, %rd18; ld.global.u8 %rs1, [%rd19]; and.b16 %rs5, %rs4, 1; setp.eq.b16 %p7, %rs5, 1; @!%p7 bra BB0_4; bra.uni BB0_3; BB0_3: add.s32 %r46, %r1, -1; rem.s32 %r47, %r46, %r29; add.s32 %r48, %r47, %r29; rem.s32 %r186, %r48, %r29; bra.uni BB0_5; BB0_4: add.s32 %r53, %r1, -1; mov.u32 %r54, 0; max.s32 %r186, %r53, %r54; BB0_5: setp.eq.b16 %p8, %rs5, 1; add.s32 %r55, %r186, %r4; cvt.s64.s32 %rd21, %r55; mul.wide.s32 %rd22, %r55, 4; add.s64 %rd23, %rd12, %rd22; add.s64 %rd25, %rd11, %rd22; add.s64 %rd27, %rd10, %rd22; ld.global.f32 %f36, [%rd23]; ld.global.f32 %f37, [%rd25]; mul.f32 %f38, %f37, %f37; fma.rn.f32 %f39, %f36, %f36, %f38; ld.global.f32 %f40, [%rd27]; fma.rn.f32 %f41, %f40, %f40, %f39; setp.eq.f32 %p9, %f41, 0f00000000; selp.f32 %f42, %f1, %f36, %p9; selp.f32 %f43, %f2, %f37, %p9; selp.f32 %f44, %f3, %f40, %p9; add.s64 %rd29, %rd17, %rd21; ld.global.u8 %rs7, [%rd29]; setp.gt.u16 %p10, %rs7, %rs1; cvt.u32.u16 %r56, %rs7; cvt.u32.u16 %r57, %rs1; and.b32 %r58, %r57, 255; selp.b32 %r59, %r58, %r56, %p10; selp.b32 %r60, %r56, %r58, %p10; add.s32 %r61, %r60, 1; mul.lo.s32 %r62, %r61, %r60; shr.u32 %r63, %r62, 1; add.s32 %r64, %r63, %r59; cvta.to.global.u64 %rd30, %rd8; mul.wide.s32 %rd31, %r64, 4; add.s64 %rd32, %rd30, %rd31; ld.global.f32 %f45, [%rd32]; mul.f32 %f46, %f45, %f30; sub.f32 %f47, %f42, %f1; sub.f32 %f48, %f43, %f2; sub.f32 %f49, %f44, %f3; fma.rn.f32 %f4, %f46, %f47, 0f00000000; fma.rn.f32 
%f5, %f46, %f48, 0f00000000; fma.rn.f32 %f6, %f46, %f49, 0f00000000; add.s32 %r9, %r1, 1; @!%p8 bra BB0_7; bra.uni BB0_6; BB0_6: rem.s32 %r69, %r9, %r29; add.s32 %r70, %r69, %r29; rem.s32 %r187, %r70, %r29; bra.uni BB0_8; BB0_7: add.s32 %r71, %r29, -1; min.s32 %r187, %r9, %r71; BB0_8: add.s32 %r72, %r187, %r4; cvt.s64.s32 %rd34, %r72; mul.wide.s32 %rd35, %r72, 4; add.s64 %rd36, %rd12, %rd35; add.s64 %rd38, %rd11, %rd35; add.s64 %rd40, %rd10, %rd35; ld.global.f32 %f50, [%rd36]; ld.global.f32 %f51, [%rd38]; mul.f32 %f52, %f51, %f51; fma.rn.f32 %f53, %f50, %f50, %f52; ld.global.f32 %f54, [%rd40]; fma.rn.f32 %f55, %f54, %f54, %f53; setp.eq.f32 %p11, %f55, 0f00000000; selp.f32 %f56, %f1, %f50, %p11; selp.f32 %f57, %f2, %f51, %p11; selp.f32 %f58, %f3, %f54, %p11; add.s64 %rd42, %rd17, %rd34; ld.global.u8 %rs9, [%rd42]; setp.gt.u16 %p12, %rs9, %rs1; cvt.u32.u16 %r73, %rs9; selp.b32 %r76, %r58, %r73, %p12; selp.b32 %r77, %r73, %r58, %p12; add.s32 %r78, %r77, 1; mul.lo.s32 %r79, %r78, %r77; shr.u32 %r80, %r79, 1; add.s32 %r81, %r80, %r76; mul.wide.s32 %rd44, %r81, 4; add.s64 %rd45, %rd30, %rd44; ld.global.f32 %f59, [%rd45]; mul.f32 %f60, %f59, %f30; sub.f32 %f61, %f56, %f1; sub.f32 %f62, %f57, %f2; sub.f32 %f63, %f58, %f3; fma.rn.f32 %f7, %f60, %f61, %f4; fma.rn.f32 %f8, %f60, %f62, %f5; fma.rn.f32 %f9, %f60, %f63, %f6; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p13, %rs2, 0; add.s32 %r13, %r2, -1; @%p13 bra BB0_10; rem.s32 %r86, %r13, %r30; add.s32 %r87, %r86, %r30; rem.s32 %r188, %r87, %r30; bra.uni BB0_11; BB0_10: mov.u32 %r88, 0; max.s32 %r188, %r13, %r88; BB0_11: mad.lo.s32 %r93, %r3, %r30, %r188; mad.lo.s32 %r98, %r93, %r29, %r1; cvt.s64.s32 %rd47, %r98; mul.wide.s32 %rd48, %r98, 4; add.s64 %rd49, %rd12, %rd48; add.s64 %rd51, %rd11, %rd48; add.s64 %rd53, %rd10, %rd48; ld.global.f32 %f64, [%rd49]; ld.global.f32 %f65, [%rd51]; mul.f32 %f66, %f65, %f65; fma.rn.f32 %f67, %f64, %f64, %f66; ld.global.f32 %f68, [%rd53]; fma.rn.f32 %f69, %f68, %f68, %f67; setp.eq.f32 %p14, %f69, 
0f00000000; selp.f32 %f70, %f1, %f64, %p14; selp.f32 %f71, %f2, %f65, %p14; selp.f32 %f72, %f3, %f68, %p14; add.s64 %rd55, %rd17, %rd47; ld.global.u8 %rs11, [%rd55]; setp.gt.u16 %p15, %rs11, %rs1; cvt.u32.u16 %r99, %rs11; selp.b32 %r102, %r58, %r99, %p15; selp.b32 %r103, %r99, %r58, %p15; add.s32 %r104, %r103, 1; mul.lo.s32 %r105, %r104, %r103; shr.u32 %r106, %r105, 1; add.s32 %r107, %r106, %r102; mul.wide.s32 %rd57, %r107, 4; add.s64 %rd58, %rd30, %rd57; ld.global.f32 %f73, [%rd58]; mul.f32 %f74, %f73, %f31; sub.f32 %f75, %f70, %f1; sub.f32 %f76, %f71, %f2; sub.f32 %f77, %f72, %f3; fma.rn.f32 %f10, %f74, %f75, %f7; fma.rn.f32 %f11, %f74, %f76, %f8; fma.rn.f32 %f12, %f74, %f77, %f9; add.s32 %r17, %r2, 1; @%p13 bra BB0_13; rem.s32 %r112, %r17, %r30; add.s32 %r113, %r112, %r30; rem.s32 %r189, %r113, %r30; bra.uni BB0_14; BB0_13: add.s32 %r114, %r30, -1; min.s32 %r189, %r17, %r114; BB0_14: mad.lo.s32 %r119, %r3, %r30, %r189; mad.lo.s32 %r124, %r119, %r29, %r1; cvt.s64.s32 %rd60, %r124; mul.wide.s32 %rd61, %r124, 4; add.s64 %rd62, %rd12, %rd61; add.s64 %rd64, %rd11, %rd61; add.s64 %rd66, %rd10, %rd61; ld.global.f32 %f78, [%rd62]; ld.global.f32 %f79, [%rd64]; mul.f32 %f80, %f79, %f79; fma.rn.f32 %f81, %f78, %f78, %f80; ld.global.f32 %f82, [%rd66]; fma.rn.f32 %f83, %f82, %f82, %f81; setp.eq.f32 %p17, %f83, 0f00000000; selp.f32 %f84, %f1, %f78, %p17; selp.f32 %f85, %f2, %f79, %p17; selp.f32 %f86, %f3, %f82, %p17; add.s64 %rd68, %rd17, %rd60; ld.global.u8 %rs14, [%rd68]; setp.gt.u16 %p18, %rs14, %rs1; cvt.u32.u16 %r125, %rs14; selp.b32 %r128, %r58, %r125, %p18; selp.b32 %r129, %r125, %r58, %p18; add.s32 %r130, %r129, 1; mul.lo.s32 %r131, %r130, %r129; shr.u32 %r132, %r131, 1; add.s32 %r133, %r132, %r128; mul.wide.s32 %rd70, %r133, 4; add.s64 %rd71, %rd30, %rd70; ld.global.f32 %f87, [%rd71]; mul.f32 %f88, %f87, %f31; sub.f32 %f89, %f84, %f1; sub.f32 %f90, %f85, %f2; sub.f32 %f91, %f86, %f3; fma.rn.f32 %f128, %f88, %f89, %f10; fma.rn.f32 %f129, %f88, %f90, %f11; fma.rn.f32 
%f130, %f88, %f91, %f12; setp.eq.s32 %p19, %r31, 1; @%p19 bra BB0_22; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p20, %rs3, 0; add.s32 %r21, %r3, -1; @%p20 bra BB0_17; rem.s32 %r138, %r21, %r31; add.s32 %r139, %r138, %r31; rem.s32 %r190, %r139, %r31; bra.uni BB0_18; BB0_17: mov.u32 %r140, 0; max.s32 %r190, %r21, %r140; BB0_18: mad.lo.s32 %r145, %r190, %r30, %r2; mad.lo.s32 %r150, %r145, %r29, %r1; cvt.s64.s32 %rd73, %r150; mul.wide.s32 %rd74, %r150, 4; add.s64 %rd75, %rd12, %rd74; add.s64 %rd77, %rd11, %rd74; add.s64 %rd79, %rd10, %rd74; ld.global.f32 %f92, [%rd75]; ld.global.f32 %f93, [%rd77]; mul.f32 %f94, %f93, %f93; fma.rn.f32 %f95, %f92, %f92, %f94; ld.global.f32 %f96, [%rd79]; fma.rn.f32 %f97, %f96, %f96, %f95; setp.eq.f32 %p21, %f97, 0f00000000; selp.f32 %f98, %f1, %f92, %p21; selp.f32 %f99, %f2, %f93, %p21; selp.f32 %f100, %f3, %f96, %p21; add.s64 %rd81, %rd17, %rd73; ld.global.u8 %rs16, [%rd81]; setp.gt.u16 %p22, %rs16, %rs1; cvt.u32.u16 %r151, %rs16; selp.b32 %r154, %r58, %r151, %p22; selp.b32 %r155, %r151, %r58, %p22; add.s32 %r156, %r155, 1; mul.lo.s32 %r157, %r156, %r155; shr.u32 %r158, %r157, 1; add.s32 %r159, %r158, %r154; mul.wide.s32 %rd83, %r159, 4; add.s64 %rd84, %rd30, %rd83; ld.global.f32 %f101, [%rd84]; mul.f32 %f102, %f101, %f32; sub.f32 %f103, %f98, %f1; sub.f32 %f104, %f99, %f2; sub.f32 %f105, %f100, %f3; fma.rn.f32 %f16, %f102, %f103, %f128; fma.rn.f32 %f17, %f102, %f104, %f129; fma.rn.f32 %f18, %f102, %f105, %f130; add.s32 %r25, %r3, 1; @%p20 bra BB0_20; rem.s32 %r164, %r25, %r31; add.s32 %r165, %r164, %r31; rem.s32 %r191, %r165, %r31; bra.uni BB0_21; BB0_20: add.s32 %r166, %r31, -1; min.s32 %r191, %r25, %r166; BB0_21: mad.lo.s32 %r171, %r191, %r30, %r2; mad.lo.s32 %r176, %r171, %r29, %r1; cvt.s64.s32 %rd86, %r176; mul.wide.s32 %rd87, %r176, 4; add.s64 %rd88, %rd12, %rd87; add.s64 %rd90, %rd11, %rd87; add.s64 %rd92, %rd10, %rd87; ld.global.f32 %f106, [%rd88]; ld.global.f32 %f107, [%rd90]; mul.f32 %f108, %f107, %f107; fma.rn.f32 %f109, %f106, 
%f106, %f108; ld.global.f32 %f110, [%rd92]; fma.rn.f32 %f111, %f110, %f110, %f109; setp.eq.f32 %p24, %f111, 0f00000000; selp.f32 %f112, %f1, %f106, %p24; selp.f32 %f113, %f2, %f107, %p24; selp.f32 %f114, %f3, %f110, %p24; add.s64 %rd94, %rd17, %rd86; ld.global.u8 %rs19, [%rd94]; setp.gt.u16 %p25, %rs19, %rs1; cvt.u32.u16 %r177, %rs19; selp.b32 %r180, %r58, %r177, %p25; selp.b32 %r181, %r177, %r58, %p25; add.s32 %r182, %r181, 1; mul.lo.s32 %r183, %r182, %r181; shr.u32 %r184, %r183, 1; add.s32 %r185, %r184, %r180; mul.wide.s32 %rd96, %r185, 4; add.s64 %rd97, %rd30, %rd96; ld.global.f32 %f115, [%rd97]; mul.f32 %f116, %f115, %f32; sub.f32 %f117, %f112, %f1; sub.f32 %f118, %f113, %f2; sub.f32 %f119, %f114, %f3; fma.rn.f32 %f128, %f116, %f117, %f16; fma.rn.f32 %f129, %f116, %f118, %f17; fma.rn.f32 %f130, %f116, %f119, %f18; BB0_22: setp.eq.s64 %p26, %rd7, 0; @%p26 bra BB0_24; cvta.to.global.u64 %rd98, %rd7; add.s64 %rd100, %rd98, %rd13; ld.global.f32 %f120, [%rd100]; mul.f32 %f131, %f120, %f131; BB0_24: setp.eq.f32 %p27, %f131, 0f00000000; mov.f32 %f132, 0f00000000; @%p27 bra BB0_26; rcp.rn.f32 %f132, %f131; BB0_26: cvta.to.global.u64 %rd101, %rd1; add.s64 %rd103, %rd101, %rd13; ld.global.f32 %f122, [%rd103]; fma.rn.f32 %f123, %f128, %f132, %f122; st.global.f32 [%rd103], %f123; cvta.to.global.u64 %rd104, %rd2; add.s64 %rd105, %rd104, %rd13; ld.global.f32 %f124, [%rd105]; fma.rn.f32 %f125, %f129, %f132, %f124; st.global.f32 [%rd105], %f125; cvta.to.global.u64 %rd106, %rd3; add.s64 %rd107, %rd106, %rd13; ld.global.f32 %f126, [%rd107]; fma.rn.f32 %f127, %f130, %f132, %f126; st.global.f32 [%rd107], %f127; BB0_27: ret; } ` addexchange_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param 
.f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred %p<28>; .reg .b16 %rs<26>; .reg .f32 %f<133>; .reg .b32 %r<128>; .reg .b64 %rd<79>; ld.param.u64 %rd6, [addexchange_param_0]; ld.param.u64 %rd7, [addexchange_param_1]; ld.param.u64 %rd8, [addexchange_param_2]; ld.param.u64 %rd10, [addexchange_param_3]; ld.param.u64 %rd11, [addexchange_param_4]; ld.param.u64 %rd12, [addexchange_param_5]; ld.param.u64 %rd9, [addexchange_param_6]; ld.param.f32 %f131, [addexchange_param_7]; ld.param.u64 %rd13, [addexchange_param_8]; ld.param.u64 %rd14, [addexchange_param_9]; ld.param.f32 %f30, [addexchange_param_10]; ld.param.f32 %f31, [addexchange_param_11]; ld.param.f32 %f32, [addexchange_param_12]; ld.param.u32 %r32, [addexchange_param_13]; ld.param.u32 %r33, [addexchange_param_14]; ld.param.u32 %r34, [addexchange_param_15]; ld.param.u8 %rs5, [addexchange_param_16]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd14; cvta.to.global.u64 %rd3, %rd12; cvta.to.global.u64 %rd4, %rd11; cvta.to.global.u64 %rd5, %rd10; mov.u32 %r35, %ntid.x; mov.u32 %r36, %ctaid.x; mov.u32 %r37, %tid.x; mad.lo.s32 %r1, %r35, %r36, %r37; mov.u32 %r38, %ntid.y; mov.u32 %r39, %ctaid.y; mov.u32 %r40, %tid.y; mad.lo.s32 %r2, %r38, %r39, %r40; mov.u32 %r41, %ntid.z; mov.u32 %r42, %ctaid.z; mov.u32 %r43, %tid.z; mad.lo.s32 %r3, %r41, %r42, %r43; setp.ge.s32 %p1, %r2, %r33; setp.ge.s32 %p2, %r1, %r32; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r34; or.pred %p5, %p3, %p4; @%p5 bra BB0_27; mul.lo.s32 %r4, %r3, %r33; add.s32 %r44, %r4, %r2; mul.lo.s32 %r5, %r44, %r32; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd15, %r6, 4; add.s64 %rd16, %rd5, %rd15; add.s64 %rd17, %rd4, %rd15; add.s64 %rd18, %rd3, %rd15; ld.global.nc.f32 
%f1, [%rd16]; ld.global.nc.f32 %f2, [%rd17]; mul.f32 %f33, %f2, %f2; fma.rn.f32 %f34, %f1, %f1, %f33; ld.global.nc.f32 %f3, [%rd18]; fma.rn.f32 %f35, %f3, %f3, %f34; setp.eq.f32 %p6, %f35, 0f00000000; @%p6 bra BB0_27; cvt.s64.s32 %rd19, %r6; add.s64 %rd20, %rd2, %rd19; ld.global.nc.u8 %rs1, [%rd20]; cvt.u32.u16 %r45, %rs1; and.b32 %r7, %r45, 255; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r46, %r8, %r32; add.s32 %r47, %r46, %r32; rem.s32 %r122, %r47, %r32; bra.uni BB0_5; BB0_4: mov.u32 %r48, 0; max.s32 %r122, %r8, %r48; BB0_5: add.s32 %r49, %r122, %r5; cvt.s64.s32 %rd21, %r49; mul.wide.s32 %rd22, %r49, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f36, [%rd23]; ld.global.nc.f32 %f37, [%rd24]; mul.f32 %f38, %f37, %f37; fma.rn.f32 %f39, %f36, %f36, %f38; ld.global.nc.f32 %f40, [%rd25]; fma.rn.f32 %f41, %f40, %f40, %f39; setp.eq.f32 %p8, %f41, 0f00000000; selp.f32 %f42, %f1, %f36, %p8; selp.f32 %f43, %f2, %f37, %p8; selp.f32 %f44, %f3, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, [%rd26]; setp.gt.u16 %p9, %rs6, %rs1; cvt.u32.u16 %r50, %rs6; and.b32 %r51, %r50, 255; selp.b32 %r52, %r7, %r51, %p9; selp.b32 %r53, %r51, %r7, %p9; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd27, %r57, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 %f45, [%rd28]; mul.f32 %f46, %f45, %f30; sub.f32 %f47, %f42, %f1; sub.f32 %f48, %f43, %f2; sub.f32 %f49, %f44, %f3; fma.rn.f32 %f4, %f46, %f47, 0f00000000; fma.rn.f32 %f5, %f46, %f48, 0f00000000; fma.rn.f32 %f6, %f46, %f49, 0f00000000; add.s32 %r12, %r1, 1; @%p7 bra BB0_7; rem.s32 %r58, %r12, %r32; add.s32 %r59, %r58, %r32; rem.s32 %r123, %r59, %r32; bra.uni BB0_8; BB0_7: add.s32 %r60, %r32, -1; min.s32 %r123, %r12, %r60; BB0_8: add.s32 %r61, %r123, %r5; cvt.s64.s32 %rd29, %r61; mul.wide.s32 %rd30, %r61, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, 
%rd4, %rd30; add.s64 %rd33, %rd3, %rd30; ld.global.nc.f32 %f50, [%rd31]; ld.global.nc.f32 %f51, [%rd32]; mul.f32 %f52, %f51, %f51; fma.rn.f32 %f53, %f50, %f50, %f52; ld.global.nc.f32 %f54, [%rd33]; fma.rn.f32 %f55, %f54, %f54, %f53; setp.eq.f32 %p11, %f55, 0f00000000; selp.f32 %f56, %f1, %f50, %p11; selp.f32 %f57, %f2, %f51, %p11; selp.f32 %f58, %f3, %f54, %p11; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs9, [%rd34]; setp.gt.u16 %p12, %rs9, %rs1; cvt.u32.u16 %r62, %rs9; and.b32 %r63, %r62, 255; selp.b32 %r64, %r7, %r63, %p12; selp.b32 %r65, %r63, %r7, %p12; add.s32 %r66, %r65, 1; mul.lo.s32 %r67, %r66, %r65; shr.u32 %r68, %r67, 1; add.s32 %r69, %r68, %r64; mul.wide.s32 %rd35, %r69, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f59, [%rd36]; mul.f32 %f60, %f59, %f30; sub.f32 %f61, %f56, %f1; sub.f32 %f62, %f57, %f2; sub.f32 %f63, %f58, %f3; fma.rn.f32 %f7, %f60, %f61, %f4; fma.rn.f32 %f8, %f60, %f62, %f5; fma.rn.f32 %f9, %f60, %f63, %f6; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p13, %rs3, 0; add.s32 %r16, %r2, -1; @%p13 bra BB0_10; rem.s32 %r70, %r16, %r33; add.s32 %r71, %r70, %r33; rem.s32 %r124, %r71, %r33; bra.uni BB0_11; BB0_10: mov.u32 %r72, 0; max.s32 %r124, %r16, %r72; BB0_11: add.s32 %r73, %r124, %r4; mad.lo.s32 %r74, %r73, %r32, %r1; cvt.s64.s32 %rd37, %r74; mul.wide.s32 %rd38, %r74, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f64, [%rd39]; ld.global.nc.f32 %f65, [%rd40]; mul.f32 %f66, %f65, %f65; fma.rn.f32 %f67, %f64, %f64, %f66; ld.global.nc.f32 %f68, [%rd41]; fma.rn.f32 %f69, %f68, %f68, %f67; setp.eq.f32 %p14, %f69, 0f00000000; selp.f32 %f70, %f1, %f64, %p14; selp.f32 %f71, %f2, %f65, %p14; selp.f32 %f72, %f3, %f68, %p14; add.s64 %rd42, %rd2, %rd37; ld.global.nc.u8 %rs12, [%rd42]; setp.gt.u16 %p15, %rs12, %rs1; cvt.u32.u16 %r75, %rs12; and.b32 %r76, %r75, 255; selp.b32 %r77, %r7, %r76, %p15; selp.b32 %r78, %r76, %r7, %p15; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, 
%r80, 1; add.s32 %r82, %r81, %r77; mul.wide.s32 %rd43, %r82, 4; add.s64 %rd44, %rd1, %rd43; ld.global.nc.f32 %f73, [%rd44]; mul.f32 %f74, %f73, %f31; sub.f32 %f75, %f70, %f1; sub.f32 %f76, %f71, %f2; sub.f32 %f77, %f72, %f3; fma.rn.f32 %f10, %f74, %f75, %f7; fma.rn.f32 %f11, %f74, %f76, %f8; fma.rn.f32 %f12, %f74, %f77, %f9; add.s32 %r20, %r2, 1; @%p13 bra BB0_13; rem.s32 %r83, %r20, %r33; add.s32 %r84, %r83, %r33; rem.s32 %r125, %r84, %r33; bra.uni BB0_14; BB0_13: add.s32 %r85, %r33, -1; min.s32 %r125, %r20, %r85; BB0_14: add.s32 %r86, %r125, %r4; mad.lo.s32 %r87, %r86, %r32, %r1; cvt.s64.s32 %rd45, %r87; mul.wide.s32 %rd46, %r87, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f78, [%rd47]; ld.global.nc.f32 %f79, [%rd48]; mul.f32 %f80, %f79, %f79; fma.rn.f32 %f81, %f78, %f78, %f80; ld.global.nc.f32 %f82, [%rd49]; fma.rn.f32 %f83, %f82, %f82, %f81; setp.eq.f32 %p17, %f83, 0f00000000; selp.f32 %f84, %f1, %f78, %p17; selp.f32 %f85, %f2, %f79, %p17; selp.f32 %f86, %f3, %f82, %p17; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs16, [%rd50]; setp.gt.u16 %p18, %rs16, %rs1; cvt.u32.u16 %r88, %rs16; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p18; selp.b32 %r91, %r89, %r7, %p18; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd51, %r95, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f87, [%rd52]; mul.f32 %f88, %f87, %f31; sub.f32 %f89, %f84, %f1; sub.f32 %f90, %f85, %f2; sub.f32 %f91, %f86, %f3; fma.rn.f32 %f128, %f88, %f89, %f10; fma.rn.f32 %f129, %f88, %f90, %f11; fma.rn.f32 %f130, %f88, %f91, %f12; setp.eq.s32 %p19, %r34, 1; @%p19 bra BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p20, %rs4, 0; add.s32 %r24, %r3, -1; @%p20 bra BB0_17; rem.s32 %r96, %r24, %r34; add.s32 %r97, %r96, %r34; rem.s32 %r126, %r97, %r34; bra.uni BB0_18; BB0_17: mov.u32 %r98, 0; max.s32 %r126, %r24, %r98; BB0_18: mad.lo.s32 %r99, %r126, %r33, %r2; mad.lo.s32 %r100, 
%r99, %r32, %r1; cvt.s64.s32 %rd53, %r100; mul.wide.s32 %rd54, %r100, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f92, [%rd55]; ld.global.nc.f32 %f93, [%rd56]; mul.f32 %f94, %f93, %f93; fma.rn.f32 %f95, %f92, %f92, %f94; ld.global.nc.f32 %f96, [%rd57]; fma.rn.f32 %f97, %f96, %f96, %f95; setp.eq.f32 %p21, %f97, 0f00000000; selp.f32 %f98, %f1, %f92, %p21; selp.f32 %f99, %f2, %f93, %p21; selp.f32 %f100, %f3, %f96, %p21; add.s64 %rd58, %rd2, %rd53; ld.global.nc.u8 %rs19, [%rd58]; setp.gt.u16 %p22, %rs19, %rs1; cvt.u32.u16 %r101, %rs19; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p22; selp.b32 %r104, %r102, %r7, %p22; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd59, %r108, 4; add.s64 %rd60, %rd1, %rd59; ld.global.nc.f32 %f101, [%rd60]; mul.f32 %f102, %f101, %f32; sub.f32 %f103, %f98, %f1; sub.f32 %f104, %f99, %f2; sub.f32 %f105, %f100, %f3; fma.rn.f32 %f16, %f102, %f103, %f128; fma.rn.f32 %f17, %f102, %f104, %f129; fma.rn.f32 %f18, %f102, %f105, %f130; add.s32 %r28, %r3, 1; @%p20 bra BB0_20; rem.s32 %r109, %r28, %r34; add.s32 %r110, %r109, %r34; rem.s32 %r127, %r110, %r34; bra.uni BB0_21; BB0_20: add.s32 %r111, %r34, -1; min.s32 %r127, %r28, %r111; BB0_21: mad.lo.s32 %r112, %r127, %r33, %r2; mad.lo.s32 %r113, %r112, %r32, %r1; cvt.s64.s32 %rd61, %r113; mul.wide.s32 %rd62, %r113, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f106, [%rd63]; ld.global.nc.f32 %f107, [%rd64]; mul.f32 %f108, %f107, %f107; fma.rn.f32 %f109, %f106, %f106, %f108; ld.global.nc.f32 %f110, [%rd65]; fma.rn.f32 %f111, %f110, %f110, %f109; setp.eq.f32 %p24, %f111, 0f00000000; selp.f32 %f112, %f1, %f106, %p24; selp.f32 %f113, %f2, %f107, %p24; selp.f32 %f114, %f3, %f110, %p24; add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs23, [%rd66]; setp.gt.u16 %p25, %rs23, %rs1; cvt.u32.u16 %r114, %rs23; 
and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p25; selp.b32 %r117, %r115, %r7, %p25; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd67, %r121, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f115, [%rd68]; mul.f32 %f116, %f115, %f32; sub.f32 %f117, %f112, %f1; sub.f32 %f118, %f113, %f2; sub.f32 %f119, %f114, %f3; fma.rn.f32 %f128, %f116, %f117, %f16; fma.rn.f32 %f129, %f116, %f118, %f17; fma.rn.f32 %f130, %f116, %f119, %f18; BB0_22: setp.eq.s64 %p26, %rd9, 0; @%p26 bra BB0_24; cvta.to.global.u64 %rd69, %rd9; add.s64 %rd71, %rd69, %rd15; ld.global.nc.f32 %f120, [%rd71]; mul.f32 %f131, %f120, %f131; BB0_24: setp.eq.f32 %p27, %f131, 0f00000000; mov.f32 %f132, 0f00000000; @%p27 bra BB0_26; rcp.rn.f32 %f132, %f131; BB0_26: cvta.to.global.u64 %rd72, %rd8; cvta.to.global.u64 %rd73, %rd7; cvta.to.global.u64 %rd74, %rd6; add.s64 %rd76, %rd74, %rd15; ld.global.f32 %f122, [%rd76]; fma.rn.f32 %f123, %f128, %f132, %f122; st.global.f32 [%rd76], %f123; add.s64 %rd77, %rd73, %rd15; ld.global.f32 %f124, [%rd77]; fma.rn.f32 %f125, %f129, %f132, %f124; st.global.f32 [%rd77], %f125; add.s64 %rd78, %rd72, %rd15; ld.global.f32 %f126, [%rd78]; fma.rn.f32 %f127, %f130, %f132, %f126; st.global.f32 [%rd78], %f127; BB0_27: ret; } ` addexchange_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 
) { .reg .pred %p<28>; .reg .b16 %rs<26>; .reg .f32 %f<133>; .reg .b32 %r<128>; .reg .b64 %rd<79>; ld.param.u64 %rd6, [addexchange_param_0]; ld.param.u64 %rd7, [addexchange_param_1]; ld.param.u64 %rd8, [addexchange_param_2]; ld.param.u64 %rd10, [addexchange_param_3]; ld.param.u64 %rd11, [addexchange_param_4]; ld.param.u64 %rd12, [addexchange_param_5]; ld.param.u64 %rd9, [addexchange_param_6]; ld.param.f32 %f131, [addexchange_param_7]; ld.param.u64 %rd13, [addexchange_param_8]; ld.param.u64 %rd14, [addexchange_param_9]; ld.param.f32 %f30, [addexchange_param_10]; ld.param.f32 %f31, [addexchange_param_11]; ld.param.f32 %f32, [addexchange_param_12]; ld.param.u32 %r32, [addexchange_param_13]; ld.param.u32 %r33, [addexchange_param_14]; ld.param.u32 %r34, [addexchange_param_15]; ld.param.u8 %rs5, [addexchange_param_16]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd14; cvta.to.global.u64 %rd3, %rd12; cvta.to.global.u64 %rd4, %rd11; cvta.to.global.u64 %rd5, %rd10; mov.u32 %r35, %ntid.x; mov.u32 %r36, %ctaid.x; mov.u32 %r37, %tid.x; mad.lo.s32 %r1, %r35, %r36, %r37; mov.u32 %r38, %ntid.y; mov.u32 %r39, %ctaid.y; mov.u32 %r40, %tid.y; mad.lo.s32 %r2, %r38, %r39, %r40; mov.u32 %r41, %ntid.z; mov.u32 %r42, %ctaid.z; mov.u32 %r43, %tid.z; mad.lo.s32 %r3, %r41, %r42, %r43; setp.ge.s32 %p1, %r2, %r33; setp.ge.s32 %p2, %r1, %r32; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r34; or.pred %p5, %p3, %p4; @%p5 bra BB0_27; mul.lo.s32 %r4, %r3, %r33; add.s32 %r44, %r4, %r2; mul.lo.s32 %r5, %r44, %r32; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd15, %r6, 4; add.s64 %rd16, %rd5, %rd15; add.s64 %rd17, %rd4, %rd15; add.s64 %rd18, %rd3, %rd15; ld.global.nc.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd17]; mul.f32 %f33, %f2, %f2; fma.rn.f32 %f34, %f1, %f1, %f33; ld.global.nc.f32 %f3, [%rd18]; fma.rn.f32 %f35, %f3, %f3, %f34; setp.eq.f32 %p6, %f35, 0f00000000; @%p6 bra BB0_27; cvt.s64.s32 %rd19, %r6; add.s64 %rd20, %rd2, %rd19; ld.global.nc.u8 %rs1, [%rd20]; cvt.u32.u16 %r45, %rs1; 
and.b32 %r7, %r45, 255; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r46, %r8, %r32; add.s32 %r47, %r46, %r32; rem.s32 %r122, %r47, %r32; bra.uni BB0_5; BB0_4: mov.u32 %r48, 0; max.s32 %r122, %r8, %r48; BB0_5: add.s32 %r49, %r122, %r5; cvt.s64.s32 %rd21, %r49; mul.wide.s32 %rd22, %r49, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f36, [%rd23]; ld.global.nc.f32 %f37, [%rd24]; mul.f32 %f38, %f37, %f37; fma.rn.f32 %f39, %f36, %f36, %f38; ld.global.nc.f32 %f40, [%rd25]; fma.rn.f32 %f41, %f40, %f40, %f39; setp.eq.f32 %p8, %f41, 0f00000000; selp.f32 %f42, %f1, %f36, %p8; selp.f32 %f43, %f2, %f37, %p8; selp.f32 %f44, %f3, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, [%rd26]; setp.gt.u16 %p9, %rs6, %rs1; cvt.u32.u16 %r50, %rs6; and.b32 %r51, %r50, 255; selp.b32 %r52, %r7, %r51, %p9; selp.b32 %r53, %r51, %r7, %p9; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd27, %r57, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 %f45, [%rd28]; mul.f32 %f46, %f45, %f30; sub.f32 %f47, %f42, %f1; sub.f32 %f48, %f43, %f2; sub.f32 %f49, %f44, %f3; fma.rn.f32 %f4, %f46, %f47, 0f00000000; fma.rn.f32 %f5, %f46, %f48, 0f00000000; fma.rn.f32 %f6, %f46, %f49, 0f00000000; add.s32 %r12, %r1, 1; @%p7 bra BB0_7; rem.s32 %r58, %r12, %r32; add.s32 %r59, %r58, %r32; rem.s32 %r123, %r59, %r32; bra.uni BB0_8; BB0_7: add.s32 %r60, %r32, -1; min.s32 %r123, %r12, %r60; BB0_8: add.s32 %r61, %r123, %r5; cvt.s64.s32 %rd29, %r61; mul.wide.s32 %rd30, %r61, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, %rd4, %rd30; add.s64 %rd33, %rd3, %rd30; ld.global.nc.f32 %f50, [%rd31]; ld.global.nc.f32 %f51, [%rd32]; mul.f32 %f52, %f51, %f51; fma.rn.f32 %f53, %f50, %f50, %f52; ld.global.nc.f32 %f54, [%rd33]; fma.rn.f32 %f55, %f54, %f54, %f53; setp.eq.f32 %p11, %f55, 0f00000000; selp.f32 %f56, %f1, %f50, %p11; selp.f32 %f57, %f2, 
%f51, %p11; selp.f32 %f58, %f3, %f54, %p11; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs9, [%rd34]; setp.gt.u16 %p12, %rs9, %rs1; cvt.u32.u16 %r62, %rs9; and.b32 %r63, %r62, 255; selp.b32 %r64, %r7, %r63, %p12; selp.b32 %r65, %r63, %r7, %p12; add.s32 %r66, %r65, 1; mul.lo.s32 %r67, %r66, %r65; shr.u32 %r68, %r67, 1; add.s32 %r69, %r68, %r64; mul.wide.s32 %rd35, %r69, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f59, [%rd36]; mul.f32 %f60, %f59, %f30; sub.f32 %f61, %f56, %f1; sub.f32 %f62, %f57, %f2; sub.f32 %f63, %f58, %f3; fma.rn.f32 %f7, %f60, %f61, %f4; fma.rn.f32 %f8, %f60, %f62, %f5; fma.rn.f32 %f9, %f60, %f63, %f6; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p13, %rs3, 0; add.s32 %r16, %r2, -1; @%p13 bra BB0_10; rem.s32 %r70, %r16, %r33; add.s32 %r71, %r70, %r33; rem.s32 %r124, %r71, %r33; bra.uni BB0_11; BB0_10: mov.u32 %r72, 0; max.s32 %r124, %r16, %r72; BB0_11: add.s32 %r73, %r124, %r4; mad.lo.s32 %r74, %r73, %r32, %r1; cvt.s64.s32 %rd37, %r74; mul.wide.s32 %rd38, %r74, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f64, [%rd39]; ld.global.nc.f32 %f65, [%rd40]; mul.f32 %f66, %f65, %f65; fma.rn.f32 %f67, %f64, %f64, %f66; ld.global.nc.f32 %f68, [%rd41]; fma.rn.f32 %f69, %f68, %f68, %f67; setp.eq.f32 %p14, %f69, 0f00000000; selp.f32 %f70, %f1, %f64, %p14; selp.f32 %f71, %f2, %f65, %p14; selp.f32 %f72, %f3, %f68, %p14; add.s64 %rd42, %rd2, %rd37; ld.global.nc.u8 %rs12, [%rd42]; setp.gt.u16 %p15, %rs12, %rs1; cvt.u32.u16 %r75, %rs12; and.b32 %r76, %r75, 255; selp.b32 %r77, %r7, %r76, %p15; selp.b32 %r78, %r76, %r7, %p15; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; add.s32 %r82, %r81, %r77; mul.wide.s32 %rd43, %r82, 4; add.s64 %rd44, %rd1, %rd43; ld.global.nc.f32 %f73, [%rd44]; mul.f32 %f74, %f73, %f31; sub.f32 %f75, %f70, %f1; sub.f32 %f76, %f71, %f2; sub.f32 %f77, %f72, %f3; fma.rn.f32 %f10, %f74, %f75, %f7; fma.rn.f32 %f11, %f74, %f76, %f8; fma.rn.f32 %f12, %f74, %f77, 
%f9; add.s32 %r20, %r2, 1; @%p13 bra BB0_13; rem.s32 %r83, %r20, %r33; add.s32 %r84, %r83, %r33; rem.s32 %r125, %r84, %r33; bra.uni BB0_14; BB0_13: add.s32 %r85, %r33, -1; min.s32 %r125, %r20, %r85; BB0_14: add.s32 %r86, %r125, %r4; mad.lo.s32 %r87, %r86, %r32, %r1; cvt.s64.s32 %rd45, %r87; mul.wide.s32 %rd46, %r87, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f78, [%rd47]; ld.global.nc.f32 %f79, [%rd48]; mul.f32 %f80, %f79, %f79; fma.rn.f32 %f81, %f78, %f78, %f80; ld.global.nc.f32 %f82, [%rd49]; fma.rn.f32 %f83, %f82, %f82, %f81; setp.eq.f32 %p17, %f83, 0f00000000; selp.f32 %f84, %f1, %f78, %p17; selp.f32 %f85, %f2, %f79, %p17; selp.f32 %f86, %f3, %f82, %p17; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs16, [%rd50]; setp.gt.u16 %p18, %rs16, %rs1; cvt.u32.u16 %r88, %rs16; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p18; selp.b32 %r91, %r89, %r7, %p18; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd51, %r95, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f87, [%rd52]; mul.f32 %f88, %f87, %f31; sub.f32 %f89, %f84, %f1; sub.f32 %f90, %f85, %f2; sub.f32 %f91, %f86, %f3; fma.rn.f32 %f128, %f88, %f89, %f10; fma.rn.f32 %f129, %f88, %f90, %f11; fma.rn.f32 %f130, %f88, %f91, %f12; setp.eq.s32 %p19, %r34, 1; @%p19 bra BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p20, %rs4, 0; add.s32 %r24, %r3, -1; @%p20 bra BB0_17; rem.s32 %r96, %r24, %r34; add.s32 %r97, %r96, %r34; rem.s32 %r126, %r97, %r34; bra.uni BB0_18; BB0_17: mov.u32 %r98, 0; max.s32 %r126, %r24, %r98; BB0_18: mad.lo.s32 %r99, %r126, %r33, %r2; mad.lo.s32 %r100, %r99, %r32, %r1; cvt.s64.s32 %rd53, %r100; mul.wide.s32 %rd54, %r100, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f92, [%rd55]; ld.global.nc.f32 %f93, [%rd56]; mul.f32 %f94, %f93, %f93; fma.rn.f32 %f95, %f92, %f92, %f94; ld.global.nc.f32 %f96, [%rd57]; 
fma.rn.f32 %f97, %f96, %f96, %f95; setp.eq.f32 %p21, %f97, 0f00000000; selp.f32 %f98, %f1, %f92, %p21; selp.f32 %f99, %f2, %f93, %p21; selp.f32 %f100, %f3, %f96, %p21; add.s64 %rd58, %rd2, %rd53; ld.global.nc.u8 %rs19, [%rd58]; setp.gt.u16 %p22, %rs19, %rs1; cvt.u32.u16 %r101, %rs19; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p22; selp.b32 %r104, %r102, %r7, %p22; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd59, %r108, 4; add.s64 %rd60, %rd1, %rd59; ld.global.nc.f32 %f101, [%rd60]; mul.f32 %f102, %f101, %f32; sub.f32 %f103, %f98, %f1; sub.f32 %f104, %f99, %f2; sub.f32 %f105, %f100, %f3; fma.rn.f32 %f16, %f102, %f103, %f128; fma.rn.f32 %f17, %f102, %f104, %f129; fma.rn.f32 %f18, %f102, %f105, %f130; add.s32 %r28, %r3, 1; @%p20 bra BB0_20; rem.s32 %r109, %r28, %r34; add.s32 %r110, %r109, %r34; rem.s32 %r127, %r110, %r34; bra.uni BB0_21; BB0_20: add.s32 %r111, %r34, -1; min.s32 %r127, %r28, %r111; BB0_21: mad.lo.s32 %r112, %r127, %r33, %r2; mad.lo.s32 %r113, %r112, %r32, %r1; cvt.s64.s32 %rd61, %r113; mul.wide.s32 %rd62, %r113, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f106, [%rd63]; ld.global.nc.f32 %f107, [%rd64]; mul.f32 %f108, %f107, %f107; fma.rn.f32 %f109, %f106, %f106, %f108; ld.global.nc.f32 %f110, [%rd65]; fma.rn.f32 %f111, %f110, %f110, %f109; setp.eq.f32 %p24, %f111, 0f00000000; selp.f32 %f112, %f1, %f106, %p24; selp.f32 %f113, %f2, %f107, %p24; selp.f32 %f114, %f3, %f110, %p24; add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs23, [%rd66]; setp.gt.u16 %p25, %rs23, %rs1; cvt.u32.u16 %r114, %rs23; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p25; selp.b32 %r117, %r115, %r7, %p25; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd67, %r121, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f115, [%rd68]; mul.f32 %f116, 
%f115, %f32; sub.f32 %f117, %f112, %f1; sub.f32 %f118, %f113, %f2; sub.f32 %f119, %f114, %f3; fma.rn.f32 %f128, %f116, %f117, %f16; fma.rn.f32 %f129, %f116, %f118, %f17; fma.rn.f32 %f130, %f116, %f119, %f18; BB0_22: setp.eq.s64 %p26, %rd9, 0; @%p26 bra BB0_24; cvta.to.global.u64 %rd69, %rd9; add.s64 %rd71, %rd69, %rd15; ld.global.nc.f32 %f120, [%rd71]; mul.f32 %f131, %f120, %f131; BB0_24: setp.eq.f32 %p27, %f131, 0f00000000; mov.f32 %f132, 0f00000000; @%p27 bra BB0_26; rcp.rn.f32 %f132, %f131; BB0_26: cvta.to.global.u64 %rd72, %rd8; cvta.to.global.u64 %rd73, %rd7; cvta.to.global.u64 %rd74, %rd6; add.s64 %rd76, %rd74, %rd15; ld.global.f32 %f122, [%rd76]; fma.rn.f32 %f123, %f128, %f132, %f122; st.global.f32 [%rd76], %f123; add.s64 %rd77, %rd73, %rd15; ld.global.f32 %f124, [%rd77]; fma.rn.f32 %f125, %f129, %f132, %f124; st.global.f32 [%rd77], %f125; add.s64 %rd78, %rd72, %rd15; ld.global.f32 %f126, [%rd78]; fma.rn.f32 %f127, %f130, %f132, %f126; st.global.f32 [%rd78], %f127; BB0_27: ret; } ` addexchange_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred %p<28>; .reg .b16 %rs<26>; .reg .f32 %f<133>; .reg .b32 %r<128>; .reg .b64 %rd<79>; ld.param.u64 %rd6, [addexchange_param_0]; ld.param.u64 %rd7, [addexchange_param_1]; ld.param.u64 %rd8, [addexchange_param_2]; ld.param.u64 %rd10, [addexchange_param_3]; ld.param.u64 %rd11, [addexchange_param_4]; 
ld.param.u64 %rd12, [addexchange_param_5]; ld.param.u64 %rd9, [addexchange_param_6]; ld.param.f32 %f131, [addexchange_param_7]; ld.param.u64 %rd13, [addexchange_param_8]; ld.param.u64 %rd14, [addexchange_param_9]; ld.param.f32 %f30, [addexchange_param_10]; ld.param.f32 %f31, [addexchange_param_11]; ld.param.f32 %f32, [addexchange_param_12]; ld.param.u32 %r32, [addexchange_param_13]; ld.param.u32 %r33, [addexchange_param_14]; ld.param.u32 %r34, [addexchange_param_15]; ld.param.u8 %rs5, [addexchange_param_16]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd14; cvta.to.global.u64 %rd3, %rd12; cvta.to.global.u64 %rd4, %rd11; cvta.to.global.u64 %rd5, %rd10; mov.u32 %r35, %ntid.x; mov.u32 %r36, %ctaid.x; mov.u32 %r37, %tid.x; mad.lo.s32 %r1, %r35, %r36, %r37; mov.u32 %r38, %ntid.y; mov.u32 %r39, %ctaid.y; mov.u32 %r40, %tid.y; mad.lo.s32 %r2, %r38, %r39, %r40; mov.u32 %r41, %ntid.z; mov.u32 %r42, %ctaid.z; mov.u32 %r43, %tid.z; mad.lo.s32 %r3, %r41, %r42, %r43; setp.ge.s32 %p1, %r2, %r33; setp.ge.s32 %p2, %r1, %r32; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r34; or.pred %p5, %p3, %p4; @%p5 bra BB0_27; mul.lo.s32 %r4, %r3, %r33; add.s32 %r44, %r4, %r2; mul.lo.s32 %r5, %r44, %r32; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd15, %r6, 4; add.s64 %rd16, %rd5, %rd15; add.s64 %rd17, %rd4, %rd15; add.s64 %rd18, %rd3, %rd15; ld.global.nc.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd17]; mul.f32 %f33, %f2, %f2; fma.rn.f32 %f34, %f1, %f1, %f33; ld.global.nc.f32 %f3, [%rd18]; fma.rn.f32 %f35, %f3, %f3, %f34; setp.eq.f32 %p6, %f35, 0f00000000; @%p6 bra BB0_27; cvt.s64.s32 %rd19, %r6; add.s64 %rd20, %rd2, %rd19; ld.global.nc.u8 %rs1, [%rd20]; cvt.u32.u16 %r45, %rs1; and.b32 %r7, %r45, 255; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r46, %r8, %r32; add.s32 %r47, %r46, %r32; rem.s32 %r122, %r47, %r32; bra.uni BB0_5; BB0_4: mov.u32 %r48, 0; max.s32 %r122, %r8, %r48; BB0_5: add.s32 %r49, %r122, %r5; cvt.s64.s32 %rd21, %r49; 
mul.wide.s32 %rd22, %r49, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f36, [%rd23]; ld.global.nc.f32 %f37, [%rd24]; mul.f32 %f38, %f37, %f37; fma.rn.f32 %f39, %f36, %f36, %f38; ld.global.nc.f32 %f40, [%rd25]; fma.rn.f32 %f41, %f40, %f40, %f39; setp.eq.f32 %p8, %f41, 0f00000000; selp.f32 %f42, %f1, %f36, %p8; selp.f32 %f43, %f2, %f37, %p8; selp.f32 %f44, %f3, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, [%rd26]; setp.gt.u16 %p9, %rs6, %rs1; cvt.u32.u16 %r50, %rs6; and.b32 %r51, %r50, 255; selp.b32 %r52, %r7, %r51, %p9; selp.b32 %r53, %r51, %r7, %p9; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd27, %r57, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 %f45, [%rd28]; mul.f32 %f46, %f45, %f30; sub.f32 %f47, %f42, %f1; sub.f32 %f48, %f43, %f2; sub.f32 %f49, %f44, %f3; fma.rn.f32 %f4, %f46, %f47, 0f00000000; fma.rn.f32 %f5, %f46, %f48, 0f00000000; fma.rn.f32 %f6, %f46, %f49, 0f00000000; add.s32 %r12, %r1, 1; @%p7 bra BB0_7; rem.s32 %r58, %r12, %r32; add.s32 %r59, %r58, %r32; rem.s32 %r123, %r59, %r32; bra.uni BB0_8; BB0_7: add.s32 %r60, %r32, -1; min.s32 %r123, %r12, %r60; BB0_8: add.s32 %r61, %r123, %r5; cvt.s64.s32 %rd29, %r61; mul.wide.s32 %rd30, %r61, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, %rd4, %rd30; add.s64 %rd33, %rd3, %rd30; ld.global.nc.f32 %f50, [%rd31]; ld.global.nc.f32 %f51, [%rd32]; mul.f32 %f52, %f51, %f51; fma.rn.f32 %f53, %f50, %f50, %f52; ld.global.nc.f32 %f54, [%rd33]; fma.rn.f32 %f55, %f54, %f54, %f53; setp.eq.f32 %p11, %f55, 0f00000000; selp.f32 %f56, %f1, %f50, %p11; selp.f32 %f57, %f2, %f51, %p11; selp.f32 %f58, %f3, %f54, %p11; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs9, [%rd34]; setp.gt.u16 %p12, %rs9, %rs1; cvt.u32.u16 %r62, %rs9; and.b32 %r63, %r62, 255; selp.b32 %r64, %r7, %r63, %p12; selp.b32 %r65, %r63, %r7, %p12; add.s32 %r66, %r65, 1; mul.lo.s32 %r67, %r66, %r65; shr.u32 %r68, 
%r67, 1; add.s32 %r69, %r68, %r64; mul.wide.s32 %rd35, %r69, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f59, [%rd36]; mul.f32 %f60, %f59, %f30; sub.f32 %f61, %f56, %f1; sub.f32 %f62, %f57, %f2; sub.f32 %f63, %f58, %f3; fma.rn.f32 %f7, %f60, %f61, %f4; fma.rn.f32 %f8, %f60, %f62, %f5; fma.rn.f32 %f9, %f60, %f63, %f6; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p13, %rs3, 0; add.s32 %r16, %r2, -1; @%p13 bra BB0_10; rem.s32 %r70, %r16, %r33; add.s32 %r71, %r70, %r33; rem.s32 %r124, %r71, %r33; bra.uni BB0_11; BB0_10: mov.u32 %r72, 0; max.s32 %r124, %r16, %r72; BB0_11: add.s32 %r73, %r124, %r4; mad.lo.s32 %r74, %r73, %r32, %r1; cvt.s64.s32 %rd37, %r74; mul.wide.s32 %rd38, %r74, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f64, [%rd39]; ld.global.nc.f32 %f65, [%rd40]; mul.f32 %f66, %f65, %f65; fma.rn.f32 %f67, %f64, %f64, %f66; ld.global.nc.f32 %f68, [%rd41]; fma.rn.f32 %f69, %f68, %f68, %f67; setp.eq.f32 %p14, %f69, 0f00000000; selp.f32 %f70, %f1, %f64, %p14; selp.f32 %f71, %f2, %f65, %p14; selp.f32 %f72, %f3, %f68, %p14; add.s64 %rd42, %rd2, %rd37; ld.global.nc.u8 %rs12, [%rd42]; setp.gt.u16 %p15, %rs12, %rs1; cvt.u32.u16 %r75, %rs12; and.b32 %r76, %r75, 255; selp.b32 %r77, %r7, %r76, %p15; selp.b32 %r78, %r76, %r7, %p15; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; add.s32 %r82, %r81, %r77; mul.wide.s32 %rd43, %r82, 4; add.s64 %rd44, %rd1, %rd43; ld.global.nc.f32 %f73, [%rd44]; mul.f32 %f74, %f73, %f31; sub.f32 %f75, %f70, %f1; sub.f32 %f76, %f71, %f2; sub.f32 %f77, %f72, %f3; fma.rn.f32 %f10, %f74, %f75, %f7; fma.rn.f32 %f11, %f74, %f76, %f8; fma.rn.f32 %f12, %f74, %f77, %f9; add.s32 %r20, %r2, 1; @%p13 bra BB0_13; rem.s32 %r83, %r20, %r33; add.s32 %r84, %r83, %r33; rem.s32 %r125, %r84, %r33; bra.uni BB0_14; BB0_13: add.s32 %r85, %r33, -1; min.s32 %r125, %r20, %r85; BB0_14: add.s32 %r86, %r125, %r4; mad.lo.s32 %r87, %r86, %r32, %r1; cvt.s64.s32 %rd45, %r87; mul.wide.s32 %rd46, 
%r87, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f78, [%rd47]; ld.global.nc.f32 %f79, [%rd48]; mul.f32 %f80, %f79, %f79; fma.rn.f32 %f81, %f78, %f78, %f80; ld.global.nc.f32 %f82, [%rd49]; fma.rn.f32 %f83, %f82, %f82, %f81; setp.eq.f32 %p17, %f83, 0f00000000; selp.f32 %f84, %f1, %f78, %p17; selp.f32 %f85, %f2, %f79, %p17; selp.f32 %f86, %f3, %f82, %p17; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs16, [%rd50]; setp.gt.u16 %p18, %rs16, %rs1; cvt.u32.u16 %r88, %rs16; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p18; selp.b32 %r91, %r89, %r7, %p18; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd51, %r95, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f87, [%rd52]; mul.f32 %f88, %f87, %f31; sub.f32 %f89, %f84, %f1; sub.f32 %f90, %f85, %f2; sub.f32 %f91, %f86, %f3; fma.rn.f32 %f128, %f88, %f89, %f10; fma.rn.f32 %f129, %f88, %f90, %f11; fma.rn.f32 %f130, %f88, %f91, %f12; setp.eq.s32 %p19, %r34, 1; @%p19 bra BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p20, %rs4, 0; add.s32 %r24, %r3, -1; @%p20 bra BB0_17; rem.s32 %r96, %r24, %r34; add.s32 %r97, %r96, %r34; rem.s32 %r126, %r97, %r34; bra.uni BB0_18; BB0_17: mov.u32 %r98, 0; max.s32 %r126, %r24, %r98; BB0_18: mad.lo.s32 %r99, %r126, %r33, %r2; mad.lo.s32 %r100, %r99, %r32, %r1; cvt.s64.s32 %rd53, %r100; mul.wide.s32 %rd54, %r100, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f92, [%rd55]; ld.global.nc.f32 %f93, [%rd56]; mul.f32 %f94, %f93, %f93; fma.rn.f32 %f95, %f92, %f92, %f94; ld.global.nc.f32 %f96, [%rd57]; fma.rn.f32 %f97, %f96, %f96, %f95; setp.eq.f32 %p21, %f97, 0f00000000; selp.f32 %f98, %f1, %f92, %p21; selp.f32 %f99, %f2, %f93, %p21; selp.f32 %f100, %f3, %f96, %p21; add.s64 %rd58, %rd2, %rd53; ld.global.nc.u8 %rs19, [%rd58]; setp.gt.u16 %p22, %rs19, %rs1; cvt.u32.u16 %r101, %rs19; and.b32 %r102, %r101, 255; selp.b32 
%r103, %r7, %r102, %p22; selp.b32 %r104, %r102, %r7, %p22; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd59, %r108, 4; add.s64 %rd60, %rd1, %rd59; ld.global.nc.f32 %f101, [%rd60]; mul.f32 %f102, %f101, %f32; sub.f32 %f103, %f98, %f1; sub.f32 %f104, %f99, %f2; sub.f32 %f105, %f100, %f3; fma.rn.f32 %f16, %f102, %f103, %f128; fma.rn.f32 %f17, %f102, %f104, %f129; fma.rn.f32 %f18, %f102, %f105, %f130; add.s32 %r28, %r3, 1; @%p20 bra BB0_20; rem.s32 %r109, %r28, %r34; add.s32 %r110, %r109, %r34; rem.s32 %r127, %r110, %r34; bra.uni BB0_21; BB0_20: add.s32 %r111, %r34, -1; min.s32 %r127, %r28, %r111; BB0_21: mad.lo.s32 %r112, %r127, %r33, %r2; mad.lo.s32 %r113, %r112, %r32, %r1; cvt.s64.s32 %rd61, %r113; mul.wide.s32 %rd62, %r113, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f106, [%rd63]; ld.global.nc.f32 %f107, [%rd64]; mul.f32 %f108, %f107, %f107; fma.rn.f32 %f109, %f106, %f106, %f108; ld.global.nc.f32 %f110, [%rd65]; fma.rn.f32 %f111, %f110, %f110, %f109; setp.eq.f32 %p24, %f111, 0f00000000; selp.f32 %f112, %f1, %f106, %p24; selp.f32 %f113, %f2, %f107, %p24; selp.f32 %f114, %f3, %f110, %p24; add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs23, [%rd66]; setp.gt.u16 %p25, %rs23, %rs1; cvt.u32.u16 %r114, %rs23; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p25; selp.b32 %r117, %r115, %r7, %p25; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd67, %r121, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f115, [%rd68]; mul.f32 %f116, %f115, %f32; sub.f32 %f117, %f112, %f1; sub.f32 %f118, %f113, %f2; sub.f32 %f119, %f114, %f3; fma.rn.f32 %f128, %f116, %f117, %f16; fma.rn.f32 %f129, %f116, %f118, %f17; fma.rn.f32 %f130, %f116, %f119, %f18; BB0_22: setp.eq.s64 %p26, %rd9, 0; @%p26 bra BB0_24; cvta.to.global.u64 %rd69, %rd9; add.s64 %rd71, %rd69, %rd15; 
ld.global.nc.f32 %f120, [%rd71]; mul.f32 %f131, %f120, %f131; BB0_24: setp.eq.f32 %p27, %f131, 0f00000000; mov.f32 %f132, 0f00000000; @%p27 bra BB0_26; rcp.rn.f32 %f132, %f131; BB0_26: cvta.to.global.u64 %rd72, %rd8; cvta.to.global.u64 %rd73, %rd7; cvta.to.global.u64 %rd74, %rd6; add.s64 %rd76, %rd74, %rd15; ld.global.f32 %f122, [%rd76]; fma.rn.f32 %f123, %f128, %f132, %f122; st.global.f32 [%rd76], %f123; add.s64 %rd77, %rd73, %rd15; ld.global.f32 %f124, [%rd77]; fma.rn.f32 %f125, %f129, %f132, %f124; st.global.f32 [%rd77], %f125; add.s64 %rd78, %rd72, %rd15; ld.global.f32 %f126, [%rd78]; fma.rn.f32 %f127, %f130, %f132, %f126; st.global.f32 [%rd78], %f127; BB0_27: ret; } ` addexchange_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred %p<28>; .reg .b16 %rs<26>; .reg .f32 %f<133>; .reg .b32 %r<128>; .reg .b64 %rd<79>; ld.param.u64 %rd6, [addexchange_param_0]; ld.param.u64 %rd7, [addexchange_param_1]; ld.param.u64 %rd8, [addexchange_param_2]; ld.param.u64 %rd10, [addexchange_param_3]; ld.param.u64 %rd11, [addexchange_param_4]; ld.param.u64 %rd12, [addexchange_param_5]; ld.param.u64 %rd9, [addexchange_param_6]; ld.param.f32 %f131, [addexchange_param_7]; ld.param.u64 %rd13, [addexchange_param_8]; ld.param.u64 %rd14, [addexchange_param_9]; ld.param.f32 %f30, [addexchange_param_10]; ld.param.f32 %f31, [addexchange_param_11]; ld.param.f32 %f32, 
[addexchange_param_12]; ld.param.u32 %r32, [addexchange_param_13]; ld.param.u32 %r33, [addexchange_param_14]; ld.param.u32 %r34, [addexchange_param_15]; ld.param.u8 %rs5, [addexchange_param_16]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd14; cvta.to.global.u64 %rd3, %rd12; cvta.to.global.u64 %rd4, %rd11; cvta.to.global.u64 %rd5, %rd10; mov.u32 %r35, %ntid.x; mov.u32 %r36, %ctaid.x; mov.u32 %r37, %tid.x; mad.lo.s32 %r1, %r35, %r36, %r37; mov.u32 %r38, %ntid.y; mov.u32 %r39, %ctaid.y; mov.u32 %r40, %tid.y; mad.lo.s32 %r2, %r38, %r39, %r40; mov.u32 %r41, %ntid.z; mov.u32 %r42, %ctaid.z; mov.u32 %r43, %tid.z; mad.lo.s32 %r3, %r41, %r42, %r43; setp.ge.s32 %p1, %r2, %r33; setp.ge.s32 %p2, %r1, %r32; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r34; or.pred %p5, %p3, %p4; @%p5 bra BB0_27; mul.lo.s32 %r4, %r3, %r33; add.s32 %r44, %r4, %r2; mul.lo.s32 %r5, %r44, %r32; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd15, %r6, 4; add.s64 %rd16, %rd5, %rd15; add.s64 %rd17, %rd4, %rd15; add.s64 %rd18, %rd3, %rd15; ld.global.nc.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd17]; mul.f32 %f33, %f2, %f2; fma.rn.f32 %f34, %f1, %f1, %f33; ld.global.nc.f32 %f3, [%rd18]; fma.rn.f32 %f35, %f3, %f3, %f34; setp.eq.f32 %p6, %f35, 0f00000000; @%p6 bra BB0_27; cvt.s64.s32 %rd19, %r6; add.s64 %rd20, %rd2, %rd19; ld.global.nc.u8 %rs1, [%rd20]; cvt.u32.u16 %r45, %rs1; and.b32 %r7, %r45, 255; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r46, %r8, %r32; add.s32 %r47, %r46, %r32; rem.s32 %r122, %r47, %r32; bra.uni BB0_5; BB0_4: mov.u32 %r48, 0; max.s32 %r122, %r8, %r48; BB0_5: add.s32 %r49, %r122, %r5; cvt.s64.s32 %rd21, %r49; mul.wide.s32 %rd22, %r49, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f36, [%rd23]; ld.global.nc.f32 %f37, [%rd24]; mul.f32 %f38, %f37, %f37; fma.rn.f32 %f39, %f36, %f36, %f38; ld.global.nc.f32 %f40, [%rd25]; fma.rn.f32 %f41, %f40, %f40, %f39; setp.eq.f32 
%p8, %f41, 0f00000000; selp.f32 %f42, %f1, %f36, %p8; selp.f32 %f43, %f2, %f37, %p8; selp.f32 %f44, %f3, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, [%rd26]; setp.gt.u16 %p9, %rs6, %rs1; cvt.u32.u16 %r50, %rs6; and.b32 %r51, %r50, 255; selp.b32 %r52, %r7, %r51, %p9; selp.b32 %r53, %r51, %r7, %p9; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd27, %r57, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 %f45, [%rd28]; mul.f32 %f46, %f45, %f30; sub.f32 %f47, %f42, %f1; sub.f32 %f48, %f43, %f2; sub.f32 %f49, %f44, %f3; fma.rn.f32 %f4, %f46, %f47, 0f00000000; fma.rn.f32 %f5, %f46, %f48, 0f00000000; fma.rn.f32 %f6, %f46, %f49, 0f00000000; add.s32 %r12, %r1, 1; @%p7 bra BB0_7; rem.s32 %r58, %r12, %r32; add.s32 %r59, %r58, %r32; rem.s32 %r123, %r59, %r32; bra.uni BB0_8; BB0_7: add.s32 %r60, %r32, -1; min.s32 %r123, %r12, %r60; BB0_8: add.s32 %r61, %r123, %r5; cvt.s64.s32 %rd29, %r61; mul.wide.s32 %rd30, %r61, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, %rd4, %rd30; add.s64 %rd33, %rd3, %rd30; ld.global.nc.f32 %f50, [%rd31]; ld.global.nc.f32 %f51, [%rd32]; mul.f32 %f52, %f51, %f51; fma.rn.f32 %f53, %f50, %f50, %f52; ld.global.nc.f32 %f54, [%rd33]; fma.rn.f32 %f55, %f54, %f54, %f53; setp.eq.f32 %p11, %f55, 0f00000000; selp.f32 %f56, %f1, %f50, %p11; selp.f32 %f57, %f2, %f51, %p11; selp.f32 %f58, %f3, %f54, %p11; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs9, [%rd34]; setp.gt.u16 %p12, %rs9, %rs1; cvt.u32.u16 %r62, %rs9; and.b32 %r63, %r62, 255; selp.b32 %r64, %r7, %r63, %p12; selp.b32 %r65, %r63, %r7, %p12; add.s32 %r66, %r65, 1; mul.lo.s32 %r67, %r66, %r65; shr.u32 %r68, %r67, 1; add.s32 %r69, %r68, %r64; mul.wide.s32 %rd35, %r69, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f59, [%rd36]; mul.f32 %f60, %f59, %f30; sub.f32 %f61, %f56, %f1; sub.f32 %f62, %f57, %f2; sub.f32 %f63, %f58, %f3; fma.rn.f32 %f7, %f60, %f61, %f4; fma.rn.f32 %f8, %f60, %f62, %f5; fma.rn.f32 %f9, %f60, %f63, 
%f6; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p13, %rs3, 0; add.s32 %r16, %r2, -1; @%p13 bra BB0_10; rem.s32 %r70, %r16, %r33; add.s32 %r71, %r70, %r33; rem.s32 %r124, %r71, %r33; bra.uni BB0_11; BB0_10: mov.u32 %r72, 0; max.s32 %r124, %r16, %r72; BB0_11: add.s32 %r73, %r124, %r4; mad.lo.s32 %r74, %r73, %r32, %r1; cvt.s64.s32 %rd37, %r74; mul.wide.s32 %rd38, %r74, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f64, [%rd39]; ld.global.nc.f32 %f65, [%rd40]; mul.f32 %f66, %f65, %f65; fma.rn.f32 %f67, %f64, %f64, %f66; ld.global.nc.f32 %f68, [%rd41]; fma.rn.f32 %f69, %f68, %f68, %f67; setp.eq.f32 %p14, %f69, 0f00000000; selp.f32 %f70, %f1, %f64, %p14; selp.f32 %f71, %f2, %f65, %p14; selp.f32 %f72, %f3, %f68, %p14; add.s64 %rd42, %rd2, %rd37; ld.global.nc.u8 %rs12, [%rd42]; setp.gt.u16 %p15, %rs12, %rs1; cvt.u32.u16 %r75, %rs12; and.b32 %r76, %r75, 255; selp.b32 %r77, %r7, %r76, %p15; selp.b32 %r78, %r76, %r7, %p15; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; add.s32 %r82, %r81, %r77; mul.wide.s32 %rd43, %r82, 4; add.s64 %rd44, %rd1, %rd43; ld.global.nc.f32 %f73, [%rd44]; mul.f32 %f74, %f73, %f31; sub.f32 %f75, %f70, %f1; sub.f32 %f76, %f71, %f2; sub.f32 %f77, %f72, %f3; fma.rn.f32 %f10, %f74, %f75, %f7; fma.rn.f32 %f11, %f74, %f76, %f8; fma.rn.f32 %f12, %f74, %f77, %f9; add.s32 %r20, %r2, 1; @%p13 bra BB0_13; rem.s32 %r83, %r20, %r33; add.s32 %r84, %r83, %r33; rem.s32 %r125, %r84, %r33; bra.uni BB0_14; BB0_13: add.s32 %r85, %r33, -1; min.s32 %r125, %r20, %r85; BB0_14: add.s32 %r86, %r125, %r4; mad.lo.s32 %r87, %r86, %r32, %r1; cvt.s64.s32 %rd45, %r87; mul.wide.s32 %rd46, %r87, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f78, [%rd47]; ld.global.nc.f32 %f79, [%rd48]; mul.f32 %f80, %f79, %f79; fma.rn.f32 %f81, %f78, %f78, %f80; ld.global.nc.f32 %f82, [%rd49]; fma.rn.f32 %f83, %f82, %f82, %f81; setp.eq.f32 %p17, %f83, 0f00000000; 
selp.f32 %f84, %f1, %f78, %p17; selp.f32 %f85, %f2, %f79, %p17; selp.f32 %f86, %f3, %f82, %p17; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs16, [%rd50]; setp.gt.u16 %p18, %rs16, %rs1; cvt.u32.u16 %r88, %rs16; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p18; selp.b32 %r91, %r89, %r7, %p18; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd51, %r95, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f87, [%rd52]; mul.f32 %f88, %f87, %f31; sub.f32 %f89, %f84, %f1; sub.f32 %f90, %f85, %f2; sub.f32 %f91, %f86, %f3; fma.rn.f32 %f128, %f88, %f89, %f10; fma.rn.f32 %f129, %f88, %f90, %f11; fma.rn.f32 %f130, %f88, %f91, %f12; setp.eq.s32 %p19, %r34, 1; @%p19 bra BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p20, %rs4, 0; add.s32 %r24, %r3, -1; @%p20 bra BB0_17; rem.s32 %r96, %r24, %r34; add.s32 %r97, %r96, %r34; rem.s32 %r126, %r97, %r34; bra.uni BB0_18; BB0_17: mov.u32 %r98, 0; max.s32 %r126, %r24, %r98; BB0_18: mad.lo.s32 %r99, %r126, %r33, %r2; mad.lo.s32 %r100, %r99, %r32, %r1; cvt.s64.s32 %rd53, %r100; mul.wide.s32 %rd54, %r100, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f92, [%rd55]; ld.global.nc.f32 %f93, [%rd56]; mul.f32 %f94, %f93, %f93; fma.rn.f32 %f95, %f92, %f92, %f94; ld.global.nc.f32 %f96, [%rd57]; fma.rn.f32 %f97, %f96, %f96, %f95; setp.eq.f32 %p21, %f97, 0f00000000; selp.f32 %f98, %f1, %f92, %p21; selp.f32 %f99, %f2, %f93, %p21; selp.f32 %f100, %f3, %f96, %p21; add.s64 %rd58, %rd2, %rd53; ld.global.nc.u8 %rs19, [%rd58]; setp.gt.u16 %p22, %rs19, %rs1; cvt.u32.u16 %r101, %rs19; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p22; selp.b32 %r104, %r102, %r7, %p22; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd59, %r108, 4; add.s64 %rd60, %rd1, %rd59; ld.global.nc.f32 %f101, [%rd60]; mul.f32 %f102, %f101, %f32; sub.f32 %f103, %f98, %f1; 
sub.f32 %f104, %f99, %f2; sub.f32 %f105, %f100, %f3; fma.rn.f32 %f16, %f102, %f103, %f128; fma.rn.f32 %f17, %f102, %f104, %f129; fma.rn.f32 %f18, %f102, %f105, %f130; add.s32 %r28, %r3, 1; @%p20 bra BB0_20; rem.s32 %r109, %r28, %r34; add.s32 %r110, %r109, %r34; rem.s32 %r127, %r110, %r34; bra.uni BB0_21; BB0_20: add.s32 %r111, %r34, -1; min.s32 %r127, %r28, %r111; BB0_21: mad.lo.s32 %r112, %r127, %r33, %r2; mad.lo.s32 %r113, %r112, %r32, %r1; cvt.s64.s32 %rd61, %r113; mul.wide.s32 %rd62, %r113, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f106, [%rd63]; ld.global.nc.f32 %f107, [%rd64]; mul.f32 %f108, %f107, %f107; fma.rn.f32 %f109, %f106, %f106, %f108; ld.global.nc.f32 %f110, [%rd65]; fma.rn.f32 %f111, %f110, %f110, %f109; setp.eq.f32 %p24, %f111, 0f00000000; selp.f32 %f112, %f1, %f106, %p24; selp.f32 %f113, %f2, %f107, %p24; selp.f32 %f114, %f3, %f110, %p24; add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs23, [%rd66]; setp.gt.u16 %p25, %rs23, %rs1; cvt.u32.u16 %r114, %rs23; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p25; selp.b32 %r117, %r115, %r7, %p25; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd67, %r121, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f115, [%rd68]; mul.f32 %f116, %f115, %f32; sub.f32 %f117, %f112, %f1; sub.f32 %f118, %f113, %f2; sub.f32 %f119, %f114, %f3; fma.rn.f32 %f128, %f116, %f117, %f16; fma.rn.f32 %f129, %f116, %f118, %f17; fma.rn.f32 %f130, %f116, %f119, %f18; BB0_22: setp.eq.s64 %p26, %rd9, 0; @%p26 bra BB0_24; cvta.to.global.u64 %rd69, %rd9; add.s64 %rd71, %rd69, %rd15; ld.global.nc.f32 %f120, [%rd71]; mul.f32 %f131, %f120, %f131; BB0_24: setp.eq.f32 %p27, %f131, 0f00000000; mov.f32 %f132, 0f00000000; @%p27 bra BB0_26; rcp.rn.f32 %f132, %f131; BB0_26: cvta.to.global.u64 %rd72, %rd8; cvta.to.global.u64 %rd73, %rd7; cvta.to.global.u64 %rd74, %rd6; add.s64 %rd76, %rd74, %rd15; 
ld.global.f32 %f122, [%rd76]; fma.rn.f32 %f123, %f128, %f132, %f122; st.global.f32 [%rd76], %f123; add.s64 %rd77, %rd73, %rd15; ld.global.f32 %f124, [%rd77]; fma.rn.f32 %f125, %f129, %f132, %f124; st.global.f32 [%rd77], %f125; add.s64 %rd78, %rd72, %rd15; ld.global.f32 %f126, [%rd78]; fma.rn.f32 %f127, %f130, %f132, %f126; st.global.f32 [%rd78], %f127; BB0_27: ret; } ` addexchange_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred %p<28>; .reg .b16 %rs<26>; .reg .f32 %f<133>; .reg .b32 %r<128>; .reg .b64 %rd<79>; ld.param.u64 %rd6, [addexchange_param_0]; ld.param.u64 %rd7, [addexchange_param_1]; ld.param.u64 %rd8, [addexchange_param_2]; ld.param.u64 %rd10, [addexchange_param_3]; ld.param.u64 %rd11, [addexchange_param_4]; ld.param.u64 %rd12, [addexchange_param_5]; ld.param.u64 %rd9, [addexchange_param_6]; ld.param.f32 %f131, [addexchange_param_7]; ld.param.u64 %rd13, [addexchange_param_8]; ld.param.u64 %rd14, [addexchange_param_9]; ld.param.f32 %f30, [addexchange_param_10]; ld.param.f32 %f31, [addexchange_param_11]; ld.param.f32 %f32, [addexchange_param_12]; ld.param.u32 %r32, [addexchange_param_13]; ld.param.u32 %r33, [addexchange_param_14]; ld.param.u32 %r34, [addexchange_param_15]; ld.param.u8 %rs5, [addexchange_param_16]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd14; cvta.to.global.u64 %rd3, %rd12; cvta.to.global.u64 
%rd4, %rd11; cvta.to.global.u64 %rd5, %rd10; mov.u32 %r35, %ntid.x; mov.u32 %r36, %ctaid.x; mov.u32 %r37, %tid.x; mad.lo.s32 %r1, %r35, %r36, %r37; mov.u32 %r38, %ntid.y; mov.u32 %r39, %ctaid.y; mov.u32 %r40, %tid.y; mad.lo.s32 %r2, %r38, %r39, %r40; mov.u32 %r41, %ntid.z; mov.u32 %r42, %ctaid.z; mov.u32 %r43, %tid.z; mad.lo.s32 %r3, %r41, %r42, %r43; setp.ge.s32 %p1, %r2, %r33; setp.ge.s32 %p2, %r1, %r32; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r34; or.pred %p5, %p3, %p4; @%p5 bra BB0_27; mul.lo.s32 %r4, %r3, %r33; add.s32 %r44, %r4, %r2; mul.lo.s32 %r5, %r44, %r32; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd15, %r6, 4; add.s64 %rd16, %rd5, %rd15; add.s64 %rd17, %rd4, %rd15; add.s64 %rd18, %rd3, %rd15; ld.global.nc.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd17]; mul.f32 %f33, %f2, %f2; fma.rn.f32 %f34, %f1, %f1, %f33; ld.global.nc.f32 %f3, [%rd18]; fma.rn.f32 %f35, %f3, %f3, %f34; setp.eq.f32 %p6, %f35, 0f00000000; @%p6 bra BB0_27; cvt.s64.s32 %rd19, %r6; add.s64 %rd20, %rd2, %rd19; ld.global.nc.u8 %rs1, [%rd20]; cvt.u32.u16 %r45, %rs1; and.b32 %r7, %r45, 255; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r46, %r8, %r32; add.s32 %r47, %r46, %r32; rem.s32 %r122, %r47, %r32; bra.uni BB0_5; BB0_4: mov.u32 %r48, 0; max.s32 %r122, %r8, %r48; BB0_5: add.s32 %r49, %r122, %r5; cvt.s64.s32 %rd21, %r49; mul.wide.s32 %rd22, %r49, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f36, [%rd23]; ld.global.nc.f32 %f37, [%rd24]; mul.f32 %f38, %f37, %f37; fma.rn.f32 %f39, %f36, %f36, %f38; ld.global.nc.f32 %f40, [%rd25]; fma.rn.f32 %f41, %f40, %f40, %f39; setp.eq.f32 %p8, %f41, 0f00000000; selp.f32 %f42, %f1, %f36, %p8; selp.f32 %f43, %f2, %f37, %p8; selp.f32 %f44, %f3, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, [%rd26]; setp.gt.u16 %p9, %rs6, %rs1; cvt.u32.u16 %r50, %rs6; and.b32 %r51, %r50, 255; selp.b32 %r52, %r7, %r51, %p9; selp.b32 %r53, %r51, %r7, 
%p9; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd27, %r57, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 %f45, [%rd28]; mul.f32 %f46, %f45, %f30; sub.f32 %f47, %f42, %f1; sub.f32 %f48, %f43, %f2; sub.f32 %f49, %f44, %f3; fma.rn.f32 %f4, %f46, %f47, 0f00000000; fma.rn.f32 %f5, %f46, %f48, 0f00000000; fma.rn.f32 %f6, %f46, %f49, 0f00000000; add.s32 %r12, %r1, 1; @%p7 bra BB0_7; rem.s32 %r58, %r12, %r32; add.s32 %r59, %r58, %r32; rem.s32 %r123, %r59, %r32; bra.uni BB0_8; BB0_7: add.s32 %r60, %r32, -1; min.s32 %r123, %r12, %r60; BB0_8: add.s32 %r61, %r123, %r5; cvt.s64.s32 %rd29, %r61; mul.wide.s32 %rd30, %r61, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, %rd4, %rd30; add.s64 %rd33, %rd3, %rd30; ld.global.nc.f32 %f50, [%rd31]; ld.global.nc.f32 %f51, [%rd32]; mul.f32 %f52, %f51, %f51; fma.rn.f32 %f53, %f50, %f50, %f52; ld.global.nc.f32 %f54, [%rd33]; fma.rn.f32 %f55, %f54, %f54, %f53; setp.eq.f32 %p11, %f55, 0f00000000; selp.f32 %f56, %f1, %f50, %p11; selp.f32 %f57, %f2, %f51, %p11; selp.f32 %f58, %f3, %f54, %p11; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs9, [%rd34]; setp.gt.u16 %p12, %rs9, %rs1; cvt.u32.u16 %r62, %rs9; and.b32 %r63, %r62, 255; selp.b32 %r64, %r7, %r63, %p12; selp.b32 %r65, %r63, %r7, %p12; add.s32 %r66, %r65, 1; mul.lo.s32 %r67, %r66, %r65; shr.u32 %r68, %r67, 1; add.s32 %r69, %r68, %r64; mul.wide.s32 %rd35, %r69, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f59, [%rd36]; mul.f32 %f60, %f59, %f30; sub.f32 %f61, %f56, %f1; sub.f32 %f62, %f57, %f2; sub.f32 %f63, %f58, %f3; fma.rn.f32 %f7, %f60, %f61, %f4; fma.rn.f32 %f8, %f60, %f62, %f5; fma.rn.f32 %f9, %f60, %f63, %f6; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p13, %rs3, 0; add.s32 %r16, %r2, -1; @%p13 bra BB0_10; rem.s32 %r70, %r16, %r33; add.s32 %r71, %r70, %r33; rem.s32 %r124, %r71, %r33; bra.uni BB0_11; BB0_10: mov.u32 %r72, 0; max.s32 %r124, %r16, %r72; BB0_11: add.s32 %r73, %r124, %r4; mad.lo.s32 %r74, %r73, %r32, %r1; 
cvt.s64.s32 %rd37, %r74; mul.wide.s32 %rd38, %r74, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f64, [%rd39]; ld.global.nc.f32 %f65, [%rd40]; mul.f32 %f66, %f65, %f65; fma.rn.f32 %f67, %f64, %f64, %f66; ld.global.nc.f32 %f68, [%rd41]; fma.rn.f32 %f69, %f68, %f68, %f67; setp.eq.f32 %p14, %f69, 0f00000000; selp.f32 %f70, %f1, %f64, %p14; selp.f32 %f71, %f2, %f65, %p14; selp.f32 %f72, %f3, %f68, %p14; add.s64 %rd42, %rd2, %rd37; ld.global.nc.u8 %rs12, [%rd42]; setp.gt.u16 %p15, %rs12, %rs1; cvt.u32.u16 %r75, %rs12; and.b32 %r76, %r75, 255; selp.b32 %r77, %r7, %r76, %p15; selp.b32 %r78, %r76, %r7, %p15; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; add.s32 %r82, %r81, %r77; mul.wide.s32 %rd43, %r82, 4; add.s64 %rd44, %rd1, %rd43; ld.global.nc.f32 %f73, [%rd44]; mul.f32 %f74, %f73, %f31; sub.f32 %f75, %f70, %f1; sub.f32 %f76, %f71, %f2; sub.f32 %f77, %f72, %f3; fma.rn.f32 %f10, %f74, %f75, %f7; fma.rn.f32 %f11, %f74, %f76, %f8; fma.rn.f32 %f12, %f74, %f77, %f9; add.s32 %r20, %r2, 1; @%p13 bra BB0_13; rem.s32 %r83, %r20, %r33; add.s32 %r84, %r83, %r33; rem.s32 %r125, %r84, %r33; bra.uni BB0_14; BB0_13: add.s32 %r85, %r33, -1; min.s32 %r125, %r20, %r85; BB0_14: add.s32 %r86, %r125, %r4; mad.lo.s32 %r87, %r86, %r32, %r1; cvt.s64.s32 %rd45, %r87; mul.wide.s32 %rd46, %r87, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f78, [%rd47]; ld.global.nc.f32 %f79, [%rd48]; mul.f32 %f80, %f79, %f79; fma.rn.f32 %f81, %f78, %f78, %f80; ld.global.nc.f32 %f82, [%rd49]; fma.rn.f32 %f83, %f82, %f82, %f81; setp.eq.f32 %p17, %f83, 0f00000000; selp.f32 %f84, %f1, %f78, %p17; selp.f32 %f85, %f2, %f79, %p17; selp.f32 %f86, %f3, %f82, %p17; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs16, [%rd50]; setp.gt.u16 %p18, %rs16, %rs1; cvt.u32.u16 %r88, %rs16; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p18; selp.b32 %r91, %r89, %r7, %p18; add.s32 
%r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd51, %r95, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f87, [%rd52]; mul.f32 %f88, %f87, %f31; sub.f32 %f89, %f84, %f1; sub.f32 %f90, %f85, %f2; sub.f32 %f91, %f86, %f3; fma.rn.f32 %f128, %f88, %f89, %f10; fma.rn.f32 %f129, %f88, %f90, %f11; fma.rn.f32 %f130, %f88, %f91, %f12; setp.eq.s32 %p19, %r34, 1; @%p19 bra BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p20, %rs4, 0; add.s32 %r24, %r3, -1; @%p20 bra BB0_17; rem.s32 %r96, %r24, %r34; add.s32 %r97, %r96, %r34; rem.s32 %r126, %r97, %r34; bra.uni BB0_18; BB0_17: mov.u32 %r98, 0; max.s32 %r126, %r24, %r98; BB0_18: mad.lo.s32 %r99, %r126, %r33, %r2; mad.lo.s32 %r100, %r99, %r32, %r1; cvt.s64.s32 %rd53, %r100; mul.wide.s32 %rd54, %r100, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f92, [%rd55]; ld.global.nc.f32 %f93, [%rd56]; mul.f32 %f94, %f93, %f93; fma.rn.f32 %f95, %f92, %f92, %f94; ld.global.nc.f32 %f96, [%rd57]; fma.rn.f32 %f97, %f96, %f96, %f95; setp.eq.f32 %p21, %f97, 0f00000000; selp.f32 %f98, %f1, %f92, %p21; selp.f32 %f99, %f2, %f93, %p21; selp.f32 %f100, %f3, %f96, %p21; add.s64 %rd58, %rd2, %rd53; ld.global.nc.u8 %rs19, [%rd58]; setp.gt.u16 %p22, %rs19, %rs1; cvt.u32.u16 %r101, %rs19; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p22; selp.b32 %r104, %r102, %r7, %p22; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd59, %r108, 4; add.s64 %rd60, %rd1, %rd59; ld.global.nc.f32 %f101, [%rd60]; mul.f32 %f102, %f101, %f32; sub.f32 %f103, %f98, %f1; sub.f32 %f104, %f99, %f2; sub.f32 %f105, %f100, %f3; fma.rn.f32 %f16, %f102, %f103, %f128; fma.rn.f32 %f17, %f102, %f104, %f129; fma.rn.f32 %f18, %f102, %f105, %f130; add.s32 %r28, %r3, 1; @%p20 bra BB0_20; rem.s32 %r109, %r28, %r34; add.s32 %r110, %r109, %r34; rem.s32 %r127, %r110, %r34; bra.uni BB0_21; BB0_20: 
add.s32 %r111, %r34, -1; min.s32 %r127, %r28, %r111; BB0_21: mad.lo.s32 %r112, %r127, %r33, %r2; mad.lo.s32 %r113, %r112, %r32, %r1; cvt.s64.s32 %rd61, %r113; mul.wide.s32 %rd62, %r113, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f106, [%rd63]; ld.global.nc.f32 %f107, [%rd64]; mul.f32 %f108, %f107, %f107; fma.rn.f32 %f109, %f106, %f106, %f108; ld.global.nc.f32 %f110, [%rd65]; fma.rn.f32 %f111, %f110, %f110, %f109; setp.eq.f32 %p24, %f111, 0f00000000; selp.f32 %f112, %f1, %f106, %p24; selp.f32 %f113, %f2, %f107, %p24; selp.f32 %f114, %f3, %f110, %p24; add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs23, [%rd66]; setp.gt.u16 %p25, %rs23, %rs1; cvt.u32.u16 %r114, %rs23; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p25; selp.b32 %r117, %r115, %r7, %p25; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd67, %r121, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f115, [%rd68]; mul.f32 %f116, %f115, %f32; sub.f32 %f117, %f112, %f1; sub.f32 %f118, %f113, %f2; sub.f32 %f119, %f114, %f3; fma.rn.f32 %f128, %f116, %f117, %f16; fma.rn.f32 %f129, %f116, %f118, %f17; fma.rn.f32 %f130, %f116, %f119, %f18; BB0_22: setp.eq.s64 %p26, %rd9, 0; @%p26 bra BB0_24; cvta.to.global.u64 %rd69, %rd9; add.s64 %rd71, %rd69, %rd15; ld.global.nc.f32 %f120, [%rd71]; mul.f32 %f131, %f120, %f131; BB0_24: setp.eq.f32 %p27, %f131, 0f00000000; mov.f32 %f132, 0f00000000; @%p27 bra BB0_26; rcp.rn.f32 %f132, %f131; BB0_26: cvta.to.global.u64 %rd72, %rd8; cvta.to.global.u64 %rd73, %rd7; cvta.to.global.u64 %rd74, %rd6; add.s64 %rd76, %rd74, %rd15; ld.global.f32 %f122, [%rd76]; fma.rn.f32 %f123, %f128, %f132, %f122; st.global.f32 [%rd76], %f123; add.s64 %rd77, %rd73, %rd15; ld.global.f32 %f124, [%rd77]; fma.rn.f32 %f125, %f129, %f132, %f124; st.global.f32 [%rd77], %f125; add.s64 %rd78, %rd72, %rd15; ld.global.f32 %f126, [%rd78]; fma.rn.f32 %f127, %f130, %f132, 
%f126; st.global.f32 [%rd78], %f127; BB0_27: ret; } ` addexchange_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred %p<28>; .reg .b16 %rs<26>; .reg .f32 %f<133>; .reg .b32 %r<128>; .reg .b64 %rd<79>; ld.param.u64 %rd6, [addexchange_param_0]; ld.param.u64 %rd7, [addexchange_param_1]; ld.param.u64 %rd8, [addexchange_param_2]; ld.param.u64 %rd10, [addexchange_param_3]; ld.param.u64 %rd11, [addexchange_param_4]; ld.param.u64 %rd12, [addexchange_param_5]; ld.param.u64 %rd9, [addexchange_param_6]; ld.param.f32 %f131, [addexchange_param_7]; ld.param.u64 %rd13, [addexchange_param_8]; ld.param.u64 %rd14, [addexchange_param_9]; ld.param.f32 %f30, [addexchange_param_10]; ld.param.f32 %f31, [addexchange_param_11]; ld.param.f32 %f32, [addexchange_param_12]; ld.param.u32 %r32, [addexchange_param_13]; ld.param.u32 %r33, [addexchange_param_14]; ld.param.u32 %r34, [addexchange_param_15]; ld.param.u8 %rs5, [addexchange_param_16]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd14; cvta.to.global.u64 %rd3, %rd12; cvta.to.global.u64 %rd4, %rd11; cvta.to.global.u64 %rd5, %rd10; mov.u32 %r35, %ntid.x; mov.u32 %r36, %ctaid.x; mov.u32 %r37, %tid.x; mad.lo.s32 %r1, %r35, %r36, %r37; mov.u32 %r38, %ntid.y; mov.u32 %r39, %ctaid.y; mov.u32 %r40, %tid.y; mad.lo.s32 %r2, %r38, %r39, %r40; mov.u32 %r41, %ntid.z; mov.u32 %r42, %ctaid.z; mov.u32 %r43, %tid.z; 
mad.lo.s32 %r3, %r41, %r42, %r43; setp.ge.s32 %p1, %r2, %r33; setp.ge.s32 %p2, %r1, %r32; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r34; or.pred %p5, %p3, %p4; @%p5 bra BB0_27; mul.lo.s32 %r4, %r3, %r33; add.s32 %r44, %r4, %r2; mul.lo.s32 %r5, %r44, %r32; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd15, %r6, 4; add.s64 %rd16, %rd5, %rd15; add.s64 %rd17, %rd4, %rd15; add.s64 %rd18, %rd3, %rd15; ld.global.nc.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd17]; mul.f32 %f33, %f2, %f2; fma.rn.f32 %f34, %f1, %f1, %f33; ld.global.nc.f32 %f3, [%rd18]; fma.rn.f32 %f35, %f3, %f3, %f34; setp.eq.f32 %p6, %f35, 0f00000000; @%p6 bra BB0_27; cvt.s64.s32 %rd19, %r6; add.s64 %rd20, %rd2, %rd19; ld.global.nc.u8 %rs1, [%rd20]; cvt.u32.u16 %r45, %rs1; and.b32 %r7, %r45, 255; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r46, %r8, %r32; add.s32 %r47, %r46, %r32; rem.s32 %r122, %r47, %r32; bra.uni BB0_5; BB0_4: mov.u32 %r48, 0; max.s32 %r122, %r8, %r48; BB0_5: add.s32 %r49, %r122, %r5; cvt.s64.s32 %rd21, %r49; mul.wide.s32 %rd22, %r49, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f36, [%rd23]; ld.global.nc.f32 %f37, [%rd24]; mul.f32 %f38, %f37, %f37; fma.rn.f32 %f39, %f36, %f36, %f38; ld.global.nc.f32 %f40, [%rd25]; fma.rn.f32 %f41, %f40, %f40, %f39; setp.eq.f32 %p8, %f41, 0f00000000; selp.f32 %f42, %f1, %f36, %p8; selp.f32 %f43, %f2, %f37, %p8; selp.f32 %f44, %f3, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, [%rd26]; setp.gt.u16 %p9, %rs6, %rs1; cvt.u32.u16 %r50, %rs6; and.b32 %r51, %r50, 255; selp.b32 %r52, %r7, %r51, %p9; selp.b32 %r53, %r51, %r7, %p9; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd27, %r57, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 %f45, [%rd28]; mul.f32 %f46, %f45, %f30; sub.f32 %f47, %f42, %f1; sub.f32 %f48, %f43, %f2; sub.f32 %f49, %f44, %f3; fma.rn.f32 %f4, %f46, %f47, 
0f00000000; fma.rn.f32 %f5, %f46, %f48, 0f00000000; fma.rn.f32 %f6, %f46, %f49, 0f00000000; add.s32 %r12, %r1, 1; @%p7 bra BB0_7; rem.s32 %r58, %r12, %r32; add.s32 %r59, %r58, %r32; rem.s32 %r123, %r59, %r32; bra.uni BB0_8; BB0_7: add.s32 %r60, %r32, -1; min.s32 %r123, %r12, %r60; BB0_8: add.s32 %r61, %r123, %r5; cvt.s64.s32 %rd29, %r61; mul.wide.s32 %rd30, %r61, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, %rd4, %rd30; add.s64 %rd33, %rd3, %rd30; ld.global.nc.f32 %f50, [%rd31]; ld.global.nc.f32 %f51, [%rd32]; mul.f32 %f52, %f51, %f51; fma.rn.f32 %f53, %f50, %f50, %f52; ld.global.nc.f32 %f54, [%rd33]; fma.rn.f32 %f55, %f54, %f54, %f53; setp.eq.f32 %p11, %f55, 0f00000000; selp.f32 %f56, %f1, %f50, %p11; selp.f32 %f57, %f2, %f51, %p11; selp.f32 %f58, %f3, %f54, %p11; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs9, [%rd34]; setp.gt.u16 %p12, %rs9, %rs1; cvt.u32.u16 %r62, %rs9; and.b32 %r63, %r62, 255; selp.b32 %r64, %r7, %r63, %p12; selp.b32 %r65, %r63, %r7, %p12; add.s32 %r66, %r65, 1; mul.lo.s32 %r67, %r66, %r65; shr.u32 %r68, %r67, 1; add.s32 %r69, %r68, %r64; mul.wide.s32 %rd35, %r69, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f59, [%rd36]; mul.f32 %f60, %f59, %f30; sub.f32 %f61, %f56, %f1; sub.f32 %f62, %f57, %f2; sub.f32 %f63, %f58, %f3; fma.rn.f32 %f7, %f60, %f61, %f4; fma.rn.f32 %f8, %f60, %f62, %f5; fma.rn.f32 %f9, %f60, %f63, %f6; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p13, %rs3, 0; add.s32 %r16, %r2, -1; @%p13 bra BB0_10; rem.s32 %r70, %r16, %r33; add.s32 %r71, %r70, %r33; rem.s32 %r124, %r71, %r33; bra.uni BB0_11; BB0_10: mov.u32 %r72, 0; max.s32 %r124, %r16, %r72; BB0_11: add.s32 %r73, %r124, %r4; mad.lo.s32 %r74, %r73, %r32, %r1; cvt.s64.s32 %rd37, %r74; mul.wide.s32 %rd38, %r74, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f64, [%rd39]; ld.global.nc.f32 %f65, [%rd40]; mul.f32 %f66, %f65, %f65; fma.rn.f32 %f67, %f64, %f64, %f66; ld.global.nc.f32 %f68, [%rd41]; fma.rn.f32 %f69, %f68, %f68, 
%f67; setp.eq.f32 %p14, %f69, 0f00000000; selp.f32 %f70, %f1, %f64, %p14; selp.f32 %f71, %f2, %f65, %p14; selp.f32 %f72, %f3, %f68, %p14; add.s64 %rd42, %rd2, %rd37; ld.global.nc.u8 %rs12, [%rd42]; setp.gt.u16 %p15, %rs12, %rs1; cvt.u32.u16 %r75, %rs12; and.b32 %r76, %r75, 255; selp.b32 %r77, %r7, %r76, %p15; selp.b32 %r78, %r76, %r7, %p15; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; add.s32 %r82, %r81, %r77; mul.wide.s32 %rd43, %r82, 4; add.s64 %rd44, %rd1, %rd43; ld.global.nc.f32 %f73, [%rd44]; mul.f32 %f74, %f73, %f31; sub.f32 %f75, %f70, %f1; sub.f32 %f76, %f71, %f2; sub.f32 %f77, %f72, %f3; fma.rn.f32 %f10, %f74, %f75, %f7; fma.rn.f32 %f11, %f74, %f76, %f8; fma.rn.f32 %f12, %f74, %f77, %f9; add.s32 %r20, %r2, 1; @%p13 bra BB0_13; rem.s32 %r83, %r20, %r33; add.s32 %r84, %r83, %r33; rem.s32 %r125, %r84, %r33; bra.uni BB0_14; BB0_13: add.s32 %r85, %r33, -1; min.s32 %r125, %r20, %r85; BB0_14: add.s32 %r86, %r125, %r4; mad.lo.s32 %r87, %r86, %r32, %r1; cvt.s64.s32 %rd45, %r87; mul.wide.s32 %rd46, %r87, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f78, [%rd47]; ld.global.nc.f32 %f79, [%rd48]; mul.f32 %f80, %f79, %f79; fma.rn.f32 %f81, %f78, %f78, %f80; ld.global.nc.f32 %f82, [%rd49]; fma.rn.f32 %f83, %f82, %f82, %f81; setp.eq.f32 %p17, %f83, 0f00000000; selp.f32 %f84, %f1, %f78, %p17; selp.f32 %f85, %f2, %f79, %p17; selp.f32 %f86, %f3, %f82, %p17; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs16, [%rd50]; setp.gt.u16 %p18, %rs16, %rs1; cvt.u32.u16 %r88, %rs16; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p18; selp.b32 %r91, %r89, %r7, %p18; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd51, %r95, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f87, [%rd52]; mul.f32 %f88, %f87, %f31; sub.f32 %f89, %f84, %f1; sub.f32 %f90, %f85, %f2; sub.f32 %f91, %f86, %f3; fma.rn.f32 %f128, %f88, %f89, %f10; 
fma.rn.f32 %f129, %f88, %f90, %f11; fma.rn.f32 %f130, %f88, %f91, %f12; setp.eq.s32 %p19, %r34, 1; @%p19 bra BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p20, %rs4, 0; add.s32 %r24, %r3, -1; @%p20 bra BB0_17; rem.s32 %r96, %r24, %r34; add.s32 %r97, %r96, %r34; rem.s32 %r126, %r97, %r34; bra.uni BB0_18; BB0_17: mov.u32 %r98, 0; max.s32 %r126, %r24, %r98; BB0_18: mad.lo.s32 %r99, %r126, %r33, %r2; mad.lo.s32 %r100, %r99, %r32, %r1; cvt.s64.s32 %rd53, %r100; mul.wide.s32 %rd54, %r100, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f92, [%rd55]; ld.global.nc.f32 %f93, [%rd56]; mul.f32 %f94, %f93, %f93; fma.rn.f32 %f95, %f92, %f92, %f94; ld.global.nc.f32 %f96, [%rd57]; fma.rn.f32 %f97, %f96, %f96, %f95; setp.eq.f32 %p21, %f97, 0f00000000; selp.f32 %f98, %f1, %f92, %p21; selp.f32 %f99, %f2, %f93, %p21; selp.f32 %f100, %f3, %f96, %p21; add.s64 %rd58, %rd2, %rd53; ld.global.nc.u8 %rs19, [%rd58]; setp.gt.u16 %p22, %rs19, %rs1; cvt.u32.u16 %r101, %rs19; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p22; selp.b32 %r104, %r102, %r7, %p22; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd59, %r108, 4; add.s64 %rd60, %rd1, %rd59; ld.global.nc.f32 %f101, [%rd60]; mul.f32 %f102, %f101, %f32; sub.f32 %f103, %f98, %f1; sub.f32 %f104, %f99, %f2; sub.f32 %f105, %f100, %f3; fma.rn.f32 %f16, %f102, %f103, %f128; fma.rn.f32 %f17, %f102, %f104, %f129; fma.rn.f32 %f18, %f102, %f105, %f130; add.s32 %r28, %r3, 1; @%p20 bra BB0_20; rem.s32 %r109, %r28, %r34; add.s32 %r110, %r109, %r34; rem.s32 %r127, %r110, %r34; bra.uni BB0_21; BB0_20: add.s32 %r111, %r34, -1; min.s32 %r127, %r28, %r111; BB0_21: mad.lo.s32 %r112, %r127, %r33, %r2; mad.lo.s32 %r113, %r112, %r32, %r1; cvt.s64.s32 %rd61, %r113; mul.wide.s32 %rd62, %r113, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f106, [%rd63]; 
ld.global.nc.f32 %f107, [%rd64]; mul.f32 %f108, %f107, %f107; fma.rn.f32 %f109, %f106, %f106, %f108; ld.global.nc.f32 %f110, [%rd65]; fma.rn.f32 %f111, %f110, %f110, %f109; setp.eq.f32 %p24, %f111, 0f00000000; selp.f32 %f112, %f1, %f106, %p24; selp.f32 %f113, %f2, %f107, %p24; selp.f32 %f114, %f3, %f110, %p24; add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs23, [%rd66]; setp.gt.u16 %p25, %rs23, %rs1; cvt.u32.u16 %r114, %rs23; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p25; selp.b32 %r117, %r115, %r7, %p25; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd67, %r121, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f115, [%rd68]; mul.f32 %f116, %f115, %f32; sub.f32 %f117, %f112, %f1; sub.f32 %f118, %f113, %f2; sub.f32 %f119, %f114, %f3; fma.rn.f32 %f128, %f116, %f117, %f16; fma.rn.f32 %f129, %f116, %f118, %f17; fma.rn.f32 %f130, %f116, %f119, %f18; BB0_22: setp.eq.s64 %p26, %rd9, 0; @%p26 bra BB0_24; cvta.to.global.u64 %rd69, %rd9; add.s64 %rd71, %rd69, %rd15; ld.global.nc.f32 %f120, [%rd71]; mul.f32 %f131, %f120, %f131; BB0_24: setp.eq.f32 %p27, %f131, 0f00000000; mov.f32 %f132, 0f00000000; @%p27 bra BB0_26; rcp.rn.f32 %f132, %f131; BB0_26: cvta.to.global.u64 %rd72, %rd8; cvta.to.global.u64 %rd73, %rd7; cvta.to.global.u64 %rd74, %rd6; add.s64 %rd76, %rd74, %rd15; ld.global.f32 %f122, [%rd76]; fma.rn.f32 %f123, %f128, %f132, %f122; st.global.f32 [%rd76], %f123; add.s64 %rd77, %rd73, %rd15; ld.global.f32 %f124, [%rd77]; fma.rn.f32 %f125, %f129, %f132, %f124; st.global.f32 [%rd77], %f125; add.s64 %rd78, %rd72, %rd15; ld.global.f32 %f126, [%rd78]; fma.rn.f32 %f127, %f130, %f132, %f126; st.global.f32 [%rd78], %f127; BB0_27: ret; } ` addexchange_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, 
.param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred %p<28>; .reg .b16 %rs<26>; .reg .f32 %f<133>; .reg .b32 %r<128>; .reg .b64 %rd<79>; ld.param.u64 %rd6, [addexchange_param_0]; ld.param.u64 %rd7, [addexchange_param_1]; ld.param.u64 %rd8, [addexchange_param_2]; ld.param.u64 %rd10, [addexchange_param_3]; ld.param.u64 %rd11, [addexchange_param_4]; ld.param.u64 %rd12, [addexchange_param_5]; ld.param.u64 %rd9, [addexchange_param_6]; ld.param.f32 %f131, [addexchange_param_7]; ld.param.u64 %rd13, [addexchange_param_8]; ld.param.u64 %rd14, [addexchange_param_9]; ld.param.f32 %f30, [addexchange_param_10]; ld.param.f32 %f31, [addexchange_param_11]; ld.param.f32 %f32, [addexchange_param_12]; ld.param.u32 %r32, [addexchange_param_13]; ld.param.u32 %r33, [addexchange_param_14]; ld.param.u32 %r34, [addexchange_param_15]; ld.param.u8 %rs5, [addexchange_param_16]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd14; cvta.to.global.u64 %rd3, %rd12; cvta.to.global.u64 %rd4, %rd11; cvta.to.global.u64 %rd5, %rd10; mov.u32 %r35, %ntid.x; mov.u32 %r36, %ctaid.x; mov.u32 %r37, %tid.x; mad.lo.s32 %r1, %r35, %r36, %r37; mov.u32 %r38, %ntid.y; mov.u32 %r39, %ctaid.y; mov.u32 %r40, %tid.y; mad.lo.s32 %r2, %r38, %r39, %r40; mov.u32 %r41, %ntid.z; mov.u32 %r42, %ctaid.z; mov.u32 %r43, %tid.z; mad.lo.s32 %r3, %r41, %r42, %r43; setp.ge.s32 %p1, %r2, %r33; setp.ge.s32 %p2, %r1, %r32; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r34; or.pred %p5, %p3, %p4; @%p5 bra BB0_27; mul.lo.s32 %r4, %r3, %r33; add.s32 %r44, %r4, %r2; mul.lo.s32 %r5, %r44, %r32; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd15, 
%r6, 4; add.s64 %rd16, %rd5, %rd15; add.s64 %rd17, %rd4, %rd15; add.s64 %rd18, %rd3, %rd15; ld.global.nc.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd17]; mul.f32 %f33, %f2, %f2; fma.rn.f32 %f34, %f1, %f1, %f33; ld.global.nc.f32 %f3, [%rd18]; fma.rn.f32 %f35, %f3, %f3, %f34; setp.eq.f32 %p6, %f35, 0f00000000; @%p6 bra BB0_27; cvt.s64.s32 %rd19, %r6; add.s64 %rd20, %rd2, %rd19; ld.global.nc.u8 %rs1, [%rd20]; cvt.u32.u16 %r45, %rs1; and.b32 %r7, %r45, 255; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r46, %r8, %r32; add.s32 %r47, %r46, %r32; rem.s32 %r122, %r47, %r32; bra.uni BB0_5; BB0_4: mov.u32 %r48, 0; max.s32 %r122, %r8, %r48; BB0_5: add.s32 %r49, %r122, %r5; cvt.s64.s32 %rd21, %r49; mul.wide.s32 %rd22, %r49, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f36, [%rd23]; ld.global.nc.f32 %f37, [%rd24]; mul.f32 %f38, %f37, %f37; fma.rn.f32 %f39, %f36, %f36, %f38; ld.global.nc.f32 %f40, [%rd25]; fma.rn.f32 %f41, %f40, %f40, %f39; setp.eq.f32 %p8, %f41, 0f00000000; selp.f32 %f42, %f1, %f36, %p8; selp.f32 %f43, %f2, %f37, %p8; selp.f32 %f44, %f3, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, [%rd26]; setp.gt.u16 %p9, %rs6, %rs1; cvt.u32.u16 %r50, %rs6; and.b32 %r51, %r50, 255; selp.b32 %r52, %r7, %r51, %p9; selp.b32 %r53, %r51, %r7, %p9; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd27, %r57, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 %f45, [%rd28]; mul.f32 %f46, %f45, %f30; sub.f32 %f47, %f42, %f1; sub.f32 %f48, %f43, %f2; sub.f32 %f49, %f44, %f3; fma.rn.f32 %f4, %f46, %f47, 0f00000000; fma.rn.f32 %f5, %f46, %f48, 0f00000000; fma.rn.f32 %f6, %f46, %f49, 0f00000000; add.s32 %r12, %r1, 1; @%p7 bra BB0_7; rem.s32 %r58, %r12, %r32; add.s32 %r59, %r58, %r32; rem.s32 %r123, %r59, %r32; bra.uni BB0_8; BB0_7: add.s32 %r60, %r32, -1; min.s32 %r123, %r12, %r60; BB0_8: add.s32 %r61, 
%r123, %r5; cvt.s64.s32 %rd29, %r61; mul.wide.s32 %rd30, %r61, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, %rd4, %rd30; add.s64 %rd33, %rd3, %rd30; ld.global.nc.f32 %f50, [%rd31]; ld.global.nc.f32 %f51, [%rd32]; mul.f32 %f52, %f51, %f51; fma.rn.f32 %f53, %f50, %f50, %f52; ld.global.nc.f32 %f54, [%rd33]; fma.rn.f32 %f55, %f54, %f54, %f53; setp.eq.f32 %p11, %f55, 0f00000000; selp.f32 %f56, %f1, %f50, %p11; selp.f32 %f57, %f2, %f51, %p11; selp.f32 %f58, %f3, %f54, %p11; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs9, [%rd34]; setp.gt.u16 %p12, %rs9, %rs1; cvt.u32.u16 %r62, %rs9; and.b32 %r63, %r62, 255; selp.b32 %r64, %r7, %r63, %p12; selp.b32 %r65, %r63, %r7, %p12; add.s32 %r66, %r65, 1; mul.lo.s32 %r67, %r66, %r65; shr.u32 %r68, %r67, 1; add.s32 %r69, %r68, %r64; mul.wide.s32 %rd35, %r69, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f59, [%rd36]; mul.f32 %f60, %f59, %f30; sub.f32 %f61, %f56, %f1; sub.f32 %f62, %f57, %f2; sub.f32 %f63, %f58, %f3; fma.rn.f32 %f7, %f60, %f61, %f4; fma.rn.f32 %f8, %f60, %f62, %f5; fma.rn.f32 %f9, %f60, %f63, %f6; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p13, %rs3, 0; add.s32 %r16, %r2, -1; @%p13 bra BB0_10; rem.s32 %r70, %r16, %r33; add.s32 %r71, %r70, %r33; rem.s32 %r124, %r71, %r33; bra.uni BB0_11; BB0_10: mov.u32 %r72, 0; max.s32 %r124, %r16, %r72; BB0_11: add.s32 %r73, %r124, %r4; mad.lo.s32 %r74, %r73, %r32, %r1; cvt.s64.s32 %rd37, %r74; mul.wide.s32 %rd38, %r74, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f64, [%rd39]; ld.global.nc.f32 %f65, [%rd40]; mul.f32 %f66, %f65, %f65; fma.rn.f32 %f67, %f64, %f64, %f66; ld.global.nc.f32 %f68, [%rd41]; fma.rn.f32 %f69, %f68, %f68, %f67; setp.eq.f32 %p14, %f69, 0f00000000; selp.f32 %f70, %f1, %f64, %p14; selp.f32 %f71, %f2, %f65, %p14; selp.f32 %f72, %f3, %f68, %p14; add.s64 %rd42, %rd2, %rd37; ld.global.nc.u8 %rs12, [%rd42]; setp.gt.u16 %p15, %rs12, %rs1; cvt.u32.u16 %r75, %rs12; and.b32 %r76, %r75, 255; selp.b32 %r77, %r7, 
%r76, %p15; selp.b32 %r78, %r76, %r7, %p15; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; add.s32 %r82, %r81, %r77; mul.wide.s32 %rd43, %r82, 4; add.s64 %rd44, %rd1, %rd43; ld.global.nc.f32 %f73, [%rd44]; mul.f32 %f74, %f73, %f31; sub.f32 %f75, %f70, %f1; sub.f32 %f76, %f71, %f2; sub.f32 %f77, %f72, %f3; fma.rn.f32 %f10, %f74, %f75, %f7; fma.rn.f32 %f11, %f74, %f76, %f8; fma.rn.f32 %f12, %f74, %f77, %f9; add.s32 %r20, %r2, 1; @%p13 bra BB0_13; rem.s32 %r83, %r20, %r33; add.s32 %r84, %r83, %r33; rem.s32 %r125, %r84, %r33; bra.uni BB0_14; BB0_13: add.s32 %r85, %r33, -1; min.s32 %r125, %r20, %r85; BB0_14: add.s32 %r86, %r125, %r4; mad.lo.s32 %r87, %r86, %r32, %r1; cvt.s64.s32 %rd45, %r87; mul.wide.s32 %rd46, %r87, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f78, [%rd47]; ld.global.nc.f32 %f79, [%rd48]; mul.f32 %f80, %f79, %f79; fma.rn.f32 %f81, %f78, %f78, %f80; ld.global.nc.f32 %f82, [%rd49]; fma.rn.f32 %f83, %f82, %f82, %f81; setp.eq.f32 %p17, %f83, 0f00000000; selp.f32 %f84, %f1, %f78, %p17; selp.f32 %f85, %f2, %f79, %p17; selp.f32 %f86, %f3, %f82, %p17; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs16, [%rd50]; setp.gt.u16 %p18, %rs16, %rs1; cvt.u32.u16 %r88, %rs16; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p18; selp.b32 %r91, %r89, %r7, %p18; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd51, %r95, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f87, [%rd52]; mul.f32 %f88, %f87, %f31; sub.f32 %f89, %f84, %f1; sub.f32 %f90, %f85, %f2; sub.f32 %f91, %f86, %f3; fma.rn.f32 %f128, %f88, %f89, %f10; fma.rn.f32 %f129, %f88, %f90, %f11; fma.rn.f32 %f130, %f88, %f91, %f12; setp.eq.s32 %p19, %r34, 1; @%p19 bra BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p20, %rs4, 0; add.s32 %r24, %r3, -1; @%p20 bra BB0_17; rem.s32 %r96, %r24, %r34; add.s32 %r97, %r96, %r34; rem.s32 %r126, %r97, %r34; bra.uni BB0_18; 
BB0_17: mov.u32 %r98, 0; max.s32 %r126, %r24, %r98; BB0_18: mad.lo.s32 %r99, %r126, %r33, %r2; mad.lo.s32 %r100, %r99, %r32, %r1; cvt.s64.s32 %rd53, %r100; mul.wide.s32 %rd54, %r100, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f92, [%rd55]; ld.global.nc.f32 %f93, [%rd56]; mul.f32 %f94, %f93, %f93; fma.rn.f32 %f95, %f92, %f92, %f94; ld.global.nc.f32 %f96, [%rd57]; fma.rn.f32 %f97, %f96, %f96, %f95; setp.eq.f32 %p21, %f97, 0f00000000; selp.f32 %f98, %f1, %f92, %p21; selp.f32 %f99, %f2, %f93, %p21; selp.f32 %f100, %f3, %f96, %p21; add.s64 %rd58, %rd2, %rd53; ld.global.nc.u8 %rs19, [%rd58]; setp.gt.u16 %p22, %rs19, %rs1; cvt.u32.u16 %r101, %rs19; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p22; selp.b32 %r104, %r102, %r7, %p22; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd59, %r108, 4; add.s64 %rd60, %rd1, %rd59; ld.global.nc.f32 %f101, [%rd60]; mul.f32 %f102, %f101, %f32; sub.f32 %f103, %f98, %f1; sub.f32 %f104, %f99, %f2; sub.f32 %f105, %f100, %f3; fma.rn.f32 %f16, %f102, %f103, %f128; fma.rn.f32 %f17, %f102, %f104, %f129; fma.rn.f32 %f18, %f102, %f105, %f130; add.s32 %r28, %r3, 1; @%p20 bra BB0_20; rem.s32 %r109, %r28, %r34; add.s32 %r110, %r109, %r34; rem.s32 %r127, %r110, %r34; bra.uni BB0_21; BB0_20: add.s32 %r111, %r34, -1; min.s32 %r127, %r28, %r111; BB0_21: mad.lo.s32 %r112, %r127, %r33, %r2; mad.lo.s32 %r113, %r112, %r32, %r1; cvt.s64.s32 %rd61, %r113; mul.wide.s32 %rd62, %r113, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f106, [%rd63]; ld.global.nc.f32 %f107, [%rd64]; mul.f32 %f108, %f107, %f107; fma.rn.f32 %f109, %f106, %f106, %f108; ld.global.nc.f32 %f110, [%rd65]; fma.rn.f32 %f111, %f110, %f110, %f109; setp.eq.f32 %p24, %f111, 0f00000000; selp.f32 %f112, %f1, %f106, %p24; selp.f32 %f113, %f2, %f107, %p24; selp.f32 %f114, %f3, %f110, %p24; 
add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs23, [%rd66]; setp.gt.u16 %p25, %rs23, %rs1; cvt.u32.u16 %r114, %rs23; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p25; selp.b32 %r117, %r115, %r7, %p25; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd67, %r121, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f115, [%rd68]; mul.f32 %f116, %f115, %f32; sub.f32 %f117, %f112, %f1; sub.f32 %f118, %f113, %f2; sub.f32 %f119, %f114, %f3; fma.rn.f32 %f128, %f116, %f117, %f16; fma.rn.f32 %f129, %f116, %f118, %f17; fma.rn.f32 %f130, %f116, %f119, %f18; BB0_22: setp.eq.s64 %p26, %rd9, 0; @%p26 bra BB0_24; cvta.to.global.u64 %rd69, %rd9; add.s64 %rd71, %rd69, %rd15; ld.global.nc.f32 %f120, [%rd71]; mul.f32 %f131, %f120, %f131; BB0_24: setp.eq.f32 %p27, %f131, 0f00000000; mov.f32 %f132, 0f00000000; @%p27 bra BB0_26; rcp.rn.f32 %f132, %f131; BB0_26: cvta.to.global.u64 %rd72, %rd8; cvta.to.global.u64 %rd73, %rd7; cvta.to.global.u64 %rd74, %rd6; add.s64 %rd76, %rd74, %rd15; ld.global.f32 %f122, [%rd76]; fma.rn.f32 %f123, %f128, %f132, %f122; st.global.f32 [%rd76], %f123; add.s64 %rd77, %rd73, %rd15; ld.global.f32 %f124, [%rd77]; fma.rn.f32 %f125, %f129, %f132, %f124; st.global.f32 [%rd77], %f125; add.s64 %rd78, %rd72, %rd15; ld.global.f32 %f126, [%rd78]; fma.rn.f32 %f127, %f130, %f132, %f126; st.global.f32 [%rd78], %f127; BB0_27: ret; } ` addexchange_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 
addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred %p<28>; .reg .b16 %rs<26>; .reg .f32 %f<133>; .reg .b32 %r<128>; .reg .b64 %rd<79>; ld.param.u64 %rd6, [addexchange_param_0]; ld.param.u64 %rd7, [addexchange_param_1]; ld.param.u64 %rd8, [addexchange_param_2]; ld.param.u64 %rd10, [addexchange_param_3]; ld.param.u64 %rd11, [addexchange_param_4]; ld.param.u64 %rd12, [addexchange_param_5]; ld.param.u64 %rd9, [addexchange_param_6]; ld.param.f32 %f131, [addexchange_param_7]; ld.param.u64 %rd13, [addexchange_param_8]; ld.param.u64 %rd14, [addexchange_param_9]; ld.param.f32 %f30, [addexchange_param_10]; ld.param.f32 %f31, [addexchange_param_11]; ld.param.f32 %f32, [addexchange_param_12]; ld.param.u32 %r32, [addexchange_param_13]; ld.param.u32 %r33, [addexchange_param_14]; ld.param.u32 %r34, [addexchange_param_15]; ld.param.u8 %rs5, [addexchange_param_16]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd14; cvta.to.global.u64 %rd3, %rd12; cvta.to.global.u64 %rd4, %rd11; cvta.to.global.u64 %rd5, %rd10; mov.u32 %r35, %ntid.x; mov.u32 %r36, %ctaid.x; mov.u32 %r37, %tid.x; mad.lo.s32 %r1, %r35, %r36, %r37; mov.u32 %r38, %ntid.y; mov.u32 %r39, %ctaid.y; mov.u32 %r40, %tid.y; mad.lo.s32 %r2, %r38, %r39, %r40; mov.u32 %r41, %ntid.z; mov.u32 %r42, %ctaid.z; mov.u32 %r43, %tid.z; mad.lo.s32 %r3, %r41, %r42, %r43; setp.ge.s32 %p1, %r2, %r33; setp.ge.s32 %p2, %r1, %r32; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r34; or.pred %p5, %p3, %p4; @%p5 bra BB0_27; mul.lo.s32 %r4, %r3, %r33; add.s32 %r44, %r4, %r2; mul.lo.s32 %r5, %r44, %r32; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd15, %r6, 4; add.s64 %rd16, %rd5, %rd15; add.s64 %rd17, %rd4, %rd15; add.s64 %rd18, %rd3, %rd15; ld.global.nc.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd17]; mul.f32 %f33, %f2, %f2; fma.rn.f32 %f34, %f1, %f1, %f33; ld.global.nc.f32 %f3, [%rd18]; fma.rn.f32 %f35, %f3, %f3, %f34; setp.eq.f32 %p6, %f35, 0f00000000; @%p6 
bra BB0_27; cvt.s64.s32 %rd19, %r6; add.s64 %rd20, %rd2, %rd19; ld.global.nc.u8 %rs1, [%rd20]; cvt.u32.u16 %r45, %rs1; and.b32 %r7, %r45, 255; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r46, %r8, %r32; add.s32 %r47, %r46, %r32; rem.s32 %r122, %r47, %r32; bra.uni BB0_5; BB0_4: mov.u32 %r48, 0; max.s32 %r122, %r8, %r48; BB0_5: add.s32 %r49, %r122, %r5; cvt.s64.s32 %rd21, %r49; mul.wide.s32 %rd22, %r49, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f36, [%rd23]; ld.global.nc.f32 %f37, [%rd24]; mul.f32 %f38, %f37, %f37; fma.rn.f32 %f39, %f36, %f36, %f38; ld.global.nc.f32 %f40, [%rd25]; fma.rn.f32 %f41, %f40, %f40, %f39; setp.eq.f32 %p8, %f41, 0f00000000; selp.f32 %f42, %f1, %f36, %p8; selp.f32 %f43, %f2, %f37, %p8; selp.f32 %f44, %f3, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, [%rd26]; setp.gt.u16 %p9, %rs6, %rs1; cvt.u32.u16 %r50, %rs6; and.b32 %r51, %r50, 255; selp.b32 %r52, %r7, %r51, %p9; selp.b32 %r53, %r51, %r7, %p9; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd27, %r57, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 %f45, [%rd28]; mul.f32 %f46, %f45, %f30; sub.f32 %f47, %f42, %f1; sub.f32 %f48, %f43, %f2; sub.f32 %f49, %f44, %f3; fma.rn.f32 %f4, %f46, %f47, 0f00000000; fma.rn.f32 %f5, %f46, %f48, 0f00000000; fma.rn.f32 %f6, %f46, %f49, 0f00000000; add.s32 %r12, %r1, 1; @%p7 bra BB0_7; rem.s32 %r58, %r12, %r32; add.s32 %r59, %r58, %r32; rem.s32 %r123, %r59, %r32; bra.uni BB0_8; BB0_7: add.s32 %r60, %r32, -1; min.s32 %r123, %r12, %r60; BB0_8: add.s32 %r61, %r123, %r5; cvt.s64.s32 %rd29, %r61; mul.wide.s32 %rd30, %r61, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, %rd4, %rd30; add.s64 %rd33, %rd3, %rd30; ld.global.nc.f32 %f50, [%rd31]; ld.global.nc.f32 %f51, [%rd32]; mul.f32 %f52, %f51, %f51; fma.rn.f32 %f53, %f50, %f50, %f52; ld.global.nc.f32 %f54, [%rd33]; 
fma.rn.f32 %f55, %f54, %f54, %f53; setp.eq.f32 %p11, %f55, 0f00000000; selp.f32 %f56, %f1, %f50, %p11; selp.f32 %f57, %f2, %f51, %p11; selp.f32 %f58, %f3, %f54, %p11; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs9, [%rd34]; setp.gt.u16 %p12, %rs9, %rs1; cvt.u32.u16 %r62, %rs9; and.b32 %r63, %r62, 255; selp.b32 %r64, %r7, %r63, %p12; selp.b32 %r65, %r63, %r7, %p12; add.s32 %r66, %r65, 1; mul.lo.s32 %r67, %r66, %r65; shr.u32 %r68, %r67, 1; add.s32 %r69, %r68, %r64; mul.wide.s32 %rd35, %r69, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f59, [%rd36]; mul.f32 %f60, %f59, %f30; sub.f32 %f61, %f56, %f1; sub.f32 %f62, %f57, %f2; sub.f32 %f63, %f58, %f3; fma.rn.f32 %f7, %f60, %f61, %f4; fma.rn.f32 %f8, %f60, %f62, %f5; fma.rn.f32 %f9, %f60, %f63, %f6; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p13, %rs3, 0; add.s32 %r16, %r2, -1; @%p13 bra BB0_10; rem.s32 %r70, %r16, %r33; add.s32 %r71, %r70, %r33; rem.s32 %r124, %r71, %r33; bra.uni BB0_11; BB0_10: mov.u32 %r72, 0; max.s32 %r124, %r16, %r72; BB0_11: add.s32 %r73, %r124, %r4; mad.lo.s32 %r74, %r73, %r32, %r1; cvt.s64.s32 %rd37, %r74; mul.wide.s32 %rd38, %r74, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f64, [%rd39]; ld.global.nc.f32 %f65, [%rd40]; mul.f32 %f66, %f65, %f65; fma.rn.f32 %f67, %f64, %f64, %f66; ld.global.nc.f32 %f68, [%rd41]; fma.rn.f32 %f69, %f68, %f68, %f67; setp.eq.f32 %p14, %f69, 0f00000000; selp.f32 %f70, %f1, %f64, %p14; selp.f32 %f71, %f2, %f65, %p14; selp.f32 %f72, %f3, %f68, %p14; add.s64 %rd42, %rd2, %rd37; ld.global.nc.u8 %rs12, [%rd42]; setp.gt.u16 %p15, %rs12, %rs1; cvt.u32.u16 %r75, %rs12; and.b32 %r76, %r75, 255; selp.b32 %r77, %r7, %r76, %p15; selp.b32 %r78, %r76, %r7, %p15; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; add.s32 %r82, %r81, %r77; mul.wide.s32 %rd43, %r82, 4; add.s64 %rd44, %rd1, %rd43; ld.global.nc.f32 %f73, [%rd44]; mul.f32 %f74, %f73, %f31; sub.f32 %f75, %f70, %f1; sub.f32 %f76, %f71, %f2; 
sub.f32 %f77, %f72, %f3; fma.rn.f32 %f10, %f74, %f75, %f7; fma.rn.f32 %f11, %f74, %f76, %f8; fma.rn.f32 %f12, %f74, %f77, %f9; add.s32 %r20, %r2, 1; @%p13 bra BB0_13; rem.s32 %r83, %r20, %r33; add.s32 %r84, %r83, %r33; rem.s32 %r125, %r84, %r33; bra.uni BB0_14; BB0_13: add.s32 %r85, %r33, -1; min.s32 %r125, %r20, %r85; BB0_14: add.s32 %r86, %r125, %r4; mad.lo.s32 %r87, %r86, %r32, %r1; cvt.s64.s32 %rd45, %r87; mul.wide.s32 %rd46, %r87, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f78, [%rd47]; ld.global.nc.f32 %f79, [%rd48]; mul.f32 %f80, %f79, %f79; fma.rn.f32 %f81, %f78, %f78, %f80; ld.global.nc.f32 %f82, [%rd49]; fma.rn.f32 %f83, %f82, %f82, %f81; setp.eq.f32 %p17, %f83, 0f00000000; selp.f32 %f84, %f1, %f78, %p17; selp.f32 %f85, %f2, %f79, %p17; selp.f32 %f86, %f3, %f82, %p17; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs16, [%rd50]; setp.gt.u16 %p18, %rs16, %rs1; cvt.u32.u16 %r88, %rs16; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p18; selp.b32 %r91, %r89, %r7, %p18; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd51, %r95, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f87, [%rd52]; mul.f32 %f88, %f87, %f31; sub.f32 %f89, %f84, %f1; sub.f32 %f90, %f85, %f2; sub.f32 %f91, %f86, %f3; fma.rn.f32 %f128, %f88, %f89, %f10; fma.rn.f32 %f129, %f88, %f90, %f11; fma.rn.f32 %f130, %f88, %f91, %f12; setp.eq.s32 %p19, %r34, 1; @%p19 bra BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p20, %rs4, 0; add.s32 %r24, %r3, -1; @%p20 bra BB0_17; rem.s32 %r96, %r24, %r34; add.s32 %r97, %r96, %r34; rem.s32 %r126, %r97, %r34; bra.uni BB0_18; BB0_17: mov.u32 %r98, 0; max.s32 %r126, %r24, %r98; BB0_18: mad.lo.s32 %r99, %r126, %r33, %r2; mad.lo.s32 %r100, %r99, %r32, %r1; cvt.s64.s32 %rd53, %r100; mul.wide.s32 %rd54, %r100, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f92, [%rd55]; 
ld.global.nc.f32 %f93, [%rd56]; mul.f32 %f94, %f93, %f93; fma.rn.f32 %f95, %f92, %f92, %f94; ld.global.nc.f32 %f96, [%rd57]; fma.rn.f32 %f97, %f96, %f96, %f95; setp.eq.f32 %p21, %f97, 0f00000000; selp.f32 %f98, %f1, %f92, %p21; selp.f32 %f99, %f2, %f93, %p21; selp.f32 %f100, %f3, %f96, %p21; add.s64 %rd58, %rd2, %rd53; ld.global.nc.u8 %rs19, [%rd58]; setp.gt.u16 %p22, %rs19, %rs1; cvt.u32.u16 %r101, %rs19; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p22; selp.b32 %r104, %r102, %r7, %p22; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd59, %r108, 4; add.s64 %rd60, %rd1, %rd59; ld.global.nc.f32 %f101, [%rd60]; mul.f32 %f102, %f101, %f32; sub.f32 %f103, %f98, %f1; sub.f32 %f104, %f99, %f2; sub.f32 %f105, %f100, %f3; fma.rn.f32 %f16, %f102, %f103, %f128; fma.rn.f32 %f17, %f102, %f104, %f129; fma.rn.f32 %f18, %f102, %f105, %f130; add.s32 %r28, %r3, 1; @%p20 bra BB0_20; rem.s32 %r109, %r28, %r34; add.s32 %r110, %r109, %r34; rem.s32 %r127, %r110, %r34; bra.uni BB0_21; BB0_20: add.s32 %r111, %r34, -1; min.s32 %r127, %r28, %r111; BB0_21: mad.lo.s32 %r112, %r127, %r33, %r2; mad.lo.s32 %r113, %r112, %r32, %r1; cvt.s64.s32 %rd61, %r113; mul.wide.s32 %rd62, %r113, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f106, [%rd63]; ld.global.nc.f32 %f107, [%rd64]; mul.f32 %f108, %f107, %f107; fma.rn.f32 %f109, %f106, %f106, %f108; ld.global.nc.f32 %f110, [%rd65]; fma.rn.f32 %f111, %f110, %f110, %f109; setp.eq.f32 %p24, %f111, 0f00000000; selp.f32 %f112, %f1, %f106, %p24; selp.f32 %f113, %f2, %f107, %p24; selp.f32 %f114, %f3, %f110, %p24; add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs23, [%rd66]; setp.gt.u16 %p25, %rs23, %rs1; cvt.u32.u16 %r114, %rs23; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p25; selp.b32 %r117, %r115, %r7, %p25; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 
%r121, %r120, %r116; mul.wide.s32 %rd67, %r121, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f115, [%rd68]; mul.f32 %f116, %f115, %f32; sub.f32 %f117, %f112, %f1; sub.f32 %f118, %f113, %f2; sub.f32 %f119, %f114, %f3; fma.rn.f32 %f128, %f116, %f117, %f16; fma.rn.f32 %f129, %f116, %f118, %f17; fma.rn.f32 %f130, %f116, %f119, %f18; BB0_22: setp.eq.s64 %p26, %rd9, 0; @%p26 bra BB0_24; cvta.to.global.u64 %rd69, %rd9; add.s64 %rd71, %rd69, %rd15; ld.global.nc.f32 %f120, [%rd71]; mul.f32 %f131, %f120, %f131; BB0_24: setp.eq.f32 %p27, %f131, 0f00000000; mov.f32 %f132, 0f00000000; @%p27 bra BB0_26; rcp.rn.f32 %f132, %f131; BB0_26: cvta.to.global.u64 %rd72, %rd8; cvta.to.global.u64 %rd73, %rd7; cvta.to.global.u64 %rd74, %rd6; add.s64 %rd76, %rd74, %rd15; ld.global.f32 %f122, [%rd76]; fma.rn.f32 %f123, %f128, %f132, %f122; st.global.f32 [%rd76], %f123; add.s64 %rd77, %rd73, %rd15; ld.global.f32 %f124, [%rd77]; fma.rn.f32 %f125, %f129, %f132, %f124; st.global.f32 [%rd77], %f125; add.s64 %rd78, %rd72, %rd15; ld.global.f32 %f126, [%rd78]; fma.rn.f32 %f127, %f130, %f132, %f126; st.global.f32 [%rd78], %f127; BB0_27: ret; } ` addexchange_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred %p<28>; .reg .b16 %rs<26>; .reg .f32 %f<133>; .reg .b32 %r<128>; .reg .b64 %rd<79>; ld.param.u64 %rd6, [addexchange_param_0]; ld.param.u64 %rd7, [addexchange_param_1]; 
ld.param.u64 %rd8, [addexchange_param_2]; ld.param.u64 %rd10, [addexchange_param_3]; ld.param.u64 %rd11, [addexchange_param_4]; ld.param.u64 %rd12, [addexchange_param_5]; ld.param.u64 %rd9, [addexchange_param_6]; ld.param.f32 %f131, [addexchange_param_7]; ld.param.u64 %rd13, [addexchange_param_8]; ld.param.u64 %rd14, [addexchange_param_9]; ld.param.f32 %f30, [addexchange_param_10]; ld.param.f32 %f31, [addexchange_param_11]; ld.param.f32 %f32, [addexchange_param_12]; ld.param.u32 %r32, [addexchange_param_13]; ld.param.u32 %r33, [addexchange_param_14]; ld.param.u32 %r34, [addexchange_param_15]; ld.param.u8 %rs5, [addexchange_param_16]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd14; cvta.to.global.u64 %rd3, %rd12; cvta.to.global.u64 %rd4, %rd11; cvta.to.global.u64 %rd5, %rd10; mov.u32 %r35, %ntid.x; mov.u32 %r36, %ctaid.x; mov.u32 %r37, %tid.x; mad.lo.s32 %r1, %r35, %r36, %r37; mov.u32 %r38, %ntid.y; mov.u32 %r39, %ctaid.y; mov.u32 %r40, %tid.y; mad.lo.s32 %r2, %r38, %r39, %r40; mov.u32 %r41, %ntid.z; mov.u32 %r42, %ctaid.z; mov.u32 %r43, %tid.z; mad.lo.s32 %r3, %r41, %r42, %r43; setp.ge.s32 %p1, %r2, %r33; setp.ge.s32 %p2, %r1, %r32; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r34; or.pred %p5, %p3, %p4; @%p5 bra BB0_27; mul.lo.s32 %r4, %r3, %r33; add.s32 %r44, %r4, %r2; mul.lo.s32 %r5, %r44, %r32; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd15, %r6, 4; add.s64 %rd16, %rd5, %rd15; add.s64 %rd17, %rd4, %rd15; add.s64 %rd18, %rd3, %rd15; ld.global.nc.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd17]; mul.f32 %f33, %f2, %f2; fma.rn.f32 %f34, %f1, %f1, %f33; ld.global.nc.f32 %f3, [%rd18]; fma.rn.f32 %f35, %f3, %f3, %f34; setp.eq.f32 %p6, %f35, 0f00000000; @%p6 bra BB0_27; cvt.s64.s32 %rd19, %r6; add.s64 %rd20, %rd2, %rd19; ld.global.nc.u8 %rs1, [%rd20]; cvt.u32.u16 %r45, %rs1; and.b32 %r7, %r45, 255; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r46, %r8, %r32; add.s32 %r47, %r46, %r32; rem.s32 %r122, %r47, 
%r32; bra.uni BB0_5; BB0_4: mov.u32 %r48, 0; max.s32 %r122, %r8, %r48; BB0_5: add.s32 %r49, %r122, %r5; cvt.s64.s32 %rd21, %r49; mul.wide.s32 %rd22, %r49, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f36, [%rd23]; ld.global.nc.f32 %f37, [%rd24]; mul.f32 %f38, %f37, %f37; fma.rn.f32 %f39, %f36, %f36, %f38; ld.global.nc.f32 %f40, [%rd25]; fma.rn.f32 %f41, %f40, %f40, %f39; setp.eq.f32 %p8, %f41, 0f00000000; selp.f32 %f42, %f1, %f36, %p8; selp.f32 %f43, %f2, %f37, %p8; selp.f32 %f44, %f3, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, [%rd26]; setp.gt.u16 %p9, %rs6, %rs1; cvt.u32.u16 %r50, %rs6; and.b32 %r51, %r50, 255; selp.b32 %r52, %r7, %r51, %p9; selp.b32 %r53, %r51, %r7, %p9; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd27, %r57, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 %f45, [%rd28]; mul.f32 %f46, %f45, %f30; sub.f32 %f47, %f42, %f1; sub.f32 %f48, %f43, %f2; sub.f32 %f49, %f44, %f3; fma.rn.f32 %f4, %f46, %f47, 0f00000000; fma.rn.f32 %f5, %f46, %f48, 0f00000000; fma.rn.f32 %f6, %f46, %f49, 0f00000000; add.s32 %r12, %r1, 1; @%p7 bra BB0_7; rem.s32 %r58, %r12, %r32; add.s32 %r59, %r58, %r32; rem.s32 %r123, %r59, %r32; bra.uni BB0_8; BB0_7: add.s32 %r60, %r32, -1; min.s32 %r123, %r12, %r60; BB0_8: add.s32 %r61, %r123, %r5; cvt.s64.s32 %rd29, %r61; mul.wide.s32 %rd30, %r61, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, %rd4, %rd30; add.s64 %rd33, %rd3, %rd30; ld.global.nc.f32 %f50, [%rd31]; ld.global.nc.f32 %f51, [%rd32]; mul.f32 %f52, %f51, %f51; fma.rn.f32 %f53, %f50, %f50, %f52; ld.global.nc.f32 %f54, [%rd33]; fma.rn.f32 %f55, %f54, %f54, %f53; setp.eq.f32 %p11, %f55, 0f00000000; selp.f32 %f56, %f1, %f50, %p11; selp.f32 %f57, %f2, %f51, %p11; selp.f32 %f58, %f3, %f54, %p11; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs9, [%rd34]; setp.gt.u16 %p12, %rs9, %rs1; cvt.u32.u16 %r62, %rs9; and.b32 %r63, %r62, 255; 
selp.b32 %r64, %r7, %r63, %p12; selp.b32 %r65, %r63, %r7, %p12; add.s32 %r66, %r65, 1; mul.lo.s32 %r67, %r66, %r65; shr.u32 %r68, %r67, 1; add.s32 %r69, %r68, %r64; mul.wide.s32 %rd35, %r69, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f59, [%rd36]; mul.f32 %f60, %f59, %f30; sub.f32 %f61, %f56, %f1; sub.f32 %f62, %f57, %f2; sub.f32 %f63, %f58, %f3; fma.rn.f32 %f7, %f60, %f61, %f4; fma.rn.f32 %f8, %f60, %f62, %f5; fma.rn.f32 %f9, %f60, %f63, %f6; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p13, %rs3, 0; add.s32 %r16, %r2, -1; @%p13 bra BB0_10; rem.s32 %r70, %r16, %r33; add.s32 %r71, %r70, %r33; rem.s32 %r124, %r71, %r33; bra.uni BB0_11; BB0_10: mov.u32 %r72, 0; max.s32 %r124, %r16, %r72; BB0_11: add.s32 %r73, %r124, %r4; mad.lo.s32 %r74, %r73, %r32, %r1; cvt.s64.s32 %rd37, %r74; mul.wide.s32 %rd38, %r74, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f64, [%rd39]; ld.global.nc.f32 %f65, [%rd40]; mul.f32 %f66, %f65, %f65; fma.rn.f32 %f67, %f64, %f64, %f66; ld.global.nc.f32 %f68, [%rd41]; fma.rn.f32 %f69, %f68, %f68, %f67; setp.eq.f32 %p14, %f69, 0f00000000; selp.f32 %f70, %f1, %f64, %p14; selp.f32 %f71, %f2, %f65, %p14; selp.f32 %f72, %f3, %f68, %p14; add.s64 %rd42, %rd2, %rd37; ld.global.nc.u8 %rs12, [%rd42]; setp.gt.u16 %p15, %rs12, %rs1; cvt.u32.u16 %r75, %rs12; and.b32 %r76, %r75, 255; selp.b32 %r77, %r7, %r76, %p15; selp.b32 %r78, %r76, %r7, %p15; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; add.s32 %r82, %r81, %r77; mul.wide.s32 %rd43, %r82, 4; add.s64 %rd44, %rd1, %rd43; ld.global.nc.f32 %f73, [%rd44]; mul.f32 %f74, %f73, %f31; sub.f32 %f75, %f70, %f1; sub.f32 %f76, %f71, %f2; sub.f32 %f77, %f72, %f3; fma.rn.f32 %f10, %f74, %f75, %f7; fma.rn.f32 %f11, %f74, %f76, %f8; fma.rn.f32 %f12, %f74, %f77, %f9; add.s32 %r20, %r2, 1; @%p13 bra BB0_13; rem.s32 %r83, %r20, %r33; add.s32 %r84, %r83, %r33; rem.s32 %r125, %r84, %r33; bra.uni BB0_14; BB0_13: add.s32 %r85, %r33, -1; min.s32 
%r125, %r20, %r85; BB0_14: add.s32 %r86, %r125, %r4; mad.lo.s32 %r87, %r86, %r32, %r1; cvt.s64.s32 %rd45, %r87; mul.wide.s32 %rd46, %r87, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f78, [%rd47]; ld.global.nc.f32 %f79, [%rd48]; mul.f32 %f80, %f79, %f79; fma.rn.f32 %f81, %f78, %f78, %f80; ld.global.nc.f32 %f82, [%rd49]; fma.rn.f32 %f83, %f82, %f82, %f81; setp.eq.f32 %p17, %f83, 0f00000000; selp.f32 %f84, %f1, %f78, %p17; selp.f32 %f85, %f2, %f79, %p17; selp.f32 %f86, %f3, %f82, %p17; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs16, [%rd50]; setp.gt.u16 %p18, %rs16, %rs1; cvt.u32.u16 %r88, %rs16; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p18; selp.b32 %r91, %r89, %r7, %p18; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd51, %r95, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f87, [%rd52]; mul.f32 %f88, %f87, %f31; sub.f32 %f89, %f84, %f1; sub.f32 %f90, %f85, %f2; sub.f32 %f91, %f86, %f3; fma.rn.f32 %f128, %f88, %f89, %f10; fma.rn.f32 %f129, %f88, %f90, %f11; fma.rn.f32 %f130, %f88, %f91, %f12; setp.eq.s32 %p19, %r34, 1; @%p19 bra BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p20, %rs4, 0; add.s32 %r24, %r3, -1; @%p20 bra BB0_17; rem.s32 %r96, %r24, %r34; add.s32 %r97, %r96, %r34; rem.s32 %r126, %r97, %r34; bra.uni BB0_18; BB0_17: mov.u32 %r98, 0; max.s32 %r126, %r24, %r98; BB0_18: mad.lo.s32 %r99, %r126, %r33, %r2; mad.lo.s32 %r100, %r99, %r32, %r1; cvt.s64.s32 %rd53, %r100; mul.wide.s32 %rd54, %r100, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f92, [%rd55]; ld.global.nc.f32 %f93, [%rd56]; mul.f32 %f94, %f93, %f93; fma.rn.f32 %f95, %f92, %f92, %f94; ld.global.nc.f32 %f96, [%rd57]; fma.rn.f32 %f97, %f96, %f96, %f95; setp.eq.f32 %p21, %f97, 0f00000000; selp.f32 %f98, %f1, %f92, %p21; selp.f32 %f99, %f2, %f93, %p21; selp.f32 %f100, %f3, %f96, %p21; add.s64 %rd58, %rd2, 
%rd53; ld.global.nc.u8 %rs19, [%rd58]; setp.gt.u16 %p22, %rs19, %rs1; cvt.u32.u16 %r101, %rs19; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p22; selp.b32 %r104, %r102, %r7, %p22; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd59, %r108, 4; add.s64 %rd60, %rd1, %rd59; ld.global.nc.f32 %f101, [%rd60]; mul.f32 %f102, %f101, %f32; sub.f32 %f103, %f98, %f1; sub.f32 %f104, %f99, %f2; sub.f32 %f105, %f100, %f3; fma.rn.f32 %f16, %f102, %f103, %f128; fma.rn.f32 %f17, %f102, %f104, %f129; fma.rn.f32 %f18, %f102, %f105, %f130; add.s32 %r28, %r3, 1; @%p20 bra BB0_20; rem.s32 %r109, %r28, %r34; add.s32 %r110, %r109, %r34; rem.s32 %r127, %r110, %r34; bra.uni BB0_21; BB0_20: add.s32 %r111, %r34, -1; min.s32 %r127, %r28, %r111; BB0_21: mad.lo.s32 %r112, %r127, %r33, %r2; mad.lo.s32 %r113, %r112, %r32, %r1; cvt.s64.s32 %rd61, %r113; mul.wide.s32 %rd62, %r113, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f106, [%rd63]; ld.global.nc.f32 %f107, [%rd64]; mul.f32 %f108, %f107, %f107; fma.rn.f32 %f109, %f106, %f106, %f108; ld.global.nc.f32 %f110, [%rd65]; fma.rn.f32 %f111, %f110, %f110, %f109; setp.eq.f32 %p24, %f111, 0f00000000; selp.f32 %f112, %f1, %f106, %p24; selp.f32 %f113, %f2, %f107, %p24; selp.f32 %f114, %f3, %f110, %p24; add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs23, [%rd66]; setp.gt.u16 %p25, %rs23, %rs1; cvt.u32.u16 %r114, %rs23; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p25; selp.b32 %r117, %r115, %r7, %p25; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd67, %r121, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f115, [%rd68]; mul.f32 %f116, %f115, %f32; sub.f32 %f117, %f112, %f1; sub.f32 %f118, %f113, %f2; sub.f32 %f119, %f114, %f3; fma.rn.f32 %f128, %f116, %f117, %f16; fma.rn.f32 %f129, %f116, %f118, %f17; fma.rn.f32 %f130, 
%f116, %f119, %f18; BB0_22: setp.eq.s64 %p26, %rd9, 0; @%p26 bra BB0_24; cvta.to.global.u64 %rd69, %rd9; add.s64 %rd71, %rd69, %rd15; ld.global.nc.f32 %f120, [%rd71]; mul.f32 %f131, %f120, %f131; BB0_24: setp.eq.f32 %p27, %f131, 0f00000000; mov.f32 %f132, 0f00000000; @%p27 bra BB0_26; rcp.rn.f32 %f132, %f131; BB0_26: cvta.to.global.u64 %rd72, %rd8; cvta.to.global.u64 %rd73, %rd7; cvta.to.global.u64 %rd74, %rd6; add.s64 %rd76, %rd74, %rd15; ld.global.f32 %f122, [%rd76]; fma.rn.f32 %f123, %f128, %f132, %f122; st.global.f32 [%rd76], %f123; add.s64 %rd77, %rd73, %rd15; ld.global.f32 %f124, [%rd77]; fma.rn.f32 %f125, %f129, %f132, %f124; st.global.f32 [%rd77], %f125; add.s64 %rd78, %rd72, %rd15; ld.global.f32 %f126, [%rd78]; fma.rn.f32 %f127, %f130, %f132, %f126; st.global.f32 [%rd78], %f127; BB0_27: ret; } ` addexchange_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred %p<28>; .reg .b16 %rs<26>; .reg .f32 %f<133>; .reg .b32 %r<128>; .reg .b64 %rd<79>; ld.param.u64 %rd6, [addexchange_param_0]; ld.param.u64 %rd7, [addexchange_param_1]; ld.param.u64 %rd8, [addexchange_param_2]; ld.param.u64 %rd10, [addexchange_param_3]; ld.param.u64 %rd11, [addexchange_param_4]; ld.param.u64 %rd12, [addexchange_param_5]; ld.param.u64 %rd9, [addexchange_param_6]; ld.param.f32 %f131, [addexchange_param_7]; ld.param.u64 %rd13, [addexchange_param_8]; ld.param.u64 
%rd14, [addexchange_param_9]; ld.param.f32 %f30, [addexchange_param_10]; ld.param.f32 %f31, [addexchange_param_11]; ld.param.f32 %f32, [addexchange_param_12]; ld.param.u32 %r32, [addexchange_param_13]; ld.param.u32 %r33, [addexchange_param_14]; ld.param.u32 %r34, [addexchange_param_15]; ld.param.u8 %rs5, [addexchange_param_16]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd14; cvta.to.global.u64 %rd3, %rd12; cvta.to.global.u64 %rd4, %rd11; cvta.to.global.u64 %rd5, %rd10; mov.u32 %r35, %ntid.x; mov.u32 %r36, %ctaid.x; mov.u32 %r37, %tid.x; mad.lo.s32 %r1, %r35, %r36, %r37; mov.u32 %r38, %ntid.y; mov.u32 %r39, %ctaid.y; mov.u32 %r40, %tid.y; mad.lo.s32 %r2, %r38, %r39, %r40; mov.u32 %r41, %ntid.z; mov.u32 %r42, %ctaid.z; mov.u32 %r43, %tid.z; mad.lo.s32 %r3, %r41, %r42, %r43; setp.ge.s32 %p1, %r2, %r33; setp.ge.s32 %p2, %r1, %r32; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r34; or.pred %p5, %p3, %p4; @%p5 bra BB0_27; mul.lo.s32 %r4, %r3, %r33; add.s32 %r44, %r4, %r2; mul.lo.s32 %r5, %r44, %r32; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd15, %r6, 4; add.s64 %rd16, %rd5, %rd15; add.s64 %rd17, %rd4, %rd15; add.s64 %rd18, %rd3, %rd15; ld.global.nc.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd17]; mul.f32 %f33, %f2, %f2; fma.rn.f32 %f34, %f1, %f1, %f33; ld.global.nc.f32 %f3, [%rd18]; fma.rn.f32 %f35, %f3, %f3, %f34; setp.eq.f32 %p6, %f35, 0f00000000; @%p6 bra BB0_27; cvt.s64.s32 %rd19, %r6; add.s64 %rd20, %rd2, %rd19; ld.global.nc.u8 %rs1, [%rd20]; cvt.u32.u16 %r45, %rs1; and.b32 %r7, %r45, 255; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r46, %r8, %r32; add.s32 %r47, %r46, %r32; rem.s32 %r122, %r47, %r32; bra.uni BB0_5; BB0_4: mov.u32 %r48, 0; max.s32 %r122, %r8, %r48; BB0_5: add.s32 %r49, %r122, %r5; cvt.s64.s32 %rd21, %r49; mul.wide.s32 %rd22, %r49, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f36, [%rd23]; ld.global.nc.f32 %f37, [%rd24]; mul.f32 
%f38, %f37, %f37; fma.rn.f32 %f39, %f36, %f36, %f38; ld.global.nc.f32 %f40, [%rd25]; fma.rn.f32 %f41, %f40, %f40, %f39; setp.eq.f32 %p8, %f41, 0f00000000; selp.f32 %f42, %f1, %f36, %p8; selp.f32 %f43, %f2, %f37, %p8; selp.f32 %f44, %f3, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, [%rd26]; setp.gt.u16 %p9, %rs6, %rs1; cvt.u32.u16 %r50, %rs6; and.b32 %r51, %r50, 255; selp.b32 %r52, %r7, %r51, %p9; selp.b32 %r53, %r51, %r7, %p9; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd27, %r57, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 %f45, [%rd28]; mul.f32 %f46, %f45, %f30; sub.f32 %f47, %f42, %f1; sub.f32 %f48, %f43, %f2; sub.f32 %f49, %f44, %f3; fma.rn.f32 %f4, %f46, %f47, 0f00000000; fma.rn.f32 %f5, %f46, %f48, 0f00000000; fma.rn.f32 %f6, %f46, %f49, 0f00000000; add.s32 %r12, %r1, 1; @%p7 bra BB0_7; rem.s32 %r58, %r12, %r32; add.s32 %r59, %r58, %r32; rem.s32 %r123, %r59, %r32; bra.uni BB0_8; BB0_7: add.s32 %r60, %r32, -1; min.s32 %r123, %r12, %r60; BB0_8: add.s32 %r61, %r123, %r5; cvt.s64.s32 %rd29, %r61; mul.wide.s32 %rd30, %r61, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, %rd4, %rd30; add.s64 %rd33, %rd3, %rd30; ld.global.nc.f32 %f50, [%rd31]; ld.global.nc.f32 %f51, [%rd32]; mul.f32 %f52, %f51, %f51; fma.rn.f32 %f53, %f50, %f50, %f52; ld.global.nc.f32 %f54, [%rd33]; fma.rn.f32 %f55, %f54, %f54, %f53; setp.eq.f32 %p11, %f55, 0f00000000; selp.f32 %f56, %f1, %f50, %p11; selp.f32 %f57, %f2, %f51, %p11; selp.f32 %f58, %f3, %f54, %p11; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs9, [%rd34]; setp.gt.u16 %p12, %rs9, %rs1; cvt.u32.u16 %r62, %rs9; and.b32 %r63, %r62, 255; selp.b32 %r64, %r7, %r63, %p12; selp.b32 %r65, %r63, %r7, %p12; add.s32 %r66, %r65, 1; mul.lo.s32 %r67, %r66, %r65; shr.u32 %r68, %r67, 1; add.s32 %r69, %r68, %r64; mul.wide.s32 %rd35, %r69, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f59, [%rd36]; mul.f32 %f60, %f59, %f30; sub.f32 %f61, %f56, %f1; sub.f32 %f62, 
%f57, %f2; sub.f32 %f63, %f58, %f3; fma.rn.f32 %f7, %f60, %f61, %f4; fma.rn.f32 %f8, %f60, %f62, %f5; fma.rn.f32 %f9, %f60, %f63, %f6; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p13, %rs3, 0; add.s32 %r16, %r2, -1; @%p13 bra BB0_10; rem.s32 %r70, %r16, %r33; add.s32 %r71, %r70, %r33; rem.s32 %r124, %r71, %r33; bra.uni BB0_11; BB0_10: mov.u32 %r72, 0; max.s32 %r124, %r16, %r72; BB0_11: add.s32 %r73, %r124, %r4; mad.lo.s32 %r74, %r73, %r32, %r1; cvt.s64.s32 %rd37, %r74; mul.wide.s32 %rd38, %r74, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f64, [%rd39]; ld.global.nc.f32 %f65, [%rd40]; mul.f32 %f66, %f65, %f65; fma.rn.f32 %f67, %f64, %f64, %f66; ld.global.nc.f32 %f68, [%rd41]; fma.rn.f32 %f69, %f68, %f68, %f67; setp.eq.f32 %p14, %f69, 0f00000000; selp.f32 %f70, %f1, %f64, %p14; selp.f32 %f71, %f2, %f65, %p14; selp.f32 %f72, %f3, %f68, %p14; add.s64 %rd42, %rd2, %rd37; ld.global.nc.u8 %rs12, [%rd42]; setp.gt.u16 %p15, %rs12, %rs1; cvt.u32.u16 %r75, %rs12; and.b32 %r76, %r75, 255; selp.b32 %r77, %r7, %r76, %p15; selp.b32 %r78, %r76, %r7, %p15; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; add.s32 %r82, %r81, %r77; mul.wide.s32 %rd43, %r82, 4; add.s64 %rd44, %rd1, %rd43; ld.global.nc.f32 %f73, [%rd44]; mul.f32 %f74, %f73, %f31; sub.f32 %f75, %f70, %f1; sub.f32 %f76, %f71, %f2; sub.f32 %f77, %f72, %f3; fma.rn.f32 %f10, %f74, %f75, %f7; fma.rn.f32 %f11, %f74, %f76, %f8; fma.rn.f32 %f12, %f74, %f77, %f9; add.s32 %r20, %r2, 1; @%p13 bra BB0_13; rem.s32 %r83, %r20, %r33; add.s32 %r84, %r83, %r33; rem.s32 %r125, %r84, %r33; bra.uni BB0_14; BB0_13: add.s32 %r85, %r33, -1; min.s32 %r125, %r20, %r85; BB0_14: add.s32 %r86, %r125, %r4; mad.lo.s32 %r87, %r86, %r32, %r1; cvt.s64.s32 %rd45, %r87; mul.wide.s32 %rd46, %r87, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f78, [%rd47]; ld.global.nc.f32 %f79, [%rd48]; mul.f32 %f80, %f79, %f79; 
fma.rn.f32 %f81, %f78, %f78, %f80; ld.global.nc.f32 %f82, [%rd49]; fma.rn.f32 %f83, %f82, %f82, %f81; setp.eq.f32 %p17, %f83, 0f00000000; selp.f32 %f84, %f1, %f78, %p17; selp.f32 %f85, %f2, %f79, %p17; selp.f32 %f86, %f3, %f82, %p17; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs16, [%rd50]; setp.gt.u16 %p18, %rs16, %rs1; cvt.u32.u16 %r88, %rs16; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p18; selp.b32 %r91, %r89, %r7, %p18; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd51, %r95, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f87, [%rd52]; mul.f32 %f88, %f87, %f31; sub.f32 %f89, %f84, %f1; sub.f32 %f90, %f85, %f2; sub.f32 %f91, %f86, %f3; fma.rn.f32 %f128, %f88, %f89, %f10; fma.rn.f32 %f129, %f88, %f90, %f11; fma.rn.f32 %f130, %f88, %f91, %f12; setp.eq.s32 %p19, %r34, 1; @%p19 bra BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p20, %rs4, 0; add.s32 %r24, %r3, -1; @%p20 bra BB0_17; rem.s32 %r96, %r24, %r34; add.s32 %r97, %r96, %r34; rem.s32 %r126, %r97, %r34; bra.uni BB0_18; BB0_17: mov.u32 %r98, 0; max.s32 %r126, %r24, %r98; BB0_18: mad.lo.s32 %r99, %r126, %r33, %r2; mad.lo.s32 %r100, %r99, %r32, %r1; cvt.s64.s32 %rd53, %r100; mul.wide.s32 %rd54, %r100, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f92, [%rd55]; ld.global.nc.f32 %f93, [%rd56]; mul.f32 %f94, %f93, %f93; fma.rn.f32 %f95, %f92, %f92, %f94; ld.global.nc.f32 %f96, [%rd57]; fma.rn.f32 %f97, %f96, %f96, %f95; setp.eq.f32 %p21, %f97, 0f00000000; selp.f32 %f98, %f1, %f92, %p21; selp.f32 %f99, %f2, %f93, %p21; selp.f32 %f100, %f3, %f96, %p21; add.s64 %rd58, %rd2, %rd53; ld.global.nc.u8 %rs19, [%rd58]; setp.gt.u16 %p22, %rs19, %rs1; cvt.u32.u16 %r101, %rs19; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p22; selp.b32 %r104, %r102, %r7, %p22; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 
%rd59, %r108, 4; add.s64 %rd60, %rd1, %rd59; ld.global.nc.f32 %f101, [%rd60]; mul.f32 %f102, %f101, %f32; sub.f32 %f103, %f98, %f1; sub.f32 %f104, %f99, %f2; sub.f32 %f105, %f100, %f3; fma.rn.f32 %f16, %f102, %f103, %f128; fma.rn.f32 %f17, %f102, %f104, %f129; fma.rn.f32 %f18, %f102, %f105, %f130; add.s32 %r28, %r3, 1; @%p20 bra BB0_20; rem.s32 %r109, %r28, %r34; add.s32 %r110, %r109, %r34; rem.s32 %r127, %r110, %r34; bra.uni BB0_21; BB0_20: add.s32 %r111, %r34, -1; min.s32 %r127, %r28, %r111; BB0_21: mad.lo.s32 %r112, %r127, %r33, %r2; mad.lo.s32 %r113, %r112, %r32, %r1; cvt.s64.s32 %rd61, %r113; mul.wide.s32 %rd62, %r113, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f106, [%rd63]; ld.global.nc.f32 %f107, [%rd64]; mul.f32 %f108, %f107, %f107; fma.rn.f32 %f109, %f106, %f106, %f108; ld.global.nc.f32 %f110, [%rd65]; fma.rn.f32 %f111, %f110, %f110, %f109; setp.eq.f32 %p24, %f111, 0f00000000; selp.f32 %f112, %f1, %f106, %p24; selp.f32 %f113, %f2, %f107, %p24; selp.f32 %f114, %f3, %f110, %p24; add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs23, [%rd66]; setp.gt.u16 %p25, %rs23, %rs1; cvt.u32.u16 %r114, %rs23; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p25; selp.b32 %r117, %r115, %r7, %p25; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd67, %r121, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f115, [%rd68]; mul.f32 %f116, %f115, %f32; sub.f32 %f117, %f112, %f1; sub.f32 %f118, %f113, %f2; sub.f32 %f119, %f114, %f3; fma.rn.f32 %f128, %f116, %f117, %f16; fma.rn.f32 %f129, %f116, %f118, %f17; fma.rn.f32 %f130, %f116, %f119, %f18; BB0_22: setp.eq.s64 %p26, %rd9, 0; @%p26 bra BB0_24; cvta.to.global.u64 %rd69, %rd9; add.s64 %rd71, %rd69, %rd15; ld.global.nc.f32 %f120, [%rd71]; mul.f32 %f131, %f120, %f131; BB0_24: setp.eq.f32 %p27, %f131, 0f00000000; mov.f32 %f132, 0f00000000; @%p27 bra BB0_26; rcp.rn.f32 %f132, %f131; BB0_26: 
cvta.to.global.u64 %rd72, %rd8; cvta.to.global.u64 %rd73, %rd7; cvta.to.global.u64 %rd74, %rd6; add.s64 %rd76, %rd74, %rd15; ld.global.f32 %f122, [%rd76]; fma.rn.f32 %f123, %f128, %f132, %f122; st.global.f32 [%rd76], %f123; add.s64 %rd77, %rd73, %rd15; ld.global.f32 %f124, [%rd77]; fma.rn.f32 %f125, %f129, %f132, %f124; st.global.f32 [%rd77], %f125; add.s64 %rd78, %rd72, %rd15; ld.global.f32 %f126, [%rd78]; fma.rn.f32 %f127, %f130, %f132, %f126; st.global.f32 [%rd78], %f127; BB0_27: ret; } ` addexchange_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred %p<28>; .reg .b16 %rs<26>; .reg .f32 %f<133>; .reg .b32 %r<128>; .reg .b64 %rd<79>; ld.param.u64 %rd6, [addexchange_param_0]; ld.param.u64 %rd7, [addexchange_param_1]; ld.param.u64 %rd8, [addexchange_param_2]; ld.param.u64 %rd10, [addexchange_param_3]; ld.param.u64 %rd11, [addexchange_param_4]; ld.param.u64 %rd12, [addexchange_param_5]; ld.param.u64 %rd9, [addexchange_param_6]; ld.param.f32 %f131, [addexchange_param_7]; ld.param.u64 %rd13, [addexchange_param_8]; ld.param.u64 %rd14, [addexchange_param_9]; ld.param.f32 %f30, [addexchange_param_10]; ld.param.f32 %f31, [addexchange_param_11]; ld.param.f32 %f32, [addexchange_param_12]; ld.param.u32 %r32, [addexchange_param_13]; ld.param.u32 %r33, [addexchange_param_14]; ld.param.u32 %r34, [addexchange_param_15]; ld.param.u8 %rs5, 
[addexchange_param_16]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd14; cvta.to.global.u64 %rd3, %rd12; cvta.to.global.u64 %rd4, %rd11; cvta.to.global.u64 %rd5, %rd10; mov.u32 %r35, %ntid.x; mov.u32 %r36, %ctaid.x; mov.u32 %r37, %tid.x; mad.lo.s32 %r1, %r35, %r36, %r37; mov.u32 %r38, %ntid.y; mov.u32 %r39, %ctaid.y; mov.u32 %r40, %tid.y; mad.lo.s32 %r2, %r38, %r39, %r40; mov.u32 %r41, %ntid.z; mov.u32 %r42, %ctaid.z; mov.u32 %r43, %tid.z; mad.lo.s32 %r3, %r41, %r42, %r43; setp.ge.s32 %p1, %r2, %r33; setp.ge.s32 %p2, %r1, %r32; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r34; or.pred %p5, %p3, %p4; @%p5 bra BB0_27; mul.lo.s32 %r4, %r3, %r33; add.s32 %r44, %r4, %r2; mul.lo.s32 %r5, %r44, %r32; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd15, %r6, 4; add.s64 %rd16, %rd5, %rd15; add.s64 %rd17, %rd4, %rd15; add.s64 %rd18, %rd3, %rd15; ld.global.nc.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd17]; mul.f32 %f33, %f2, %f2; fma.rn.f32 %f34, %f1, %f1, %f33; ld.global.nc.f32 %f3, [%rd18]; fma.rn.f32 %f35, %f3, %f3, %f34; setp.eq.f32 %p6, %f35, 0f00000000; @%p6 bra BB0_27; cvt.s64.s32 %rd19, %r6; add.s64 %rd20, %rd2, %rd19; ld.global.nc.u8 %rs1, [%rd20]; cvt.u32.u16 %r45, %rs1; and.b32 %r7, %r45, 255; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r46, %r8, %r32; add.s32 %r47, %r46, %r32; rem.s32 %r122, %r47, %r32; bra.uni BB0_5; BB0_4: mov.u32 %r48, 0; max.s32 %r122, %r8, %r48; BB0_5: add.s32 %r49, %r122, %r5; cvt.s64.s32 %rd21, %r49; mul.wide.s32 %rd22, %r49, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f36, [%rd23]; ld.global.nc.f32 %f37, [%rd24]; mul.f32 %f38, %f37, %f37; fma.rn.f32 %f39, %f36, %f36, %f38; ld.global.nc.f32 %f40, [%rd25]; fma.rn.f32 %f41, %f40, %f40, %f39; setp.eq.f32 %p8, %f41, 0f00000000; selp.f32 %f42, %f1, %f36, %p8; selp.f32 %f43, %f2, %f37, %p8; selp.f32 %f44, %f3, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, [%rd26]; 
setp.gt.u16 %p9, %rs6, %rs1; cvt.u32.u16 %r50, %rs6; and.b32 %r51, %r50, 255; selp.b32 %r52, %r7, %r51, %p9; selp.b32 %r53, %r51, %r7, %p9; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd27, %r57, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 %f45, [%rd28]; mul.f32 %f46, %f45, %f30; sub.f32 %f47, %f42, %f1; sub.f32 %f48, %f43, %f2; sub.f32 %f49, %f44, %f3; fma.rn.f32 %f4, %f46, %f47, 0f00000000; fma.rn.f32 %f5, %f46, %f48, 0f00000000; fma.rn.f32 %f6, %f46, %f49, 0f00000000; add.s32 %r12, %r1, 1; @%p7 bra BB0_7; rem.s32 %r58, %r12, %r32; add.s32 %r59, %r58, %r32; rem.s32 %r123, %r59, %r32; bra.uni BB0_8; BB0_7: add.s32 %r60, %r32, -1; min.s32 %r123, %r12, %r60; BB0_8: add.s32 %r61, %r123, %r5; cvt.s64.s32 %rd29, %r61; mul.wide.s32 %rd30, %r61, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, %rd4, %rd30; add.s64 %rd33, %rd3, %rd30; ld.global.nc.f32 %f50, [%rd31]; ld.global.nc.f32 %f51, [%rd32]; mul.f32 %f52, %f51, %f51; fma.rn.f32 %f53, %f50, %f50, %f52; ld.global.nc.f32 %f54, [%rd33]; fma.rn.f32 %f55, %f54, %f54, %f53; setp.eq.f32 %p11, %f55, 0f00000000; selp.f32 %f56, %f1, %f50, %p11; selp.f32 %f57, %f2, %f51, %p11; selp.f32 %f58, %f3, %f54, %p11; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs9, [%rd34]; setp.gt.u16 %p12, %rs9, %rs1; cvt.u32.u16 %r62, %rs9; and.b32 %r63, %r62, 255; selp.b32 %r64, %r7, %r63, %p12; selp.b32 %r65, %r63, %r7, %p12; add.s32 %r66, %r65, 1; mul.lo.s32 %r67, %r66, %r65; shr.u32 %r68, %r67, 1; add.s32 %r69, %r68, %r64; mul.wide.s32 %rd35, %r69, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f59, [%rd36]; mul.f32 %f60, %f59, %f30; sub.f32 %f61, %f56, %f1; sub.f32 %f62, %f57, %f2; sub.f32 %f63, %f58, %f3; fma.rn.f32 %f7, %f60, %f61, %f4; fma.rn.f32 %f8, %f60, %f62, %f5; fma.rn.f32 %f9, %f60, %f63, %f6; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p13, %rs3, 0; add.s32 %r16, %r2, -1; @%p13 bra BB0_10; rem.s32 %r70, %r16, %r33; add.s32 %r71, %r70, %r33; rem.s32 %r124, %r71, %r33; 
bra.uni BB0_11; BB0_10: mov.u32 %r72, 0; max.s32 %r124, %r16, %r72; BB0_11: add.s32 %r73, %r124, %r4; mad.lo.s32 %r74, %r73, %r32, %r1; cvt.s64.s32 %rd37, %r74; mul.wide.s32 %rd38, %r74, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f64, [%rd39]; ld.global.nc.f32 %f65, [%rd40]; mul.f32 %f66, %f65, %f65; fma.rn.f32 %f67, %f64, %f64, %f66; ld.global.nc.f32 %f68, [%rd41]; fma.rn.f32 %f69, %f68, %f68, %f67; setp.eq.f32 %p14, %f69, 0f00000000; selp.f32 %f70, %f1, %f64, %p14; selp.f32 %f71, %f2, %f65, %p14; selp.f32 %f72, %f3, %f68, %p14; add.s64 %rd42, %rd2, %rd37; ld.global.nc.u8 %rs12, [%rd42]; setp.gt.u16 %p15, %rs12, %rs1; cvt.u32.u16 %r75, %rs12; and.b32 %r76, %r75, 255; selp.b32 %r77, %r7, %r76, %p15; selp.b32 %r78, %r76, %r7, %p15; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; add.s32 %r82, %r81, %r77; mul.wide.s32 %rd43, %r82, 4; add.s64 %rd44, %rd1, %rd43; ld.global.nc.f32 %f73, [%rd44]; mul.f32 %f74, %f73, %f31; sub.f32 %f75, %f70, %f1; sub.f32 %f76, %f71, %f2; sub.f32 %f77, %f72, %f3; fma.rn.f32 %f10, %f74, %f75, %f7; fma.rn.f32 %f11, %f74, %f76, %f8; fma.rn.f32 %f12, %f74, %f77, %f9; add.s32 %r20, %r2, 1; @%p13 bra BB0_13; rem.s32 %r83, %r20, %r33; add.s32 %r84, %r83, %r33; rem.s32 %r125, %r84, %r33; bra.uni BB0_14; BB0_13: add.s32 %r85, %r33, -1; min.s32 %r125, %r20, %r85; BB0_14: add.s32 %r86, %r125, %r4; mad.lo.s32 %r87, %r86, %r32, %r1; cvt.s64.s32 %rd45, %r87; mul.wide.s32 %rd46, %r87, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f78, [%rd47]; ld.global.nc.f32 %f79, [%rd48]; mul.f32 %f80, %f79, %f79; fma.rn.f32 %f81, %f78, %f78, %f80; ld.global.nc.f32 %f82, [%rd49]; fma.rn.f32 %f83, %f82, %f82, %f81; setp.eq.f32 %p17, %f83, 0f00000000; selp.f32 %f84, %f1, %f78, %p17; selp.f32 %f85, %f2, %f79, %p17; selp.f32 %f86, %f3, %f82, %p17; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs16, [%rd50]; setp.gt.u16 %p18, 
%rs16, %rs1; cvt.u32.u16 %r88, %rs16; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p18; selp.b32 %r91, %r89, %r7, %p18; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd51, %r95, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f87, [%rd52]; mul.f32 %f88, %f87, %f31; sub.f32 %f89, %f84, %f1; sub.f32 %f90, %f85, %f2; sub.f32 %f91, %f86, %f3; fma.rn.f32 %f128, %f88, %f89, %f10; fma.rn.f32 %f129, %f88, %f90, %f11; fma.rn.f32 %f130, %f88, %f91, %f12; setp.eq.s32 %p19, %r34, 1; @%p19 bra BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p20, %rs4, 0; add.s32 %r24, %r3, -1; @%p20 bra BB0_17; rem.s32 %r96, %r24, %r34; add.s32 %r97, %r96, %r34; rem.s32 %r126, %r97, %r34; bra.uni BB0_18; BB0_17: mov.u32 %r98, 0; max.s32 %r126, %r24, %r98; BB0_18: mad.lo.s32 %r99, %r126, %r33, %r2; mad.lo.s32 %r100, %r99, %r32, %r1; cvt.s64.s32 %rd53, %r100; mul.wide.s32 %rd54, %r100, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f92, [%rd55]; ld.global.nc.f32 %f93, [%rd56]; mul.f32 %f94, %f93, %f93; fma.rn.f32 %f95, %f92, %f92, %f94; ld.global.nc.f32 %f96, [%rd57]; fma.rn.f32 %f97, %f96, %f96, %f95; setp.eq.f32 %p21, %f97, 0f00000000; selp.f32 %f98, %f1, %f92, %p21; selp.f32 %f99, %f2, %f93, %p21; selp.f32 %f100, %f3, %f96, %p21; add.s64 %rd58, %rd2, %rd53; ld.global.nc.u8 %rs19, [%rd58]; setp.gt.u16 %p22, %rs19, %rs1; cvt.u32.u16 %r101, %rs19; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p22; selp.b32 %r104, %r102, %r7, %p22; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd59, %r108, 4; add.s64 %rd60, %rd1, %rd59; ld.global.nc.f32 %f101, [%rd60]; mul.f32 %f102, %f101, %f32; sub.f32 %f103, %f98, %f1; sub.f32 %f104, %f99, %f2; sub.f32 %f105, %f100, %f3; fma.rn.f32 %f16, %f102, %f103, %f128; fma.rn.f32 %f17, %f102, %f104, %f129; fma.rn.f32 %f18, %f102, %f105, %f130; add.s32 
%r28, %r3, 1; @%p20 bra BB0_20; rem.s32 %r109, %r28, %r34; add.s32 %r110, %r109, %r34; rem.s32 %r127, %r110, %r34; bra.uni BB0_21; BB0_20: add.s32 %r111, %r34, -1; min.s32 %r127, %r28, %r111; BB0_21: mad.lo.s32 %r112, %r127, %r33, %r2; mad.lo.s32 %r113, %r112, %r32, %r1; cvt.s64.s32 %rd61, %r113; mul.wide.s32 %rd62, %r113, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f106, [%rd63]; ld.global.nc.f32 %f107, [%rd64]; mul.f32 %f108, %f107, %f107; fma.rn.f32 %f109, %f106, %f106, %f108; ld.global.nc.f32 %f110, [%rd65]; fma.rn.f32 %f111, %f110, %f110, %f109; setp.eq.f32 %p24, %f111, 0f00000000; selp.f32 %f112, %f1, %f106, %p24; selp.f32 %f113, %f2, %f107, %p24; selp.f32 %f114, %f3, %f110, %p24; add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs23, [%rd66]; setp.gt.u16 %p25, %rs23, %rs1; cvt.u32.u16 %r114, %rs23; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p25; selp.b32 %r117, %r115, %r7, %p25; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd67, %r121, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f115, [%rd68]; mul.f32 %f116, %f115, %f32; sub.f32 %f117, %f112, %f1; sub.f32 %f118, %f113, %f2; sub.f32 %f119, %f114, %f3; fma.rn.f32 %f128, %f116, %f117, %f16; fma.rn.f32 %f129, %f116, %f118, %f17; fma.rn.f32 %f130, %f116, %f119, %f18; BB0_22: setp.eq.s64 %p26, %rd9, 0; @%p26 bra BB0_24; cvta.to.global.u64 %rd69, %rd9; add.s64 %rd71, %rd69, %rd15; ld.global.nc.f32 %f120, [%rd71]; mul.f32 %f131, %f120, %f131; BB0_24: setp.eq.f32 %p27, %f131, 0f00000000; mov.f32 %f132, 0f00000000; @%p27 bra BB0_26; rcp.rn.f32 %f132, %f131; BB0_26: cvta.to.global.u64 %rd72, %rd8; cvta.to.global.u64 %rd73, %rd7; cvta.to.global.u64 %rd74, %rd6; add.s64 %rd76, %rd74, %rd15; ld.global.f32 %f122, [%rd76]; fma.rn.f32 %f123, %f128, %f132, %f122; st.global.f32 [%rd76], %f123; add.s64 %rd77, %rd73, %rd15; ld.global.f32 %f124, [%rd77]; fma.rn.f32 %f125, 
%f129, %f132, %f124; st.global.f32 [%rd77], %f125; add.s64 %rd78, %rd72, %rd15; ld.global.f32 %f126, [%rd78]; fma.rn.f32 %f127, %f130, %f132, %f126; st.global.f32 [%rd78], %f127; BB0_27: ret; } ` addexchange_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl addexchange .visible .entry addexchange( .param .u64 addexchange_param_0, .param .u64 addexchange_param_1, .param .u64 addexchange_param_2, .param .u64 addexchange_param_3, .param .u64 addexchange_param_4, .param .u64 addexchange_param_5, .param .u64 addexchange_param_6, .param .f32 addexchange_param_7, .param .u64 addexchange_param_8, .param .u64 addexchange_param_9, .param .f32 addexchange_param_10, .param .f32 addexchange_param_11, .param .f32 addexchange_param_12, .param .u32 addexchange_param_13, .param .u32 addexchange_param_14, .param .u32 addexchange_param_15, .param .u8 addexchange_param_16 ) { .reg .pred %p<28>; .reg .b16 %rs<26>; .reg .f32 %f<133>; .reg .b32 %r<128>; .reg .b64 %rd<79>; ld.param.u64 %rd6, [addexchange_param_0]; ld.param.u64 %rd7, [addexchange_param_1]; ld.param.u64 %rd8, [addexchange_param_2]; ld.param.u64 %rd10, [addexchange_param_3]; ld.param.u64 %rd11, [addexchange_param_4]; ld.param.u64 %rd12, [addexchange_param_5]; ld.param.u64 %rd9, [addexchange_param_6]; ld.param.f32 %f131, [addexchange_param_7]; ld.param.u64 %rd13, [addexchange_param_8]; ld.param.u64 %rd14, [addexchange_param_9]; ld.param.f32 %f30, [addexchange_param_10]; ld.param.f32 %f31, [addexchange_param_11]; ld.param.f32 %f32, [addexchange_param_12]; ld.param.u32 %r32, [addexchange_param_13]; ld.param.u32 %r33, [addexchange_param_14]; ld.param.u32 %r34, [addexchange_param_15]; ld.param.u8 %rs5, [addexchange_param_16]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd14; cvta.to.global.u64 %rd3, %rd12; cvta.to.global.u64 %rd4, %rd11; cvta.to.global.u64 %rd5, %rd10; mov.u32 %r35, %ntid.x; mov.u32 %r36, %ctaid.x; mov.u32 %r37, %tid.x; mad.lo.s32 %r1, %r35, %r36, %r37; mov.u32 %r38, %ntid.y; mov.u32 
%r39, %ctaid.y; mov.u32 %r40, %tid.y; mad.lo.s32 %r2, %r38, %r39, %r40; mov.u32 %r41, %ntid.z; mov.u32 %r42, %ctaid.z; mov.u32 %r43, %tid.z; mad.lo.s32 %r3, %r41, %r42, %r43; setp.ge.s32 %p1, %r2, %r33; setp.ge.s32 %p2, %r1, %r32; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r34; or.pred %p5, %p3, %p4; @%p5 bra BB0_27; mul.lo.s32 %r4, %r3, %r33; add.s32 %r44, %r4, %r2; mul.lo.s32 %r5, %r44, %r32; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd15, %r6, 4; add.s64 %rd16, %rd5, %rd15; add.s64 %rd17, %rd4, %rd15; add.s64 %rd18, %rd3, %rd15; ld.global.nc.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd17]; mul.f32 %f33, %f2, %f2; fma.rn.f32 %f34, %f1, %f1, %f33; ld.global.nc.f32 %f3, [%rd18]; fma.rn.f32 %f35, %f3, %f3, %f34; setp.eq.f32 %p6, %f35, 0f00000000; @%p6 bra BB0_27; cvt.s64.s32 %rd19, %r6; add.s64 %rd20, %rd2, %rd19; ld.global.nc.u8 %rs1, [%rd20]; cvt.u32.u16 %r45, %rs1; and.b32 %r7, %r45, 255; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r46, %r8, %r32; add.s32 %r47, %r46, %r32; rem.s32 %r122, %r47, %r32; bra.uni BB0_5; BB0_4: mov.u32 %r48, 0; max.s32 %r122, %r8, %r48; BB0_5: add.s32 %r49, %r122, %r5; cvt.s64.s32 %rd21, %r49; mul.wide.s32 %rd22, %r49, 4; add.s64 %rd23, %rd5, %rd22; add.s64 %rd24, %rd4, %rd22; add.s64 %rd25, %rd3, %rd22; ld.global.nc.f32 %f36, [%rd23]; ld.global.nc.f32 %f37, [%rd24]; mul.f32 %f38, %f37, %f37; fma.rn.f32 %f39, %f36, %f36, %f38; ld.global.nc.f32 %f40, [%rd25]; fma.rn.f32 %f41, %f40, %f40, %f39; setp.eq.f32 %p8, %f41, 0f00000000; selp.f32 %f42, %f1, %f36, %p8; selp.f32 %f43, %f2, %f37, %p8; selp.f32 %f44, %f3, %f40, %p8; add.s64 %rd26, %rd2, %rd21; ld.global.nc.u8 %rs6, [%rd26]; setp.gt.u16 %p9, %rs6, %rs1; cvt.u32.u16 %r50, %rs6; and.b32 %r51, %r50, 255; selp.b32 %r52, %r7, %r51, %p9; selp.b32 %r53, %r51, %r7, %p9; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd27, %r57, 4; add.s64 %rd28, %rd1, %rd27; ld.global.nc.f32 
%f45, [%rd28]; mul.f32 %f46, %f45, %f30; sub.f32 %f47, %f42, %f1; sub.f32 %f48, %f43, %f2; sub.f32 %f49, %f44, %f3; fma.rn.f32 %f4, %f46, %f47, 0f00000000; fma.rn.f32 %f5, %f46, %f48, 0f00000000; fma.rn.f32 %f6, %f46, %f49, 0f00000000; add.s32 %r12, %r1, 1; @%p7 bra BB0_7; rem.s32 %r58, %r12, %r32; add.s32 %r59, %r58, %r32; rem.s32 %r123, %r59, %r32; bra.uni BB0_8; BB0_7: add.s32 %r60, %r32, -1; min.s32 %r123, %r12, %r60; BB0_8: add.s32 %r61, %r123, %r5; cvt.s64.s32 %rd29, %r61; mul.wide.s32 %rd30, %r61, 4; add.s64 %rd31, %rd5, %rd30; add.s64 %rd32, %rd4, %rd30; add.s64 %rd33, %rd3, %rd30; ld.global.nc.f32 %f50, [%rd31]; ld.global.nc.f32 %f51, [%rd32]; mul.f32 %f52, %f51, %f51; fma.rn.f32 %f53, %f50, %f50, %f52; ld.global.nc.f32 %f54, [%rd33]; fma.rn.f32 %f55, %f54, %f54, %f53; setp.eq.f32 %p11, %f55, 0f00000000; selp.f32 %f56, %f1, %f50, %p11; selp.f32 %f57, %f2, %f51, %p11; selp.f32 %f58, %f3, %f54, %p11; add.s64 %rd34, %rd2, %rd29; ld.global.nc.u8 %rs9, [%rd34]; setp.gt.u16 %p12, %rs9, %rs1; cvt.u32.u16 %r62, %rs9; and.b32 %r63, %r62, 255; selp.b32 %r64, %r7, %r63, %p12; selp.b32 %r65, %r63, %r7, %p12; add.s32 %r66, %r65, 1; mul.lo.s32 %r67, %r66, %r65; shr.u32 %r68, %r67, 1; add.s32 %r69, %r68, %r64; mul.wide.s32 %rd35, %r69, 4; add.s64 %rd36, %rd1, %rd35; ld.global.nc.f32 %f59, [%rd36]; mul.f32 %f60, %f59, %f30; sub.f32 %f61, %f56, %f1; sub.f32 %f62, %f57, %f2; sub.f32 %f63, %f58, %f3; fma.rn.f32 %f7, %f60, %f61, %f4; fma.rn.f32 %f8, %f60, %f62, %f5; fma.rn.f32 %f9, %f60, %f63, %f6; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p13, %rs3, 0; add.s32 %r16, %r2, -1; @%p13 bra BB0_10; rem.s32 %r70, %r16, %r33; add.s32 %r71, %r70, %r33; rem.s32 %r124, %r71, %r33; bra.uni BB0_11; BB0_10: mov.u32 %r72, 0; max.s32 %r124, %r16, %r72; BB0_11: add.s32 %r73, %r124, %r4; mad.lo.s32 %r74, %r73, %r32, %r1; cvt.s64.s32 %rd37, %r74; mul.wide.s32 %rd38, %r74, 4; add.s64 %rd39, %rd5, %rd38; add.s64 %rd40, %rd4, %rd38; add.s64 %rd41, %rd3, %rd38; ld.global.nc.f32 %f64, [%rd39]; 
ld.global.nc.f32 %f65, [%rd40]; mul.f32 %f66, %f65, %f65; fma.rn.f32 %f67, %f64, %f64, %f66; ld.global.nc.f32 %f68, [%rd41]; fma.rn.f32 %f69, %f68, %f68, %f67; setp.eq.f32 %p14, %f69, 0f00000000; selp.f32 %f70, %f1, %f64, %p14; selp.f32 %f71, %f2, %f65, %p14; selp.f32 %f72, %f3, %f68, %p14; add.s64 %rd42, %rd2, %rd37; ld.global.nc.u8 %rs12, [%rd42]; setp.gt.u16 %p15, %rs12, %rs1; cvt.u32.u16 %r75, %rs12; and.b32 %r76, %r75, 255; selp.b32 %r77, %r7, %r76, %p15; selp.b32 %r78, %r76, %r7, %p15; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; add.s32 %r82, %r81, %r77; mul.wide.s32 %rd43, %r82, 4; add.s64 %rd44, %rd1, %rd43; ld.global.nc.f32 %f73, [%rd44]; mul.f32 %f74, %f73, %f31; sub.f32 %f75, %f70, %f1; sub.f32 %f76, %f71, %f2; sub.f32 %f77, %f72, %f3; fma.rn.f32 %f10, %f74, %f75, %f7; fma.rn.f32 %f11, %f74, %f76, %f8; fma.rn.f32 %f12, %f74, %f77, %f9; add.s32 %r20, %r2, 1; @%p13 bra BB0_13; rem.s32 %r83, %r20, %r33; add.s32 %r84, %r83, %r33; rem.s32 %r125, %r84, %r33; bra.uni BB0_14; BB0_13: add.s32 %r85, %r33, -1; min.s32 %r125, %r20, %r85; BB0_14: add.s32 %r86, %r125, %r4; mad.lo.s32 %r87, %r86, %r32, %r1; cvt.s64.s32 %rd45, %r87; mul.wide.s32 %rd46, %r87, 4; add.s64 %rd47, %rd5, %rd46; add.s64 %rd48, %rd4, %rd46; add.s64 %rd49, %rd3, %rd46; ld.global.nc.f32 %f78, [%rd47]; ld.global.nc.f32 %f79, [%rd48]; mul.f32 %f80, %f79, %f79; fma.rn.f32 %f81, %f78, %f78, %f80; ld.global.nc.f32 %f82, [%rd49]; fma.rn.f32 %f83, %f82, %f82, %f81; setp.eq.f32 %p17, %f83, 0f00000000; selp.f32 %f84, %f1, %f78, %p17; selp.f32 %f85, %f2, %f79, %p17; selp.f32 %f86, %f3, %f82, %p17; add.s64 %rd50, %rd2, %rd45; ld.global.nc.u8 %rs16, [%rd50]; setp.gt.u16 %p18, %rs16, %rs1; cvt.u32.u16 %r88, %rs16; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p18; selp.b32 %r91, %r89, %r7, %p18; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd51, %r95, 4; add.s64 %rd52, %rd1, %rd51; ld.global.nc.f32 %f87, 
[%rd52]; mul.f32 %f88, %f87, %f31; sub.f32 %f89, %f84, %f1; sub.f32 %f90, %f85, %f2; sub.f32 %f91, %f86, %f3; fma.rn.f32 %f128, %f88, %f89, %f10; fma.rn.f32 %f129, %f88, %f90, %f11; fma.rn.f32 %f130, %f88, %f91, %f12; setp.eq.s32 %p19, %r34, 1; @%p19 bra BB0_22; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p20, %rs4, 0; add.s32 %r24, %r3, -1; @%p20 bra BB0_17; rem.s32 %r96, %r24, %r34; add.s32 %r97, %r96, %r34; rem.s32 %r126, %r97, %r34; bra.uni BB0_18; BB0_17: mov.u32 %r98, 0; max.s32 %r126, %r24, %r98; BB0_18: mad.lo.s32 %r99, %r126, %r33, %r2; mad.lo.s32 %r100, %r99, %r32, %r1; cvt.s64.s32 %rd53, %r100; mul.wide.s32 %rd54, %r100, 4; add.s64 %rd55, %rd5, %rd54; add.s64 %rd56, %rd4, %rd54; add.s64 %rd57, %rd3, %rd54; ld.global.nc.f32 %f92, [%rd55]; ld.global.nc.f32 %f93, [%rd56]; mul.f32 %f94, %f93, %f93; fma.rn.f32 %f95, %f92, %f92, %f94; ld.global.nc.f32 %f96, [%rd57]; fma.rn.f32 %f97, %f96, %f96, %f95; setp.eq.f32 %p21, %f97, 0f00000000; selp.f32 %f98, %f1, %f92, %p21; selp.f32 %f99, %f2, %f93, %p21; selp.f32 %f100, %f3, %f96, %p21; add.s64 %rd58, %rd2, %rd53; ld.global.nc.u8 %rs19, [%rd58]; setp.gt.u16 %p22, %rs19, %rs1; cvt.u32.u16 %r101, %rs19; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p22; selp.b32 %r104, %r102, %r7, %p22; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd59, %r108, 4; add.s64 %rd60, %rd1, %rd59; ld.global.nc.f32 %f101, [%rd60]; mul.f32 %f102, %f101, %f32; sub.f32 %f103, %f98, %f1; sub.f32 %f104, %f99, %f2; sub.f32 %f105, %f100, %f3; fma.rn.f32 %f16, %f102, %f103, %f128; fma.rn.f32 %f17, %f102, %f104, %f129; fma.rn.f32 %f18, %f102, %f105, %f130; add.s32 %r28, %r3, 1; @%p20 bra BB0_20; rem.s32 %r109, %r28, %r34; add.s32 %r110, %r109, %r34; rem.s32 %r127, %r110, %r34; bra.uni BB0_21; BB0_20: add.s32 %r111, %r34, -1; min.s32 %r127, %r28, %r111; BB0_21: mad.lo.s32 %r112, %r127, %r33, %r2; mad.lo.s32 %r113, %r112, %r32, %r1; cvt.s64.s32 %rd61, %r113; mul.wide.s32 
%rd62, %r113, 4; add.s64 %rd63, %rd5, %rd62; add.s64 %rd64, %rd4, %rd62; add.s64 %rd65, %rd3, %rd62; ld.global.nc.f32 %f106, [%rd63]; ld.global.nc.f32 %f107, [%rd64]; mul.f32 %f108, %f107, %f107; fma.rn.f32 %f109, %f106, %f106, %f108; ld.global.nc.f32 %f110, [%rd65]; fma.rn.f32 %f111, %f110, %f110, %f109; setp.eq.f32 %p24, %f111, 0f00000000; selp.f32 %f112, %f1, %f106, %p24; selp.f32 %f113, %f2, %f107, %p24; selp.f32 %f114, %f3, %f110, %p24; add.s64 %rd66, %rd2, %rd61; ld.global.nc.u8 %rs23, [%rd66]; setp.gt.u16 %p25, %rs23, %rs1; cvt.u32.u16 %r114, %rs23; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p25; selp.b32 %r117, %r115, %r7, %p25; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd67, %r121, 4; add.s64 %rd68, %rd1, %rd67; ld.global.nc.f32 %f115, [%rd68]; mul.f32 %f116, %f115, %f32; sub.f32 %f117, %f112, %f1; sub.f32 %f118, %f113, %f2; sub.f32 %f119, %f114, %f3; fma.rn.f32 %f128, %f116, %f117, %f16; fma.rn.f32 %f129, %f116, %f118, %f17; fma.rn.f32 %f130, %f116, %f119, %f18; BB0_22: setp.eq.s64 %p26, %rd9, 0; @%p26 bra BB0_24; cvta.to.global.u64 %rd69, %rd9; add.s64 %rd71, %rd69, %rd15; ld.global.nc.f32 %f120, [%rd71]; mul.f32 %f131, %f120, %f131; BB0_24: setp.eq.f32 %p27, %f131, 0f00000000; mov.f32 %f132, 0f00000000; @%p27 bra BB0_26; rcp.rn.f32 %f132, %f131; BB0_26: cvta.to.global.u64 %rd72, %rd8; cvta.to.global.u64 %rd73, %rd7; cvta.to.global.u64 %rd74, %rd6; add.s64 %rd76, %rd74, %rd15; ld.global.f32 %f122, [%rd76]; fma.rn.f32 %f123, %f128, %f132, %f122; st.global.f32 [%rd76], %f123; add.s64 %rd77, %rd73, %rd15; ld.global.f32 %f124, [%rd77]; fma.rn.f32 %f125, %f129, %f132, %f124; st.global.f32 [%rd77], %f125; add.s64 %rd78, %rd72, %rd15; ld.global.f32 %f126, [%rd78]; fma.rn.f32 %f127, %f130, %f132, %f126; st.global.f32 [%rd78], %f127; BB0_27: ret; } ` ) 
mumax3-3.10/cuda/exchangedecode.cu000066400000000000000000000026761371432437400170650ustar00rootroot00000000000000#include <stdint.h>
#include "stencil.h"
#include "float3.h"
#include "exchange.h"

// see exchange.go
//
// exchangedecode writes, for every cell of an Nx x Ny x Nz grid, the sum of the
// aLUT2d entries selected by the region of the cell and the region of each of
// its nearest neighbors (4 in-plane neighbors, plus 2 vertical ones when Nz != 1).
//
//   dst      output, one float per cell
//   aLUT2d   lookup table, indexed through symidx(regionA, regionB)
//   regions  region number of each cell
//   wx,wy,wz not used by this kernel
//   Nx,Ny,Nz grid size, in cells
//   PBC      boundary flags consulted by the clamp helpers
//
// idx, lclamp*/hclamp* and symidx are not defined here; presumably they come
// from stencil.h / exchange.h and read Nx, Ny, Nz and PBC from the enclosing scope.
extern "C" __global__ void
exchangedecode(float* __restrict__ dst, float* __restrict__ aLUT2d, uint8_t* __restrict__ regions,
               float wx, float wy, float wz, int Nx, int Ny, int Nz, uint8_t PBC) {

    // one thread per cell
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    int iz = blockIdx.z * blockDim.z + threadIdx.z;

    // threads that fall outside the grid have nothing to do
    if (ix >= Nx || iy >= Ny || iz >= Nz) {
        return;
    }

    // central cell
    int I = idx(ix, iy, iz);
    uint8_t r0 = regions[I];

    int i_;           // neighbor index
    float avg = 0.0f; // running sum of LUT entries (never divided, despite the name)

    // left neighbor
    i_ = idx(lclampx(ix-1), iy, iz); // clamps or wraps index according to PBC
    avg += aLUT2d[symidx(r0, regions[i_])];

    // right neighbor
    i_ = idx(hclampx(ix+1), iy, iz);
    avg += aLUT2d[symidx(r0, regions[i_])];

    // back neighbor
    i_ = idx(ix, lclampy(iy-1), iz);
    avg += aLUT2d[symidx(r0, regions[i_])];

    // front neighbor
    i_ = idx(ix, hclampy(iy+1), iz);
    avg += aLUT2d[symidx(r0, regions[i_])];

    // only take vertical derivative for 3D sim
    if (Nz != 1) {
        // bottom neighbor
        i_ = idx(ix, iy, lclampz(iz-1));
        avg += aLUT2d[symidx(r0, regions[i_])];

        // top neighbor
        i_ = idx(ix, iy, hclampz(iz+1));
        avg += aLUT2d[symidx(r0, regions[i_])];
    }

    dst[I] = avg;
}
mumax3-3.10/cuda/exchangedecode_wrapper.go000066400000000000000000002605271371432437400206240ustar00rootroot00000000000000package cuda

/*
 THIS FILE IS AUTO-GENERATED BY CUDA2GO.
 EDITING IS FUTILE.
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for exchangedecode kernel var exchangedecode_code cu.Function // Stores the arguments for exchangedecode kernel invocation type exchangedecode_args_t struct { arg_dst unsafe.Pointer arg_aLUT2d unsafe.Pointer arg_regions unsafe.Pointer arg_wx float32 arg_wy float32 arg_wz float32 arg_Nx int arg_Ny int arg_Nz int arg_PBC byte argptr [10]unsafe.Pointer sync.Mutex } // Stores the arguments for exchangedecode kernel invocation var exchangedecode_args exchangedecode_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. exchangedecode_args.argptr[0] = unsafe.Pointer(&exchangedecode_args.arg_dst) exchangedecode_args.argptr[1] = unsafe.Pointer(&exchangedecode_args.arg_aLUT2d) exchangedecode_args.argptr[2] = unsafe.Pointer(&exchangedecode_args.arg_regions) exchangedecode_args.argptr[3] = unsafe.Pointer(&exchangedecode_args.arg_wx) exchangedecode_args.argptr[4] = unsafe.Pointer(&exchangedecode_args.arg_wy) exchangedecode_args.argptr[5] = unsafe.Pointer(&exchangedecode_args.arg_wz) exchangedecode_args.argptr[6] = unsafe.Pointer(&exchangedecode_args.arg_Nx) exchangedecode_args.argptr[7] = unsafe.Pointer(&exchangedecode_args.arg_Ny) exchangedecode_args.argptr[8] = unsafe.Pointer(&exchangedecode_args.arg_Nz) exchangedecode_args.argptr[9] = unsafe.Pointer(&exchangedecode_args.arg_PBC) } // Wrapper for exchangedecode CUDA kernel, asynchronous. 
func k_exchangedecode_async(dst unsafe.Pointer, aLUT2d unsafe.Pointer, regions unsafe.Pointer, wx float32, wy float32, wz float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) { if Synchronous { // debug Sync() timer.Start("exchangedecode") } exchangedecode_args.Lock() defer exchangedecode_args.Unlock() if exchangedecode_code == 0 { exchangedecode_code = fatbinLoad(exchangedecode_map, "exchangedecode") } exchangedecode_args.arg_dst = dst exchangedecode_args.arg_aLUT2d = aLUT2d exchangedecode_args.arg_regions = regions exchangedecode_args.arg_wx = wx exchangedecode_args.arg_wy = wy exchangedecode_args.arg_wz = wz exchangedecode_args.arg_Nx = Nx exchangedecode_args.arg_Ny = Ny exchangedecode_args.arg_Nz = Nz exchangedecode_args.arg_PBC = PBC args := exchangedecode_args.argptr[:] cu.LaunchKernel(exchangedecode_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("exchangedecode") } } // maps compute capability on PTX code for exchangedecode kernel. var exchangedecode_map = map[int]string{0: "", 30: exchangedecode_ptx_30, 32: exchangedecode_ptx_32, 35: exchangedecode_ptx_35, 37: exchangedecode_ptx_37, 50: exchangedecode_ptx_50, 52: exchangedecode_ptx_52, 53: exchangedecode_ptx_53, 60: exchangedecode_ptx_60, 61: exchangedecode_ptx_61, 62: exchangedecode_ptx_62, 70: exchangedecode_ptx_70, 72: exchangedecode_ptx_72, 75: exchangedecode_ptx_75} // exchangedecode PTX code for various compute capabilities. 
const ( exchangedecode_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<19>; .reg .b16 %rs<20>; .reg .f32 %f<15>; .reg .b32 %r<132>; .reg .b64 %rd<40>; ld.param.u64 %rd2, [exchangedecode_param_0]; ld.param.u64 %rd3, [exchangedecode_param_1]; ld.param.u64 %rd4, [exchangedecode_param_2]; ld.param.u32 %r31, [exchangedecode_param_6]; ld.param.u32 %r32, [exchangedecode_param_7]; ld.param.u32 %r33, [exchangedecode_param_8]; ld.param.u8 %rs5, [exchangedecode_param_9]; cvta.to.global.u64 %rd1, %rd4; mov.u32 %r34, %ntid.x; mov.u32 %r35, %ctaid.x; mov.u32 %r36, %tid.x; mad.lo.s32 %r1, %r34, %r35, %r36; mov.u32 %r37, %ntid.y; mov.u32 %r38, %ctaid.y; mov.u32 %r39, %tid.y; mad.lo.s32 %r2, %r37, %r38, %r39; mov.u32 %r40, %ntid.z; mov.u32 %r41, %ctaid.z; mov.u32 %r42, %tid.z; mad.lo.s32 %r3, %r40, %r41, %r42; setp.ge.s32 %p1, %r2, %r32; setp.ge.s32 %p2, %r1, %r31; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r33; or.pred %p5, %p3, %p4; @%p5 bra BB0_22; mul.lo.s32 %r4, %r3, %r32; add.s32 %r43, %r4, %r2; mul.lo.s32 %r5, %r43, %r31; add.s32 %r6, %r5, %r1; cvt.s64.s32 %rd5, %r6; add.s64 %rd6, %rd1, %rd5; ld.global.u8 %rs1, [%rd6]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r7, %r1, -1; @%p6 bra BB0_3; rem.s32 %r44, %r7, %r31; add.s32 %r45, %r44, %r31; rem.s32 %r126, %r45, %r31; bra.uni BB0_4; BB0_3: mov.u32 %r46, 0; max.s32 %r126, %r7, %r46; BB0_4: add.s32 %r47, %r126, %r5; cvt.s64.s32 %rd7, %r47; add.s64 %rd8, %rd1, %rd7; ld.global.u8 %rs6, [%rd8]; setp.gt.u16 %p7, %rs6, %rs1; cvt.u32.u16 %r48, %rs6; cvt.u32.u16 %r49, %rs1; and.b32 
%r50, %r49, 255; selp.b32 %r51, %r50, %r48, %p7; selp.b32 %r52, %r48, %r50, %p7; add.s32 %r53, %r52, 1; mul.lo.s32 %r54, %r53, %r52; shr.u32 %r55, %r54, 1; add.s32 %r56, %r55, %r51; cvta.to.global.u64 %rd9, %rd3; mul.wide.s32 %rd10, %r56, 4; add.s64 %rd11, %rd9, %rd10; ld.global.f32 %f1, [%rd11]; add.s32 %r11, %r1, 1; @%p6 bra BB0_6; rem.s32 %r57, %r11, %r31; add.s32 %r58, %r57, %r31; rem.s32 %r127, %r58, %r31; bra.uni BB0_7; BB0_6: add.s32 %r59, %r31, -1; min.s32 %r127, %r11, %r59; BB0_7: add.s32 %r60, %r127, %r5; cvt.s64.s32 %rd12, %r60; add.s64 %rd13, %rd1, %rd12; ld.global.u8 %rs8, [%rd13]; setp.gt.u16 %p9, %rs8, %rs1; cvt.u32.u16 %r61, %rs8; selp.b32 %r64, %r50, %r61, %p9; selp.b32 %r65, %r61, %r50, %p9; add.s32 %r66, %r65, 1; mul.lo.s32 %r67, %r66, %r65; shr.u32 %r68, %r67, 1; add.s32 %r69, %r68, %r64; mul.wide.s32 %rd15, %r69, 4; add.s64 %rd16, %rd9, %rd15; ld.global.f32 %f8, [%rd16]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p10, %rs3, 0; add.s32 %r15, %r2, -1; @%p10 bra BB0_9; rem.s32 %r70, %r15, %r32; add.s32 %r71, %r70, %r32; rem.s32 %r128, %r71, %r32; bra.uni BB0_10; BB0_9: mov.u32 %r72, 0; max.s32 %r128, %r15, %r72; BB0_10: add.s32 %r73, %r128, %r4; mad.lo.s32 %r74, %r73, %r31, %r1; cvt.s64.s32 %rd17, %r74; add.s64 %rd18, %rd1, %rd17; ld.global.u8 %rs10, [%rd18]; setp.gt.u16 %p11, %rs10, %rs1; cvt.u32.u16 %r75, %rs10; selp.b32 %r78, %r50, %r75, %p11; selp.b32 %r79, %r75, %r50, %p11; add.s32 %r80, %r79, 1; mul.lo.s32 %r81, %r80, %r79; shr.u32 %r82, %r81, 1; add.s32 %r83, %r82, %r78; mul.wide.s32 %rd20, %r83, 4; add.s64 %rd21, %rd9, %rd20; ld.global.f32 %f10, [%rd21]; add.f32 %f3, %f2, %f10; add.s32 %r19, %r2, 1; @%p10 bra BB0_12; rem.s32 %r84, %r19, %r32; add.s32 %r85, %r84, %r32; rem.s32 %r129, %r85, %r32; bra.uni BB0_13; BB0_12: add.s32 %r86, %r32, -1; min.s32 %r129, %r19, %r86; BB0_13: add.s32 %r87, %r129, %r4; mad.lo.s32 %r88, %r87, %r31, %r1; cvt.s64.s32 %rd22, %r88; add.s64 %rd23, %rd1, %rd22; 
ld.global.u8 %rs13, [%rd23]; setp.gt.u16 %p13, %rs13, %rs1; cvt.u32.u16 %r89, %rs13; selp.b32 %r92, %r50, %r89, %p13; selp.b32 %r93, %r89, %r50, %p13; add.s32 %r94, %r93, 1; mul.lo.s32 %r95, %r94, %r93; shr.u32 %r96, %r95, 1; add.s32 %r97, %r96, %r92; mul.wide.s32 %rd25, %r97, 4; add.s64 %rd26, %rd9, %rd25; ld.global.f32 %f11, [%rd26]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p14, %r33, 1; @%p14 bra BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p15, %rs4, 0; add.s32 %r23, %r3, -1; @%p15 bra BB0_16; rem.s32 %r98, %r23, %r33; add.s32 %r99, %r98, %r33; rem.s32 %r130, %r99, %r33; bra.uni BB0_17; BB0_16: mov.u32 %r100, 0; max.s32 %r130, %r23, %r100; BB0_17: mad.lo.s32 %r101, %r130, %r32, %r2; mad.lo.s32 %r102, %r101, %r31, %r1; cvt.s64.s32 %rd27, %r102; add.s64 %rd28, %rd1, %rd27; ld.global.u8 %rs15, [%rd28]; setp.gt.u16 %p16, %rs15, %rs1; cvt.u32.u16 %r103, %rs15; selp.b32 %r106, %r50, %r103, %p16; selp.b32 %r107, %r103, %r50, %p16; add.s32 %r108, %r107, 1; mul.lo.s32 %r109, %r108, %r107; shr.u32 %r110, %r109, 1; add.s32 %r111, %r110, %r106; mul.wide.s32 %rd30, %r111, 4; add.s64 %rd31, %rd9, %rd30; ld.global.f32 %f12, [%rd31]; add.f32 %f5, %f14, %f12; add.s32 %r27, %r3, 1; @%p15 bra BB0_19; rem.s32 %r112, %r27, %r33; add.s32 %r113, %r112, %r33; rem.s32 %r131, %r113, %r33; bra.uni BB0_20; BB0_19: add.s32 %r114, %r33, -1; min.s32 %r131, %r27, %r114; BB0_20: mad.lo.s32 %r115, %r131, %r32, %r2; mad.lo.s32 %r116, %r115, %r31, %r1; cvt.s64.s32 %rd32, %r116; add.s64 %rd33, %rd1, %rd32; ld.global.u8 %rs18, [%rd33]; setp.gt.u16 %p18, %rs18, %rs1; cvt.u32.u16 %r117, %rs18; selp.b32 %r120, %r50, %r117, %p18; selp.b32 %r121, %r117, %r50, %p18; add.s32 %r122, %r121, 1; mul.lo.s32 %r123, %r122, %r121; shr.u32 %r124, %r123, 1; add.s32 %r125, %r124, %r120; mul.wide.s32 %rd35, %r125, 4; add.s64 %rd36, %rd9, %rd35; ld.global.f32 %f13, [%rd36]; add.f32 %f14, %f5, %f13; BB0_21: cvta.to.global.u64 %rd37, %rd2; mul.wide.s32 %rd38, %r6, 4; add.s64 %rd39, %rd37, %rd38; st.global.f32 [%rd39], %f14; 
BB0_22: ret; } ` exchangedecode_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<19>; .reg .b16 %rs<26>; .reg .f32 %f<15>; .reg .b32 %r<215>; .reg .b64 %rd<46>; ld.param.u64 %rd1, [exchangedecode_param_0]; ld.param.u64 %rd2, [exchangedecode_param_1]; ld.param.u64 %rd3, [exchangedecode_param_2]; ld.param.u32 %r26, [exchangedecode_param_6]; ld.param.u32 %r27, [exchangedecode_param_7]; ld.param.u32 %r28, [exchangedecode_param_8]; ld.param.u8 %rs5, [exchangedecode_param_9]; mov.u32 %r29, %ctaid.x; mov.u32 %r30, %ntid.x; mov.u32 %r31, %tid.x; mad.lo.s32 %r32, %r30, %r29, %r31; mov.u32 %r33, %ntid.y; mov.u32 %r34, %ctaid.y; mov.u32 %r35, %tid.y; mad.lo.s32 %r36, %r33, %r34, %r35; mov.u32 %r37, %ntid.z; mov.u32 %r38, %ctaid.z; mov.u32 %r39, %tid.z; mad.lo.s32 %r40, %r37, %r38, %r39; setp.ge.s32 %p1, %r36, %r27; setp.ge.s32 %p2, %r32, %r26; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r40, %r28; or.pred %p5, %p3, %p4; @%p5 bra BB0_22; mad.lo.s32 %r49, %r40, %r27, %r36; mul.lo.s32 %r1, %r49, %r26; add.s32 %r54, %r1, %r32; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r54; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r2, %r32, -1; @%p6 bra BB0_3; rem.s32 %r55, %r2, %r26; add.s32 %r56, %r55, %r26; rem.s32 %r209, %r56, %r26; bra.uni BB0_4; BB0_3: mov.u32 %r57, 0; max.s32 %r209, %r2, %r57; BB0_4: add.s32 %r58, %r209, %r1; cvt.s64.s32 %rd8, %r58; add.s64 %rd9, %rd4, %rd8; ld.global.nc.u8 %rs6, [%rd9]; setp.gt.u16 %p7, %rs6, %rs1; cvt.u32.u16 %r59, %rs6; and.b32 %r60, %r59, 255; 
cvt.u32.u16 %r61, %rs1; and.b32 %r62, %r61, 255; selp.b32 %r63, %r62, %r60, %p7; selp.b32 %r64, %r60, %r62, %p7; add.s32 %r65, %r64, 1; mul.lo.s32 %r66, %r65, %r64; shr.u32 %r67, %r66, 1; add.s32 %r68, %r67, %r63; cvta.to.global.u64 %rd10, %rd2; mul.wide.s32 %rd11, %r68, 4; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f1, [%rd12]; add.s32 %r6, %r32, 1; @%p6 bra BB0_6; rem.s32 %r73, %r6, %r26; add.s32 %r74, %r73, %r26; rem.s32 %r210, %r74, %r26; bra.uni BB0_7; BB0_6: add.s32 %r75, %r26, -1; min.s32 %r210, %r6, %r75; BB0_7: add.s32 %r76, %r210, %r1; cvt.s64.s32 %rd14, %r76; add.s64 %rd15, %rd4, %rd14; ld.global.nc.u8 %rs9, [%rd15]; setp.gt.u16 %p9, %rs9, %rs1; cvt.u32.u16 %r77, %rs9; and.b32 %r78, %r77, 255; selp.b32 %r81, %r62, %r78, %p9; selp.b32 %r82, %r78, %r62, %p9; add.s32 %r83, %r82, 1; mul.lo.s32 %r84, %r83, %r82; shr.u32 %r85, %r84, 1; add.s32 %r86, %r85, %r81; mul.wide.s32 %rd17, %r86, 4; add.s64 %rd18, %rd10, %rd17; ld.global.nc.f32 %f8, [%rd18]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p10, %rs3, 0; add.s32 %r10, %r36, -1; @%p10 bra BB0_9; rem.s32 %r91, %r10, %r27; add.s32 %r92, %r91, %r27; rem.s32 %r211, %r92, %r27; bra.uni BB0_10; BB0_9: mov.u32 %r93, 0; max.s32 %r211, %r10, %r93; BB0_10: mad.lo.s32 %r98, %r40, %r27, %r211; mad.lo.s32 %r103, %r98, %r26, %r32; cvt.s64.s32 %rd20, %r103; add.s64 %rd21, %rd4, %rd20; ld.global.nc.u8 %rs12, [%rd21]; setp.gt.u16 %p11, %rs12, %rs1; cvt.u32.u16 %r104, %rs12; and.b32 %r105, %r104, 255; selp.b32 %r108, %r62, %r105, %p11; selp.b32 %r109, %r105, %r62, %p11; add.s32 %r110, %r109, 1; mul.lo.s32 %r111, %r110, %r109; shr.u32 %r112, %r111, 1; add.s32 %r113, %r112, %r108; mul.wide.s32 %rd23, %r113, 4; add.s64 %rd24, %rd10, %rd23; ld.global.nc.f32 %f10, [%rd24]; add.f32 %f3, %f2, %f10; add.s32 %r14, %r36, 1; @%p10 bra BB0_12; rem.s32 %r118, %r14, %r27; add.s32 %r119, %r118, %r27; rem.s32 %r212, %r119, %r27; bra.uni BB0_13; BB0_12: add.s32 %r120, %r27, -1; min.s32 %r212, 
%r14, %r120; BB0_13: mad.lo.s32 %r125, %r40, %r27, %r212; mad.lo.s32 %r130, %r125, %r26, %r32; cvt.s64.s32 %rd26, %r130; add.s64 %rd27, %rd4, %rd26; ld.global.nc.u8 %rs16, [%rd27]; setp.gt.u16 %p13, %rs16, %rs1; cvt.u32.u16 %r131, %rs16; and.b32 %r132, %r131, 255; selp.b32 %r135, %r62, %r132, %p13; selp.b32 %r136, %r132, %r62, %p13; add.s32 %r137, %r136, 1; mul.lo.s32 %r138, %r137, %r136; shr.u32 %r139, %r138, 1; add.s32 %r140, %r139, %r135; mul.wide.s32 %rd29, %r140, 4; add.s64 %rd30, %rd10, %rd29; ld.global.nc.f32 %f11, [%rd30]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p14, %r28, 1; @%p14 bra BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p15, %rs4, 0; add.s32 %r18, %r40, -1; @%p15 bra BB0_16; rem.s32 %r145, %r18, %r28; add.s32 %r146, %r145, %r28; rem.s32 %r213, %r146, %r28; bra.uni BB0_17; BB0_16: mov.u32 %r147, 0; max.s32 %r213, %r18, %r147; BB0_17: mad.lo.s32 %r152, %r213, %r27, %r36; mad.lo.s32 %r157, %r152, %r26, %r32; cvt.s64.s32 %rd32, %r157; add.s64 %rd33, %rd4, %rd32; ld.global.nc.u8 %rs19, [%rd33]; setp.gt.u16 %p16, %rs19, %rs1; cvt.u32.u16 %r158, %rs19; and.b32 %r159, %r158, 255; selp.b32 %r162, %r62, %r159, %p16; selp.b32 %r163, %r159, %r62, %p16; add.s32 %r164, %r163, 1; mul.lo.s32 %r165, %r164, %r163; shr.u32 %r166, %r165, 1; add.s32 %r167, %r166, %r162; mul.wide.s32 %rd35, %r167, 4; add.s64 %rd36, %rd10, %rd35; ld.global.nc.f32 %f12, [%rd36]; add.f32 %f5, %f14, %f12; add.s32 %r22, %r40, 1; @%p15 bra BB0_19; rem.s32 %r172, %r22, %r28; add.s32 %r173, %r172, %r28; rem.s32 %r214, %r173, %r28; bra.uni BB0_20; BB0_19: add.s32 %r174, %r28, -1; min.s32 %r214, %r22, %r174; BB0_20: mad.lo.s32 %r179, %r214, %r27, %r36; mad.lo.s32 %r184, %r179, %r26, %r32; cvt.s64.s32 %rd38, %r184; add.s64 %rd39, %rd4, %rd38; ld.global.nc.u8 %rs23, [%rd39]; setp.gt.u16 %p18, %rs23, %rs1; cvt.u32.u16 %r185, %rs23; and.b32 %r186, %r185, 255; selp.b32 %r189, %r62, %r186, %p18; selp.b32 %r190, %r186, %r62, %p18; add.s32 %r191, %r190, 1; mul.lo.s32 %r192, %r191, %r190; shr.u32 %r193, 
%r192, 1; add.s32 %r194, %r193, %r189; mul.wide.s32 %rd41, %r194, 4; add.s64 %rd42, %rd10, %rd41; ld.global.nc.f32 %f13, [%rd42]; add.f32 %f14, %f5, %f13; BB0_21: mad.lo.s32 %r208, %r49, %r26, %r32; cvta.to.global.u64 %rd43, %rd1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd43, %rd44; st.global.f32 [%rd45], %f14; BB0_22: ret; } ` exchangedecode_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<19>; .reg .b16 %rs<26>; .reg .f32 %f<15>; .reg .b32 %r<215>; .reg .b64 %rd<46>; ld.param.u64 %rd1, [exchangedecode_param_0]; ld.param.u64 %rd2, [exchangedecode_param_1]; ld.param.u64 %rd3, [exchangedecode_param_2]; ld.param.u32 %r26, [exchangedecode_param_6]; ld.param.u32 %r27, [exchangedecode_param_7]; ld.param.u32 %r28, [exchangedecode_param_8]; ld.param.u8 %rs5, [exchangedecode_param_9]; mov.u32 %r29, %ctaid.x; mov.u32 %r30, %ntid.x; mov.u32 %r31, %tid.x; mad.lo.s32 %r32, %r30, %r29, %r31; mov.u32 %r33, %ntid.y; mov.u32 %r34, %ctaid.y; mov.u32 %r35, %tid.y; mad.lo.s32 %r36, %r33, %r34, %r35; mov.u32 %r37, %ntid.z; mov.u32 %r38, %ctaid.z; mov.u32 %r39, %tid.z; mad.lo.s32 %r40, %r37, %r38, %r39; setp.ge.s32 %p1, %r36, %r27; setp.ge.s32 %p2, %r32, %r26; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r40, %r28; or.pred %p5, %p3, %p4; @%p5 bra BB0_22; mad.lo.s32 %r49, %r40, %r27, %r36; mul.lo.s32 %r1, %r49, %r26; add.s32 %r54, %r1, %r32; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r54; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r2, %r32, -1; @%p6 bra BB0_3; rem.s32 %r55, 
%r2, %r26; add.s32 %r56, %r55, %r26; rem.s32 %r209, %r56, %r26; bra.uni BB0_4; BB0_3: mov.u32 %r57, 0; max.s32 %r209, %r2, %r57; BB0_4: add.s32 %r58, %r209, %r1; cvt.s64.s32 %rd8, %r58; add.s64 %rd9, %rd4, %rd8; ld.global.nc.u8 %rs6, [%rd9]; setp.gt.u16 %p7, %rs6, %rs1; cvt.u32.u16 %r59, %rs6; and.b32 %r60, %r59, 255; cvt.u32.u16 %r61, %rs1; and.b32 %r62, %r61, 255; selp.b32 %r63, %r62, %r60, %p7; selp.b32 %r64, %r60, %r62, %p7; add.s32 %r65, %r64, 1; mul.lo.s32 %r66, %r65, %r64; shr.u32 %r67, %r66, 1; add.s32 %r68, %r67, %r63; cvta.to.global.u64 %rd10, %rd2; mul.wide.s32 %rd11, %r68, 4; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f1, [%rd12]; add.s32 %r6, %r32, 1; @%p6 bra BB0_6; rem.s32 %r73, %r6, %r26; add.s32 %r74, %r73, %r26; rem.s32 %r210, %r74, %r26; bra.uni BB0_7; BB0_6: add.s32 %r75, %r26, -1; min.s32 %r210, %r6, %r75; BB0_7: add.s32 %r76, %r210, %r1; cvt.s64.s32 %rd14, %r76; add.s64 %rd15, %rd4, %rd14; ld.global.nc.u8 %rs9, [%rd15]; setp.gt.u16 %p9, %rs9, %rs1; cvt.u32.u16 %r77, %rs9; and.b32 %r78, %r77, 255; selp.b32 %r81, %r62, %r78, %p9; selp.b32 %r82, %r78, %r62, %p9; add.s32 %r83, %r82, 1; mul.lo.s32 %r84, %r83, %r82; shr.u32 %r85, %r84, 1; add.s32 %r86, %r85, %r81; mul.wide.s32 %rd17, %r86, 4; add.s64 %rd18, %rd10, %rd17; ld.global.nc.f32 %f8, [%rd18]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p10, %rs3, 0; add.s32 %r10, %r36, -1; @%p10 bra BB0_9; rem.s32 %r91, %r10, %r27; add.s32 %r92, %r91, %r27; rem.s32 %r211, %r92, %r27; bra.uni BB0_10; BB0_9: mov.u32 %r93, 0; max.s32 %r211, %r10, %r93; BB0_10: mad.lo.s32 %r98, %r40, %r27, %r211; mad.lo.s32 %r103, %r98, %r26, %r32; cvt.s64.s32 %rd20, %r103; add.s64 %rd21, %rd4, %rd20; ld.global.nc.u8 %rs12, [%rd21]; setp.gt.u16 %p11, %rs12, %rs1; cvt.u32.u16 %r104, %rs12; and.b32 %r105, %r104, 255; selp.b32 %r108, %r62, %r105, %p11; selp.b32 %r109, %r105, %r62, %p11; add.s32 %r110, %r109, 1; mul.lo.s32 %r111, %r110, %r109; shr.u32 %r112, %r111, 1; add.s32 %r113, 
%r112, %r108; mul.wide.s32 %rd23, %r113, 4; add.s64 %rd24, %rd10, %rd23; ld.global.nc.f32 %f10, [%rd24]; add.f32 %f3, %f2, %f10; add.s32 %r14, %r36, 1; @%p10 bra BB0_12; rem.s32 %r118, %r14, %r27; add.s32 %r119, %r118, %r27; rem.s32 %r212, %r119, %r27; bra.uni BB0_13; BB0_12: add.s32 %r120, %r27, -1; min.s32 %r212, %r14, %r120; BB0_13: mad.lo.s32 %r125, %r40, %r27, %r212; mad.lo.s32 %r130, %r125, %r26, %r32; cvt.s64.s32 %rd26, %r130; add.s64 %rd27, %rd4, %rd26; ld.global.nc.u8 %rs16, [%rd27]; setp.gt.u16 %p13, %rs16, %rs1; cvt.u32.u16 %r131, %rs16; and.b32 %r132, %r131, 255; selp.b32 %r135, %r62, %r132, %p13; selp.b32 %r136, %r132, %r62, %p13; add.s32 %r137, %r136, 1; mul.lo.s32 %r138, %r137, %r136; shr.u32 %r139, %r138, 1; add.s32 %r140, %r139, %r135; mul.wide.s32 %rd29, %r140, 4; add.s64 %rd30, %rd10, %rd29; ld.global.nc.f32 %f11, [%rd30]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p14, %r28, 1; @%p14 bra BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p15, %rs4, 0; add.s32 %r18, %r40, -1; @%p15 bra BB0_16; rem.s32 %r145, %r18, %r28; add.s32 %r146, %r145, %r28; rem.s32 %r213, %r146, %r28; bra.uni BB0_17; BB0_16: mov.u32 %r147, 0; max.s32 %r213, %r18, %r147; BB0_17: mad.lo.s32 %r152, %r213, %r27, %r36; mad.lo.s32 %r157, %r152, %r26, %r32; cvt.s64.s32 %rd32, %r157; add.s64 %rd33, %rd4, %rd32; ld.global.nc.u8 %rs19, [%rd33]; setp.gt.u16 %p16, %rs19, %rs1; cvt.u32.u16 %r158, %rs19; and.b32 %r159, %r158, 255; selp.b32 %r162, %r62, %r159, %p16; selp.b32 %r163, %r159, %r62, %p16; add.s32 %r164, %r163, 1; mul.lo.s32 %r165, %r164, %r163; shr.u32 %r166, %r165, 1; add.s32 %r167, %r166, %r162; mul.wide.s32 %rd35, %r167, 4; add.s64 %rd36, %rd10, %rd35; ld.global.nc.f32 %f12, [%rd36]; add.f32 %f5, %f14, %f12; add.s32 %r22, %r40, 1; @%p15 bra BB0_19; rem.s32 %r172, %r22, %r28; add.s32 %r173, %r172, %r28; rem.s32 %r214, %r173, %r28; bra.uni BB0_20; BB0_19: add.s32 %r174, %r28, -1; min.s32 %r214, %r22, %r174; BB0_20: mad.lo.s32 %r179, %r214, %r27, %r36; mad.lo.s32 %r184, %r179, %r26, 
%r32; cvt.s64.s32 %rd38, %r184; add.s64 %rd39, %rd4, %rd38; ld.global.nc.u8 %rs23, [%rd39]; setp.gt.u16 %p18, %rs23, %rs1; cvt.u32.u16 %r185, %rs23; and.b32 %r186, %r185, 255; selp.b32 %r189, %r62, %r186, %p18; selp.b32 %r190, %r186, %r62, %p18; add.s32 %r191, %r190, 1; mul.lo.s32 %r192, %r191, %r190; shr.u32 %r193, %r192, 1; add.s32 %r194, %r193, %r189; mul.wide.s32 %rd41, %r194, 4; add.s64 %rd42, %rd10, %rd41; ld.global.nc.f32 %f13, [%rd42]; add.f32 %f14, %f5, %f13; BB0_21: mad.lo.s32 %r208, %r49, %r26, %r32; cvta.to.global.u64 %rd43, %rd1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd43, %rd44; st.global.f32 [%rd45], %f14; BB0_22: ret; } ` exchangedecode_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<19>; .reg .b16 %rs<26>; .reg .f32 %f<15>; .reg .b32 %r<215>; .reg .b64 %rd<46>; ld.param.u64 %rd1, [exchangedecode_param_0]; ld.param.u64 %rd2, [exchangedecode_param_1]; ld.param.u64 %rd3, [exchangedecode_param_2]; ld.param.u32 %r26, [exchangedecode_param_6]; ld.param.u32 %r27, [exchangedecode_param_7]; ld.param.u32 %r28, [exchangedecode_param_8]; ld.param.u8 %rs5, [exchangedecode_param_9]; mov.u32 %r29, %ctaid.x; mov.u32 %r30, %ntid.x; mov.u32 %r31, %tid.x; mad.lo.s32 %r32, %r30, %r29, %r31; mov.u32 %r33, %ntid.y; mov.u32 %r34, %ctaid.y; mov.u32 %r35, %tid.y; mad.lo.s32 %r36, %r33, %r34, %r35; mov.u32 %r37, %ntid.z; mov.u32 %r38, %ctaid.z; mov.u32 %r39, %tid.z; mad.lo.s32 %r40, %r37, %r38, %r39; setp.ge.s32 %p1, %r36, %r27; setp.ge.s32 %p2, %r32, %r26; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r40, %r28; or.pred %p5, %p3, %p4; 
@%p5 bra BB0_22; mad.lo.s32 %r49, %r40, %r27, %r36; mul.lo.s32 %r1, %r49, %r26; add.s32 %r54, %r1, %r32; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r54; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r2, %r32, -1; @%p6 bra BB0_3; rem.s32 %r55, %r2, %r26; add.s32 %r56, %r55, %r26; rem.s32 %r209, %r56, %r26; bra.uni BB0_4; BB0_3: mov.u32 %r57, 0; max.s32 %r209, %r2, %r57; BB0_4: add.s32 %r58, %r209, %r1; cvt.s64.s32 %rd8, %r58; add.s64 %rd9, %rd4, %rd8; ld.global.nc.u8 %rs6, [%rd9]; setp.gt.u16 %p7, %rs6, %rs1; cvt.u32.u16 %r59, %rs6; and.b32 %r60, %r59, 255; cvt.u32.u16 %r61, %rs1; and.b32 %r62, %r61, 255; selp.b32 %r63, %r62, %r60, %p7; selp.b32 %r64, %r60, %r62, %p7; add.s32 %r65, %r64, 1; mul.lo.s32 %r66, %r65, %r64; shr.u32 %r67, %r66, 1; add.s32 %r68, %r67, %r63; cvta.to.global.u64 %rd10, %rd2; mul.wide.s32 %rd11, %r68, 4; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f1, [%rd12]; add.s32 %r6, %r32, 1; @%p6 bra BB0_6; rem.s32 %r73, %r6, %r26; add.s32 %r74, %r73, %r26; rem.s32 %r210, %r74, %r26; bra.uni BB0_7; BB0_6: add.s32 %r75, %r26, -1; min.s32 %r210, %r6, %r75; BB0_7: add.s32 %r76, %r210, %r1; cvt.s64.s32 %rd14, %r76; add.s64 %rd15, %rd4, %rd14; ld.global.nc.u8 %rs9, [%rd15]; setp.gt.u16 %p9, %rs9, %rs1; cvt.u32.u16 %r77, %rs9; and.b32 %r78, %r77, 255; selp.b32 %r81, %r62, %r78, %p9; selp.b32 %r82, %r78, %r62, %p9; add.s32 %r83, %r82, 1; mul.lo.s32 %r84, %r83, %r82; shr.u32 %r85, %r84, 1; add.s32 %r86, %r85, %r81; mul.wide.s32 %rd17, %r86, 4; add.s64 %rd18, %rd10, %rd17; ld.global.nc.f32 %f8, [%rd18]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p10, %rs3, 0; add.s32 %r10, %r36, -1; @%p10 bra BB0_9; rem.s32 %r91, %r10, %r27; add.s32 %r92, %r91, %r27; rem.s32 %r211, %r92, %r27; bra.uni BB0_10; BB0_9: mov.u32 %r93, 0; max.s32 %r211, %r10, %r93; BB0_10: mad.lo.s32 %r98, %r40, %r27, %r211; mad.lo.s32 %r103, %r98, %r26, %r32; cvt.s64.s32 %rd20, 
%r103; add.s64 %rd21, %rd4, %rd20; ld.global.nc.u8 %rs12, [%rd21]; setp.gt.u16 %p11, %rs12, %rs1; cvt.u32.u16 %r104, %rs12; and.b32 %r105, %r104, 255; selp.b32 %r108, %r62, %r105, %p11; selp.b32 %r109, %r105, %r62, %p11; add.s32 %r110, %r109, 1; mul.lo.s32 %r111, %r110, %r109; shr.u32 %r112, %r111, 1; add.s32 %r113, %r112, %r108; mul.wide.s32 %rd23, %r113, 4; add.s64 %rd24, %rd10, %rd23; ld.global.nc.f32 %f10, [%rd24]; add.f32 %f3, %f2, %f10; add.s32 %r14, %r36, 1; @%p10 bra BB0_12; rem.s32 %r118, %r14, %r27; add.s32 %r119, %r118, %r27; rem.s32 %r212, %r119, %r27; bra.uni BB0_13; BB0_12: add.s32 %r120, %r27, -1; min.s32 %r212, %r14, %r120; BB0_13: mad.lo.s32 %r125, %r40, %r27, %r212; mad.lo.s32 %r130, %r125, %r26, %r32; cvt.s64.s32 %rd26, %r130; add.s64 %rd27, %rd4, %rd26; ld.global.nc.u8 %rs16, [%rd27]; setp.gt.u16 %p13, %rs16, %rs1; cvt.u32.u16 %r131, %rs16; and.b32 %r132, %r131, 255; selp.b32 %r135, %r62, %r132, %p13; selp.b32 %r136, %r132, %r62, %p13; add.s32 %r137, %r136, 1; mul.lo.s32 %r138, %r137, %r136; shr.u32 %r139, %r138, 1; add.s32 %r140, %r139, %r135; mul.wide.s32 %rd29, %r140, 4; add.s64 %rd30, %rd10, %rd29; ld.global.nc.f32 %f11, [%rd30]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p14, %r28, 1; @%p14 bra BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p15, %rs4, 0; add.s32 %r18, %r40, -1; @%p15 bra BB0_16; rem.s32 %r145, %r18, %r28; add.s32 %r146, %r145, %r28; rem.s32 %r213, %r146, %r28; bra.uni BB0_17; BB0_16: mov.u32 %r147, 0; max.s32 %r213, %r18, %r147; BB0_17: mad.lo.s32 %r152, %r213, %r27, %r36; mad.lo.s32 %r157, %r152, %r26, %r32; cvt.s64.s32 %rd32, %r157; add.s64 %rd33, %rd4, %rd32; ld.global.nc.u8 %rs19, [%rd33]; setp.gt.u16 %p16, %rs19, %rs1; cvt.u32.u16 %r158, %rs19; and.b32 %r159, %r158, 255; selp.b32 %r162, %r62, %r159, %p16; selp.b32 %r163, %r159, %r62, %p16; add.s32 %r164, %r163, 1; mul.lo.s32 %r165, %r164, %r163; shr.u32 %r166, %r165, 1; add.s32 %r167, %r166, %r162; mul.wide.s32 %rd35, %r167, 4; add.s64 %rd36, %rd10, %rd35; ld.global.nc.f32 
%f12, [%rd36]; add.f32 %f5, %f14, %f12; add.s32 %r22, %r40, 1; @%p15 bra BB0_19; rem.s32 %r172, %r22, %r28; add.s32 %r173, %r172, %r28; rem.s32 %r214, %r173, %r28; bra.uni BB0_20; BB0_19: add.s32 %r174, %r28, -1; min.s32 %r214, %r22, %r174; BB0_20: mad.lo.s32 %r179, %r214, %r27, %r36; mad.lo.s32 %r184, %r179, %r26, %r32; cvt.s64.s32 %rd38, %r184; add.s64 %rd39, %rd4, %rd38; ld.global.nc.u8 %rs23, [%rd39]; setp.gt.u16 %p18, %rs23, %rs1; cvt.u32.u16 %r185, %rs23; and.b32 %r186, %r185, 255; selp.b32 %r189, %r62, %r186, %p18; selp.b32 %r190, %r186, %r62, %p18; add.s32 %r191, %r190, 1; mul.lo.s32 %r192, %r191, %r190; shr.u32 %r193, %r192, 1; add.s32 %r194, %r193, %r189; mul.wide.s32 %rd41, %r194, 4; add.s64 %rd42, %rd10, %rd41; ld.global.nc.f32 %f13, [%rd42]; add.f32 %f14, %f5, %f13; BB0_21: mad.lo.s32 %r208, %r49, %r26, %r32; cvta.to.global.u64 %rd43, %rd1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd43, %rd44; st.global.f32 [%rd45], %f14; BB0_22: ret; } ` exchangedecode_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<19>; .reg .b16 %rs<26>; .reg .f32 %f<15>; .reg .b32 %r<215>; .reg .b64 %rd<46>; ld.param.u64 %rd1, [exchangedecode_param_0]; ld.param.u64 %rd2, [exchangedecode_param_1]; ld.param.u64 %rd3, [exchangedecode_param_2]; ld.param.u32 %r26, [exchangedecode_param_6]; ld.param.u32 %r27, [exchangedecode_param_7]; ld.param.u32 %r28, [exchangedecode_param_8]; ld.param.u8 %rs5, [exchangedecode_param_9]; mov.u32 %r29, %ctaid.x; mov.u32 %r30, %ntid.x; mov.u32 %r31, %tid.x; mad.lo.s32 %r32, %r30, %r29, %r31; mov.u32 %r33, %ntid.y; 
mov.u32 %r34, %ctaid.y; mov.u32 %r35, %tid.y; mad.lo.s32 %r36, %r33, %r34, %r35; mov.u32 %r37, %ntid.z; mov.u32 %r38, %ctaid.z; mov.u32 %r39, %tid.z; mad.lo.s32 %r40, %r37, %r38, %r39; setp.ge.s32 %p1, %r36, %r27; setp.ge.s32 %p2, %r32, %r26; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r40, %r28; or.pred %p5, %p3, %p4; @%p5 bra BB0_22; mad.lo.s32 %r49, %r40, %r27, %r36; mul.lo.s32 %r1, %r49, %r26; add.s32 %r54, %r1, %r32; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r54; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r2, %r32, -1; @%p6 bra BB0_3; rem.s32 %r55, %r2, %r26; add.s32 %r56, %r55, %r26; rem.s32 %r209, %r56, %r26; bra.uni BB0_4; BB0_3: mov.u32 %r57, 0; max.s32 %r209, %r2, %r57; BB0_4: add.s32 %r58, %r209, %r1; cvt.s64.s32 %rd8, %r58; add.s64 %rd9, %rd4, %rd8; ld.global.nc.u8 %rs6, [%rd9]; setp.gt.u16 %p7, %rs6, %rs1; cvt.u32.u16 %r59, %rs6; and.b32 %r60, %r59, 255; cvt.u32.u16 %r61, %rs1; and.b32 %r62, %r61, 255; selp.b32 %r63, %r62, %r60, %p7; selp.b32 %r64, %r60, %r62, %p7; add.s32 %r65, %r64, 1; mul.lo.s32 %r66, %r65, %r64; shr.u32 %r67, %r66, 1; add.s32 %r68, %r67, %r63; cvta.to.global.u64 %rd10, %rd2; mul.wide.s32 %rd11, %r68, 4; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f1, [%rd12]; add.s32 %r6, %r32, 1; @%p6 bra BB0_6; rem.s32 %r73, %r6, %r26; add.s32 %r74, %r73, %r26; rem.s32 %r210, %r74, %r26; bra.uni BB0_7; BB0_6: add.s32 %r75, %r26, -1; min.s32 %r210, %r6, %r75; BB0_7: add.s32 %r76, %r210, %r1; cvt.s64.s32 %rd14, %r76; add.s64 %rd15, %rd4, %rd14; ld.global.nc.u8 %rs9, [%rd15]; setp.gt.u16 %p9, %rs9, %rs1; cvt.u32.u16 %r77, %rs9; and.b32 %r78, %r77, 255; selp.b32 %r81, %r62, %r78, %p9; selp.b32 %r82, %r78, %r62, %p9; add.s32 %r83, %r82, 1; mul.lo.s32 %r84, %r83, %r82; shr.u32 %r85, %r84, 1; add.s32 %r86, %r85, %r81; mul.wide.s32 %rd17, %r86, 4; add.s64 %rd18, %rd10, %rd17; ld.global.nc.f32 %f8, [%rd18]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, 
%rs5, 2; setp.eq.s16 %p10, %rs3, 0; add.s32 %r10, %r36, -1; @%p10 bra BB0_9; rem.s32 %r91, %r10, %r27; add.s32 %r92, %r91, %r27; rem.s32 %r211, %r92, %r27; bra.uni BB0_10; BB0_9: mov.u32 %r93, 0; max.s32 %r211, %r10, %r93; BB0_10: mad.lo.s32 %r98, %r40, %r27, %r211; mad.lo.s32 %r103, %r98, %r26, %r32; cvt.s64.s32 %rd20, %r103; add.s64 %rd21, %rd4, %rd20; ld.global.nc.u8 %rs12, [%rd21]; setp.gt.u16 %p11, %rs12, %rs1; cvt.u32.u16 %r104, %rs12; and.b32 %r105, %r104, 255; selp.b32 %r108, %r62, %r105, %p11; selp.b32 %r109, %r105, %r62, %p11; add.s32 %r110, %r109, 1; mul.lo.s32 %r111, %r110, %r109; shr.u32 %r112, %r111, 1; add.s32 %r113, %r112, %r108; mul.wide.s32 %rd23, %r113, 4; add.s64 %rd24, %rd10, %rd23; ld.global.nc.f32 %f10, [%rd24]; add.f32 %f3, %f2, %f10; add.s32 %r14, %r36, 1; @%p10 bra BB0_12; rem.s32 %r118, %r14, %r27; add.s32 %r119, %r118, %r27; rem.s32 %r212, %r119, %r27; bra.uni BB0_13; BB0_12: add.s32 %r120, %r27, -1; min.s32 %r212, %r14, %r120; BB0_13: mad.lo.s32 %r125, %r40, %r27, %r212; mad.lo.s32 %r130, %r125, %r26, %r32; cvt.s64.s32 %rd26, %r130; add.s64 %rd27, %rd4, %rd26; ld.global.nc.u8 %rs16, [%rd27]; setp.gt.u16 %p13, %rs16, %rs1; cvt.u32.u16 %r131, %rs16; and.b32 %r132, %r131, 255; selp.b32 %r135, %r62, %r132, %p13; selp.b32 %r136, %r132, %r62, %p13; add.s32 %r137, %r136, 1; mul.lo.s32 %r138, %r137, %r136; shr.u32 %r139, %r138, 1; add.s32 %r140, %r139, %r135; mul.wide.s32 %rd29, %r140, 4; add.s64 %rd30, %rd10, %rd29; ld.global.nc.f32 %f11, [%rd30]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p14, %r28, 1; @%p14 bra BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p15, %rs4, 0; add.s32 %r18, %r40, -1; @%p15 bra BB0_16; rem.s32 %r145, %r18, %r28; add.s32 %r146, %r145, %r28; rem.s32 %r213, %r146, %r28; bra.uni BB0_17; BB0_16: mov.u32 %r147, 0; max.s32 %r213, %r18, %r147; BB0_17: mad.lo.s32 %r152, %r213, %r27, %r36; mad.lo.s32 %r157, %r152, %r26, %r32; cvt.s64.s32 %rd32, %r157; add.s64 %rd33, %rd4, %rd32; ld.global.nc.u8 %rs19, [%rd33]; setp.gt.u16 %p16, 
%rs19, %rs1; cvt.u32.u16 %r158, %rs19; and.b32 %r159, %r158, 255; selp.b32 %r162, %r62, %r159, %p16; selp.b32 %r163, %r159, %r62, %p16; add.s32 %r164, %r163, 1; mul.lo.s32 %r165, %r164, %r163; shr.u32 %r166, %r165, 1; add.s32 %r167, %r166, %r162; mul.wide.s32 %rd35, %r167, 4; add.s64 %rd36, %rd10, %rd35; ld.global.nc.f32 %f12, [%rd36]; add.f32 %f5, %f14, %f12; add.s32 %r22, %r40, 1; @%p15 bra BB0_19; rem.s32 %r172, %r22, %r28; add.s32 %r173, %r172, %r28; rem.s32 %r214, %r173, %r28; bra.uni BB0_20; BB0_19: add.s32 %r174, %r28, -1; min.s32 %r214, %r22, %r174; BB0_20: mad.lo.s32 %r179, %r214, %r27, %r36; mad.lo.s32 %r184, %r179, %r26, %r32; cvt.s64.s32 %rd38, %r184; add.s64 %rd39, %rd4, %rd38; ld.global.nc.u8 %rs23, [%rd39]; setp.gt.u16 %p18, %rs23, %rs1; cvt.u32.u16 %r185, %rs23; and.b32 %r186, %r185, 255; selp.b32 %r189, %r62, %r186, %p18; selp.b32 %r190, %r186, %r62, %p18; add.s32 %r191, %r190, 1; mul.lo.s32 %r192, %r191, %r190; shr.u32 %r193, %r192, 1; add.s32 %r194, %r193, %r189; mul.wide.s32 %rd41, %r194, 4; add.s64 %rd42, %rd10, %rd41; ld.global.nc.f32 %f13, [%rd42]; add.f32 %f14, %f5, %f13; BB0_21: mad.lo.s32 %r208, %r49, %r26, %r32; cvta.to.global.u64 %rd43, %rd1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd43, %rd44; st.global.f32 [%rd45], %f14; BB0_22: ret; } ` exchangedecode_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<19>; .reg .b16 %rs<26>; .reg .f32 %f<15>; .reg .b32 %r<215>; .reg .b64 %rd<46>; ld.param.u64 %rd1, [exchangedecode_param_0]; ld.param.u64 %rd2, [exchangedecode_param_1]; ld.param.u64 %rd3, 
[exchangedecode_param_2]; ld.param.u32 %r26, [exchangedecode_param_6]; ld.param.u32 %r27, [exchangedecode_param_7]; ld.param.u32 %r28, [exchangedecode_param_8]; ld.param.u8 %rs5, [exchangedecode_param_9]; mov.u32 %r29, %ctaid.x; mov.u32 %r30, %ntid.x; mov.u32 %r31, %tid.x; mad.lo.s32 %r32, %r30, %r29, %r31; mov.u32 %r33, %ntid.y; mov.u32 %r34, %ctaid.y; mov.u32 %r35, %tid.y; mad.lo.s32 %r36, %r33, %r34, %r35; mov.u32 %r37, %ntid.z; mov.u32 %r38, %ctaid.z; mov.u32 %r39, %tid.z; mad.lo.s32 %r40, %r37, %r38, %r39; setp.ge.s32 %p1, %r36, %r27; setp.ge.s32 %p2, %r32, %r26; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r40, %r28; or.pred %p5, %p3, %p4; @%p5 bra BB0_22; mad.lo.s32 %r49, %r40, %r27, %r36; mul.lo.s32 %r1, %r49, %r26; add.s32 %r54, %r1, %r32; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r54; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r2, %r32, -1; @%p6 bra BB0_3; rem.s32 %r55, %r2, %r26; add.s32 %r56, %r55, %r26; rem.s32 %r209, %r56, %r26; bra.uni BB0_4; BB0_3: mov.u32 %r57, 0; max.s32 %r209, %r2, %r57; BB0_4: add.s32 %r58, %r209, %r1; cvt.s64.s32 %rd8, %r58; add.s64 %rd9, %rd4, %rd8; ld.global.nc.u8 %rs6, [%rd9]; setp.gt.u16 %p7, %rs6, %rs1; cvt.u32.u16 %r59, %rs6; and.b32 %r60, %r59, 255; cvt.u32.u16 %r61, %rs1; and.b32 %r62, %r61, 255; selp.b32 %r63, %r62, %r60, %p7; selp.b32 %r64, %r60, %r62, %p7; add.s32 %r65, %r64, 1; mul.lo.s32 %r66, %r65, %r64; shr.u32 %r67, %r66, 1; add.s32 %r68, %r67, %r63; cvta.to.global.u64 %rd10, %rd2; mul.wide.s32 %rd11, %r68, 4; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f1, [%rd12]; add.s32 %r6, %r32, 1; @%p6 bra BB0_6; rem.s32 %r73, %r6, %r26; add.s32 %r74, %r73, %r26; rem.s32 %r210, %r74, %r26; bra.uni BB0_7; BB0_6: add.s32 %r75, %r26, -1; min.s32 %r210, %r6, %r75; BB0_7: add.s32 %r76, %r210, %r1; cvt.s64.s32 %rd14, %r76; add.s64 %rd15, %rd4, %rd14; ld.global.nc.u8 %rs9, [%rd15]; setp.gt.u16 %p9, %rs9, %rs1; cvt.u32.u16 %r77, %rs9; and.b32 %r78, 
%r77, 255; selp.b32 %r81, %r62, %r78, %p9; selp.b32 %r82, %r78, %r62, %p9; add.s32 %r83, %r82, 1; mul.lo.s32 %r84, %r83, %r82; shr.u32 %r85, %r84, 1; add.s32 %r86, %r85, %r81; mul.wide.s32 %rd17, %r86, 4; add.s64 %rd18, %rd10, %rd17; ld.global.nc.f32 %f8, [%rd18]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p10, %rs3, 0; add.s32 %r10, %r36, -1; @%p10 bra BB0_9; rem.s32 %r91, %r10, %r27; add.s32 %r92, %r91, %r27; rem.s32 %r211, %r92, %r27; bra.uni BB0_10; BB0_9: mov.u32 %r93, 0; max.s32 %r211, %r10, %r93; BB0_10: mad.lo.s32 %r98, %r40, %r27, %r211; mad.lo.s32 %r103, %r98, %r26, %r32; cvt.s64.s32 %rd20, %r103; add.s64 %rd21, %rd4, %rd20; ld.global.nc.u8 %rs12, [%rd21]; setp.gt.u16 %p11, %rs12, %rs1; cvt.u32.u16 %r104, %rs12; and.b32 %r105, %r104, 255; selp.b32 %r108, %r62, %r105, %p11; selp.b32 %r109, %r105, %r62, %p11; add.s32 %r110, %r109, 1; mul.lo.s32 %r111, %r110, %r109; shr.u32 %r112, %r111, 1; add.s32 %r113, %r112, %r108; mul.wide.s32 %rd23, %r113, 4; add.s64 %rd24, %rd10, %rd23; ld.global.nc.f32 %f10, [%rd24]; add.f32 %f3, %f2, %f10; add.s32 %r14, %r36, 1; @%p10 bra BB0_12; rem.s32 %r118, %r14, %r27; add.s32 %r119, %r118, %r27; rem.s32 %r212, %r119, %r27; bra.uni BB0_13; BB0_12: add.s32 %r120, %r27, -1; min.s32 %r212, %r14, %r120; BB0_13: mad.lo.s32 %r125, %r40, %r27, %r212; mad.lo.s32 %r130, %r125, %r26, %r32; cvt.s64.s32 %rd26, %r130; add.s64 %rd27, %rd4, %rd26; ld.global.nc.u8 %rs16, [%rd27]; setp.gt.u16 %p13, %rs16, %rs1; cvt.u32.u16 %r131, %rs16; and.b32 %r132, %r131, 255; selp.b32 %r135, %r62, %r132, %p13; selp.b32 %r136, %r132, %r62, %p13; add.s32 %r137, %r136, 1; mul.lo.s32 %r138, %r137, %r136; shr.u32 %r139, %r138, 1; add.s32 %r140, %r139, %r135; mul.wide.s32 %rd29, %r140, 4; add.s64 %rd30, %rd10, %rd29; ld.global.nc.f32 %f11, [%rd30]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p14, %r28, 1; @%p14 bra BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p15, %rs4, 0; add.s32 %r18, %r40, -1; @%p15 bra BB0_16; rem.s32 
%r145, %r18, %r28; add.s32 %r146, %r145, %r28; rem.s32 %r213, %r146, %r28; bra.uni BB0_17; BB0_16: mov.u32 %r147, 0; max.s32 %r213, %r18, %r147; BB0_17: mad.lo.s32 %r152, %r213, %r27, %r36; mad.lo.s32 %r157, %r152, %r26, %r32; cvt.s64.s32 %rd32, %r157; add.s64 %rd33, %rd4, %rd32; ld.global.nc.u8 %rs19, [%rd33]; setp.gt.u16 %p16, %rs19, %rs1; cvt.u32.u16 %r158, %rs19; and.b32 %r159, %r158, 255; selp.b32 %r162, %r62, %r159, %p16; selp.b32 %r163, %r159, %r62, %p16; add.s32 %r164, %r163, 1; mul.lo.s32 %r165, %r164, %r163; shr.u32 %r166, %r165, 1; add.s32 %r167, %r166, %r162; mul.wide.s32 %rd35, %r167, 4; add.s64 %rd36, %rd10, %rd35; ld.global.nc.f32 %f12, [%rd36]; add.f32 %f5, %f14, %f12; add.s32 %r22, %r40, 1; @%p15 bra BB0_19; rem.s32 %r172, %r22, %r28; add.s32 %r173, %r172, %r28; rem.s32 %r214, %r173, %r28; bra.uni BB0_20; BB0_19: add.s32 %r174, %r28, -1; min.s32 %r214, %r22, %r174; BB0_20: mad.lo.s32 %r179, %r214, %r27, %r36; mad.lo.s32 %r184, %r179, %r26, %r32; cvt.s64.s32 %rd38, %r184; add.s64 %rd39, %rd4, %rd38; ld.global.nc.u8 %rs23, [%rd39]; setp.gt.u16 %p18, %rs23, %rs1; cvt.u32.u16 %r185, %rs23; and.b32 %r186, %r185, 255; selp.b32 %r189, %r62, %r186, %p18; selp.b32 %r190, %r186, %r62, %p18; add.s32 %r191, %r190, 1; mul.lo.s32 %r192, %r191, %r190; shr.u32 %r193, %r192, 1; add.s32 %r194, %r193, %r189; mul.wide.s32 %rd41, %r194, 4; add.s64 %rd42, %rd10, %rd41; ld.global.nc.f32 %f13, [%rd42]; add.f32 %f14, %f5, %f13; BB0_21: mad.lo.s32 %r208, %r49, %r26, %r32; cvta.to.global.u64 %rd43, %rd1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd43, %rd44; st.global.f32 [%rd45], %f14; BB0_22: ret; } ` exchangedecode_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 
exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<19>; .reg .b16 %rs<26>; .reg .f32 %f<15>; .reg .b32 %r<215>; .reg .b64 %rd<46>; ld.param.u64 %rd1, [exchangedecode_param_0]; ld.param.u64 %rd2, [exchangedecode_param_1]; ld.param.u64 %rd3, [exchangedecode_param_2]; ld.param.u32 %r26, [exchangedecode_param_6]; ld.param.u32 %r27, [exchangedecode_param_7]; ld.param.u32 %r28, [exchangedecode_param_8]; ld.param.u8 %rs5, [exchangedecode_param_9]; mov.u32 %r29, %ctaid.x; mov.u32 %r30, %ntid.x; mov.u32 %r31, %tid.x; mad.lo.s32 %r32, %r30, %r29, %r31; mov.u32 %r33, %ntid.y; mov.u32 %r34, %ctaid.y; mov.u32 %r35, %tid.y; mad.lo.s32 %r36, %r33, %r34, %r35; mov.u32 %r37, %ntid.z; mov.u32 %r38, %ctaid.z; mov.u32 %r39, %tid.z; mad.lo.s32 %r40, %r37, %r38, %r39; setp.ge.s32 %p1, %r36, %r27; setp.ge.s32 %p2, %r32, %r26; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r40, %r28; or.pred %p5, %p3, %p4; @%p5 bra BB0_22; mad.lo.s32 %r49, %r40, %r27, %r36; mul.lo.s32 %r1, %r49, %r26; add.s32 %r54, %r1, %r32; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r54; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r2, %r32, -1; @%p6 bra BB0_3; rem.s32 %r55, %r2, %r26; add.s32 %r56, %r55, %r26; rem.s32 %r209, %r56, %r26; bra.uni BB0_4; BB0_3: mov.u32 %r57, 0; max.s32 %r209, %r2, %r57; BB0_4: add.s32 %r58, %r209, %r1; cvt.s64.s32 %rd8, %r58; add.s64 %rd9, %rd4, %rd8; ld.global.nc.u8 %rs6, [%rd9]; setp.gt.u16 %p7, %rs6, %rs1; cvt.u32.u16 %r59, %rs6; and.b32 %r60, %r59, 255; cvt.u32.u16 %r61, %rs1; and.b32 %r62, %r61, 255; selp.b32 %r63, %r62, %r60, %p7; selp.b32 %r64, %r60, %r62, %p7; add.s32 %r65, %r64, 1; mul.lo.s32 %r66, %r65, %r64; shr.u32 %r67, %r66, 1; add.s32 %r68, %r67, %r63; cvta.to.global.u64 %rd10, %rd2; mul.wide.s32 %rd11, %r68, 4; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f1, [%rd12]; add.s32 %r6, %r32, 1; @%p6 bra BB0_6; 
rem.s32 %r73, %r6, %r26; add.s32 %r74, %r73, %r26; rem.s32 %r210, %r74, %r26; bra.uni BB0_7; BB0_6: add.s32 %r75, %r26, -1; min.s32 %r210, %r6, %r75; BB0_7: add.s32 %r76, %r210, %r1; cvt.s64.s32 %rd14, %r76; add.s64 %rd15, %rd4, %rd14; ld.global.nc.u8 %rs9, [%rd15]; setp.gt.u16 %p9, %rs9, %rs1; cvt.u32.u16 %r77, %rs9; and.b32 %r78, %r77, 255; selp.b32 %r81, %r62, %r78, %p9; selp.b32 %r82, %r78, %r62, %p9; add.s32 %r83, %r82, 1; mul.lo.s32 %r84, %r83, %r82; shr.u32 %r85, %r84, 1; add.s32 %r86, %r85, %r81; mul.wide.s32 %rd17, %r86, 4; add.s64 %rd18, %rd10, %rd17; ld.global.nc.f32 %f8, [%rd18]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p10, %rs3, 0; add.s32 %r10, %r36, -1; @%p10 bra BB0_9; rem.s32 %r91, %r10, %r27; add.s32 %r92, %r91, %r27; rem.s32 %r211, %r92, %r27; bra.uni BB0_10; BB0_9: mov.u32 %r93, 0; max.s32 %r211, %r10, %r93; BB0_10: mad.lo.s32 %r98, %r40, %r27, %r211; mad.lo.s32 %r103, %r98, %r26, %r32; cvt.s64.s32 %rd20, %r103; add.s64 %rd21, %rd4, %rd20; ld.global.nc.u8 %rs12, [%rd21]; setp.gt.u16 %p11, %rs12, %rs1; cvt.u32.u16 %r104, %rs12; and.b32 %r105, %r104, 255; selp.b32 %r108, %r62, %r105, %p11; selp.b32 %r109, %r105, %r62, %p11; add.s32 %r110, %r109, 1; mul.lo.s32 %r111, %r110, %r109; shr.u32 %r112, %r111, 1; add.s32 %r113, %r112, %r108; mul.wide.s32 %rd23, %r113, 4; add.s64 %rd24, %rd10, %rd23; ld.global.nc.f32 %f10, [%rd24]; add.f32 %f3, %f2, %f10; add.s32 %r14, %r36, 1; @%p10 bra BB0_12; rem.s32 %r118, %r14, %r27; add.s32 %r119, %r118, %r27; rem.s32 %r212, %r119, %r27; bra.uni BB0_13; BB0_12: add.s32 %r120, %r27, -1; min.s32 %r212, %r14, %r120; BB0_13: mad.lo.s32 %r125, %r40, %r27, %r212; mad.lo.s32 %r130, %r125, %r26, %r32; cvt.s64.s32 %rd26, %r130; add.s64 %rd27, %rd4, %rd26; ld.global.nc.u8 %rs16, [%rd27]; setp.gt.u16 %p13, %rs16, %rs1; cvt.u32.u16 %r131, %rs16; and.b32 %r132, %r131, 255; selp.b32 %r135, %r62, %r132, %p13; selp.b32 %r136, %r132, %r62, %p13; add.s32 %r137, %r136, 1; mul.lo.s32 
%r138, %r137, %r136; shr.u32 %r139, %r138, 1; add.s32 %r140, %r139, %r135; mul.wide.s32 %rd29, %r140, 4; add.s64 %rd30, %rd10, %rd29; ld.global.nc.f32 %f11, [%rd30]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p14, %r28, 1; @%p14 bra BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p15, %rs4, 0; add.s32 %r18, %r40, -1; @%p15 bra BB0_16; rem.s32 %r145, %r18, %r28; add.s32 %r146, %r145, %r28; rem.s32 %r213, %r146, %r28; bra.uni BB0_17; BB0_16: mov.u32 %r147, 0; max.s32 %r213, %r18, %r147; BB0_17: mad.lo.s32 %r152, %r213, %r27, %r36; mad.lo.s32 %r157, %r152, %r26, %r32; cvt.s64.s32 %rd32, %r157; add.s64 %rd33, %rd4, %rd32; ld.global.nc.u8 %rs19, [%rd33]; setp.gt.u16 %p16, %rs19, %rs1; cvt.u32.u16 %r158, %rs19; and.b32 %r159, %r158, 255; selp.b32 %r162, %r62, %r159, %p16; selp.b32 %r163, %r159, %r62, %p16; add.s32 %r164, %r163, 1; mul.lo.s32 %r165, %r164, %r163; shr.u32 %r166, %r165, 1; add.s32 %r167, %r166, %r162; mul.wide.s32 %rd35, %r167, 4; add.s64 %rd36, %rd10, %rd35; ld.global.nc.f32 %f12, [%rd36]; add.f32 %f5, %f14, %f12; add.s32 %r22, %r40, 1; @%p15 bra BB0_19; rem.s32 %r172, %r22, %r28; add.s32 %r173, %r172, %r28; rem.s32 %r214, %r173, %r28; bra.uni BB0_20; BB0_19: add.s32 %r174, %r28, -1; min.s32 %r214, %r22, %r174; BB0_20: mad.lo.s32 %r179, %r214, %r27, %r36; mad.lo.s32 %r184, %r179, %r26, %r32; cvt.s64.s32 %rd38, %r184; add.s64 %rd39, %rd4, %rd38; ld.global.nc.u8 %rs23, [%rd39]; setp.gt.u16 %p18, %rs23, %rs1; cvt.u32.u16 %r185, %rs23; and.b32 %r186, %r185, 255; selp.b32 %r189, %r62, %r186, %p18; selp.b32 %r190, %r186, %r62, %p18; add.s32 %r191, %r190, 1; mul.lo.s32 %r192, %r191, %r190; shr.u32 %r193, %r192, 1; add.s32 %r194, %r193, %r189; mul.wide.s32 %rd41, %r194, 4; add.s64 %rd42, %rd10, %rd41; ld.global.nc.f32 %f13, [%rd42]; add.f32 %f14, %f5, %f13; BB0_21: mad.lo.s32 %r208, %r49, %r26, %r32; cvta.to.global.u64 %rd43, %rd1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd43, %rd44; st.global.f32 [%rd45], %f14; BB0_22: ret; } ` exchangedecode_ptx_60 = ` .version 6.5 
.target sm_60 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<19>; .reg .b16 %rs<26>; .reg .f32 %f<15>; .reg .b32 %r<215>; .reg .b64 %rd<46>; ld.param.u64 %rd1, [exchangedecode_param_0]; ld.param.u64 %rd2, [exchangedecode_param_1]; ld.param.u64 %rd3, [exchangedecode_param_2]; ld.param.u32 %r26, [exchangedecode_param_6]; ld.param.u32 %r27, [exchangedecode_param_7]; ld.param.u32 %r28, [exchangedecode_param_8]; ld.param.u8 %rs5, [exchangedecode_param_9]; mov.u32 %r29, %ctaid.x; mov.u32 %r30, %ntid.x; mov.u32 %r31, %tid.x; mad.lo.s32 %r32, %r30, %r29, %r31; mov.u32 %r33, %ntid.y; mov.u32 %r34, %ctaid.y; mov.u32 %r35, %tid.y; mad.lo.s32 %r36, %r33, %r34, %r35; mov.u32 %r37, %ntid.z; mov.u32 %r38, %ctaid.z; mov.u32 %r39, %tid.z; mad.lo.s32 %r40, %r37, %r38, %r39; setp.ge.s32 %p1, %r36, %r27; setp.ge.s32 %p2, %r32, %r26; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r40, %r28; or.pred %p5, %p3, %p4; @%p5 bra BB0_22; mad.lo.s32 %r49, %r40, %r27, %r36; mul.lo.s32 %r1, %r49, %r26; add.s32 %r54, %r1, %r32; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r54; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r2, %r32, -1; @%p6 bra BB0_3; rem.s32 %r55, %r2, %r26; add.s32 %r56, %r55, %r26; rem.s32 %r209, %r56, %r26; bra.uni BB0_4; BB0_3: mov.u32 %r57, 0; max.s32 %r209, %r2, %r57; BB0_4: add.s32 %r58, %r209, %r1; cvt.s64.s32 %rd8, %r58; add.s64 %rd9, %rd4, %rd8; ld.global.nc.u8 %rs6, [%rd9]; setp.gt.u16 %p7, %rs6, %rs1; cvt.u32.u16 %r59, %rs6; and.b32 %r60, %r59, 255; cvt.u32.u16 %r61, %rs1; and.b32 %r62, %r61, 255; selp.b32 
%r63, %r62, %r60, %p7; selp.b32 %r64, %r60, %r62, %p7; add.s32 %r65, %r64, 1; mul.lo.s32 %r66, %r65, %r64; shr.u32 %r67, %r66, 1; add.s32 %r68, %r67, %r63; cvta.to.global.u64 %rd10, %rd2; mul.wide.s32 %rd11, %r68, 4; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f1, [%rd12]; add.s32 %r6, %r32, 1; @%p6 bra BB0_6; rem.s32 %r73, %r6, %r26; add.s32 %r74, %r73, %r26; rem.s32 %r210, %r74, %r26; bra.uni BB0_7; BB0_6: add.s32 %r75, %r26, -1; min.s32 %r210, %r6, %r75; BB0_7: add.s32 %r76, %r210, %r1; cvt.s64.s32 %rd14, %r76; add.s64 %rd15, %rd4, %rd14; ld.global.nc.u8 %rs9, [%rd15]; setp.gt.u16 %p9, %rs9, %rs1; cvt.u32.u16 %r77, %rs9; and.b32 %r78, %r77, 255; selp.b32 %r81, %r62, %r78, %p9; selp.b32 %r82, %r78, %r62, %p9; add.s32 %r83, %r82, 1; mul.lo.s32 %r84, %r83, %r82; shr.u32 %r85, %r84, 1; add.s32 %r86, %r85, %r81; mul.wide.s32 %rd17, %r86, 4; add.s64 %rd18, %rd10, %rd17; ld.global.nc.f32 %f8, [%rd18]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p10, %rs3, 0; add.s32 %r10, %r36, -1; @%p10 bra BB0_9; rem.s32 %r91, %r10, %r27; add.s32 %r92, %r91, %r27; rem.s32 %r211, %r92, %r27; bra.uni BB0_10; BB0_9: mov.u32 %r93, 0; max.s32 %r211, %r10, %r93; BB0_10: mad.lo.s32 %r98, %r40, %r27, %r211; mad.lo.s32 %r103, %r98, %r26, %r32; cvt.s64.s32 %rd20, %r103; add.s64 %rd21, %rd4, %rd20; ld.global.nc.u8 %rs12, [%rd21]; setp.gt.u16 %p11, %rs12, %rs1; cvt.u32.u16 %r104, %rs12; and.b32 %r105, %r104, 255; selp.b32 %r108, %r62, %r105, %p11; selp.b32 %r109, %r105, %r62, %p11; add.s32 %r110, %r109, 1; mul.lo.s32 %r111, %r110, %r109; shr.u32 %r112, %r111, 1; add.s32 %r113, %r112, %r108; mul.wide.s32 %rd23, %r113, 4; add.s64 %rd24, %rd10, %rd23; ld.global.nc.f32 %f10, [%rd24]; add.f32 %f3, %f2, %f10; add.s32 %r14, %r36, 1; @%p10 bra BB0_12; rem.s32 %r118, %r14, %r27; add.s32 %r119, %r118, %r27; rem.s32 %r212, %r119, %r27; bra.uni BB0_13; BB0_12: add.s32 %r120, %r27, -1; min.s32 %r212, %r14, %r120; BB0_13: mad.lo.s32 %r125, %r40, %r27, %r212; 
mad.lo.s32 %r130, %r125, %r26, %r32; cvt.s64.s32 %rd26, %r130; add.s64 %rd27, %rd4, %rd26; ld.global.nc.u8 %rs16, [%rd27]; setp.gt.u16 %p13, %rs16, %rs1; cvt.u32.u16 %r131, %rs16; and.b32 %r132, %r131, 255; selp.b32 %r135, %r62, %r132, %p13; selp.b32 %r136, %r132, %r62, %p13; add.s32 %r137, %r136, 1; mul.lo.s32 %r138, %r137, %r136; shr.u32 %r139, %r138, 1; add.s32 %r140, %r139, %r135; mul.wide.s32 %rd29, %r140, 4; add.s64 %rd30, %rd10, %rd29; ld.global.nc.f32 %f11, [%rd30]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p14, %r28, 1; @%p14 bra BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p15, %rs4, 0; add.s32 %r18, %r40, -1; @%p15 bra BB0_16; rem.s32 %r145, %r18, %r28; add.s32 %r146, %r145, %r28; rem.s32 %r213, %r146, %r28; bra.uni BB0_17; BB0_16: mov.u32 %r147, 0; max.s32 %r213, %r18, %r147; BB0_17: mad.lo.s32 %r152, %r213, %r27, %r36; mad.lo.s32 %r157, %r152, %r26, %r32; cvt.s64.s32 %rd32, %r157; add.s64 %rd33, %rd4, %rd32; ld.global.nc.u8 %rs19, [%rd33]; setp.gt.u16 %p16, %rs19, %rs1; cvt.u32.u16 %r158, %rs19; and.b32 %r159, %r158, 255; selp.b32 %r162, %r62, %r159, %p16; selp.b32 %r163, %r159, %r62, %p16; add.s32 %r164, %r163, 1; mul.lo.s32 %r165, %r164, %r163; shr.u32 %r166, %r165, 1; add.s32 %r167, %r166, %r162; mul.wide.s32 %rd35, %r167, 4; add.s64 %rd36, %rd10, %rd35; ld.global.nc.f32 %f12, [%rd36]; add.f32 %f5, %f14, %f12; add.s32 %r22, %r40, 1; @%p15 bra BB0_19; rem.s32 %r172, %r22, %r28; add.s32 %r173, %r172, %r28; rem.s32 %r214, %r173, %r28; bra.uni BB0_20; BB0_19: add.s32 %r174, %r28, -1; min.s32 %r214, %r22, %r174; BB0_20: mad.lo.s32 %r179, %r214, %r27, %r36; mad.lo.s32 %r184, %r179, %r26, %r32; cvt.s64.s32 %rd38, %r184; add.s64 %rd39, %rd4, %rd38; ld.global.nc.u8 %rs23, [%rd39]; setp.gt.u16 %p18, %rs23, %rs1; cvt.u32.u16 %r185, %rs23; and.b32 %r186, %r185, 255; selp.b32 %r189, %r62, %r186, %p18; selp.b32 %r190, %r186, %r62, %p18; add.s32 %r191, %r190, 1; mul.lo.s32 %r192, %r191, %r190; shr.u32 %r193, %r192, 1; add.s32 %r194, %r193, %r189; mul.wide.s32 %rd41, 
%r194, 4; add.s64 %rd42, %rd10, %rd41; ld.global.nc.f32 %f13, [%rd42]; add.f32 %f14, %f5, %f13; BB0_21: mad.lo.s32 %r208, %r49, %r26, %r32; cvta.to.global.u64 %rd43, %rd1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd43, %rd44; st.global.f32 [%rd45], %f14; BB0_22: ret; } ` exchangedecode_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<19>; .reg .b16 %rs<26>; .reg .f32 %f<15>; .reg .b32 %r<215>; .reg .b64 %rd<46>; ld.param.u64 %rd1, [exchangedecode_param_0]; ld.param.u64 %rd2, [exchangedecode_param_1]; ld.param.u64 %rd3, [exchangedecode_param_2]; ld.param.u32 %r26, [exchangedecode_param_6]; ld.param.u32 %r27, [exchangedecode_param_7]; ld.param.u32 %r28, [exchangedecode_param_8]; ld.param.u8 %rs5, [exchangedecode_param_9]; mov.u32 %r29, %ctaid.x; mov.u32 %r30, %ntid.x; mov.u32 %r31, %tid.x; mad.lo.s32 %r32, %r30, %r29, %r31; mov.u32 %r33, %ntid.y; mov.u32 %r34, %ctaid.y; mov.u32 %r35, %tid.y; mad.lo.s32 %r36, %r33, %r34, %r35; mov.u32 %r37, %ntid.z; mov.u32 %r38, %ctaid.z; mov.u32 %r39, %tid.z; mad.lo.s32 %r40, %r37, %r38, %r39; setp.ge.s32 %p1, %r36, %r27; setp.ge.s32 %p2, %r32, %r26; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r40, %r28; or.pred %p5, %p3, %p4; @%p5 bra BB0_22; mad.lo.s32 %r49, %r40, %r27, %r36; mul.lo.s32 %r1, %r49, %r26; add.s32 %r54, %r1, %r32; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r54; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r2, %r32, -1; @%p6 bra BB0_3; rem.s32 %r55, %r2, %r26; add.s32 %r56, %r55, %r26; rem.s32 %r209, %r56, 
%r26; bra.uni BB0_4; BB0_3: mov.u32 %r57, 0; max.s32 %r209, %r2, %r57; BB0_4: add.s32 %r58, %r209, %r1; cvt.s64.s32 %rd8, %r58; add.s64 %rd9, %rd4, %rd8; ld.global.nc.u8 %rs6, [%rd9]; setp.gt.u16 %p7, %rs6, %rs1; cvt.u32.u16 %r59, %rs6; and.b32 %r60, %r59, 255; cvt.u32.u16 %r61, %rs1; and.b32 %r62, %r61, 255; selp.b32 %r63, %r62, %r60, %p7; selp.b32 %r64, %r60, %r62, %p7; add.s32 %r65, %r64, 1; mul.lo.s32 %r66, %r65, %r64; shr.u32 %r67, %r66, 1; add.s32 %r68, %r67, %r63; cvta.to.global.u64 %rd10, %rd2; mul.wide.s32 %rd11, %r68, 4; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f1, [%rd12]; add.s32 %r6, %r32, 1; @%p6 bra BB0_6; rem.s32 %r73, %r6, %r26; add.s32 %r74, %r73, %r26; rem.s32 %r210, %r74, %r26; bra.uni BB0_7; BB0_6: add.s32 %r75, %r26, -1; min.s32 %r210, %r6, %r75; BB0_7: add.s32 %r76, %r210, %r1; cvt.s64.s32 %rd14, %r76; add.s64 %rd15, %rd4, %rd14; ld.global.nc.u8 %rs9, [%rd15]; setp.gt.u16 %p9, %rs9, %rs1; cvt.u32.u16 %r77, %rs9; and.b32 %r78, %r77, 255; selp.b32 %r81, %r62, %r78, %p9; selp.b32 %r82, %r78, %r62, %p9; add.s32 %r83, %r82, 1; mul.lo.s32 %r84, %r83, %r82; shr.u32 %r85, %r84, 1; add.s32 %r86, %r85, %r81; mul.wide.s32 %rd17, %r86, 4; add.s64 %rd18, %rd10, %rd17; ld.global.nc.f32 %f8, [%rd18]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p10, %rs3, 0; add.s32 %r10, %r36, -1; @%p10 bra BB0_9; rem.s32 %r91, %r10, %r27; add.s32 %r92, %r91, %r27; rem.s32 %r211, %r92, %r27; bra.uni BB0_10; BB0_9: mov.u32 %r93, 0; max.s32 %r211, %r10, %r93; BB0_10: mad.lo.s32 %r98, %r40, %r27, %r211; mad.lo.s32 %r103, %r98, %r26, %r32; cvt.s64.s32 %rd20, %r103; add.s64 %rd21, %rd4, %rd20; ld.global.nc.u8 %rs12, [%rd21]; setp.gt.u16 %p11, %rs12, %rs1; cvt.u32.u16 %r104, %rs12; and.b32 %r105, %r104, 255; selp.b32 %r108, %r62, %r105, %p11; selp.b32 %r109, %r105, %r62, %p11; add.s32 %r110, %r109, 1; mul.lo.s32 %r111, %r110, %r109; shr.u32 %r112, %r111, 1; add.s32 %r113, %r112, %r108; mul.wide.s32 %rd23, %r113, 4; add.s64 %rd24, 
%rd10, %rd23; ld.global.nc.f32 %f10, [%rd24]; add.f32 %f3, %f2, %f10; add.s32 %r14, %r36, 1; @%p10 bra BB0_12; rem.s32 %r118, %r14, %r27; add.s32 %r119, %r118, %r27; rem.s32 %r212, %r119, %r27; bra.uni BB0_13; BB0_12: add.s32 %r120, %r27, -1; min.s32 %r212, %r14, %r120; BB0_13: mad.lo.s32 %r125, %r40, %r27, %r212; mad.lo.s32 %r130, %r125, %r26, %r32; cvt.s64.s32 %rd26, %r130; add.s64 %rd27, %rd4, %rd26; ld.global.nc.u8 %rs16, [%rd27]; setp.gt.u16 %p13, %rs16, %rs1; cvt.u32.u16 %r131, %rs16; and.b32 %r132, %r131, 255; selp.b32 %r135, %r62, %r132, %p13; selp.b32 %r136, %r132, %r62, %p13; add.s32 %r137, %r136, 1; mul.lo.s32 %r138, %r137, %r136; shr.u32 %r139, %r138, 1; add.s32 %r140, %r139, %r135; mul.wide.s32 %rd29, %r140, 4; add.s64 %rd30, %rd10, %rd29; ld.global.nc.f32 %f11, [%rd30]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p14, %r28, 1; @%p14 bra BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p15, %rs4, 0; add.s32 %r18, %r40, -1; @%p15 bra BB0_16; rem.s32 %r145, %r18, %r28; add.s32 %r146, %r145, %r28; rem.s32 %r213, %r146, %r28; bra.uni BB0_17; BB0_16: mov.u32 %r147, 0; max.s32 %r213, %r18, %r147; BB0_17: mad.lo.s32 %r152, %r213, %r27, %r36; mad.lo.s32 %r157, %r152, %r26, %r32; cvt.s64.s32 %rd32, %r157; add.s64 %rd33, %rd4, %rd32; ld.global.nc.u8 %rs19, [%rd33]; setp.gt.u16 %p16, %rs19, %rs1; cvt.u32.u16 %r158, %rs19; and.b32 %r159, %r158, 255; selp.b32 %r162, %r62, %r159, %p16; selp.b32 %r163, %r159, %r62, %p16; add.s32 %r164, %r163, 1; mul.lo.s32 %r165, %r164, %r163; shr.u32 %r166, %r165, 1; add.s32 %r167, %r166, %r162; mul.wide.s32 %rd35, %r167, 4; add.s64 %rd36, %rd10, %rd35; ld.global.nc.f32 %f12, [%rd36]; add.f32 %f5, %f14, %f12; add.s32 %r22, %r40, 1; @%p15 bra BB0_19; rem.s32 %r172, %r22, %r28; add.s32 %r173, %r172, %r28; rem.s32 %r214, %r173, %r28; bra.uni BB0_20; BB0_19: add.s32 %r174, %r28, -1; min.s32 %r214, %r22, %r174; BB0_20: mad.lo.s32 %r179, %r214, %r27, %r36; mad.lo.s32 %r184, %r179, %r26, %r32; cvt.s64.s32 %rd38, %r184; add.s64 %rd39, %rd4, %rd38; 
ld.global.nc.u8 %rs23, [%rd39]; setp.gt.u16 %p18, %rs23, %rs1; cvt.u32.u16 %r185, %rs23; and.b32 %r186, %r185, 255; selp.b32 %r189, %r62, %r186, %p18; selp.b32 %r190, %r186, %r62, %p18; add.s32 %r191, %r190, 1; mul.lo.s32 %r192, %r191, %r190; shr.u32 %r193, %r192, 1; add.s32 %r194, %r193, %r189; mul.wide.s32 %rd41, %r194, 4; add.s64 %rd42, %rd10, %rd41; ld.global.nc.f32 %f13, [%rd42]; add.f32 %f14, %f5, %f13; BB0_21: mad.lo.s32 %r208, %r49, %r26, %r32; cvta.to.global.u64 %rd43, %rd1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd43, %rd44; st.global.f32 [%rd45], %f14; BB0_22: ret; } ` exchangedecode_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<19>; .reg .b16 %rs<26>; .reg .f32 %f<15>; .reg .b32 %r<215>; .reg .b64 %rd<46>; ld.param.u64 %rd1, [exchangedecode_param_0]; ld.param.u64 %rd2, [exchangedecode_param_1]; ld.param.u64 %rd3, [exchangedecode_param_2]; ld.param.u32 %r26, [exchangedecode_param_6]; ld.param.u32 %r27, [exchangedecode_param_7]; ld.param.u32 %r28, [exchangedecode_param_8]; ld.param.u8 %rs5, [exchangedecode_param_9]; mov.u32 %r29, %ctaid.x; mov.u32 %r30, %ntid.x; mov.u32 %r31, %tid.x; mad.lo.s32 %r32, %r30, %r29, %r31; mov.u32 %r33, %ntid.y; mov.u32 %r34, %ctaid.y; mov.u32 %r35, %tid.y; mad.lo.s32 %r36, %r33, %r34, %r35; mov.u32 %r37, %ntid.z; mov.u32 %r38, %ctaid.z; mov.u32 %r39, %tid.z; mad.lo.s32 %r40, %r37, %r38, %r39; setp.ge.s32 %p1, %r36, %r27; setp.ge.s32 %p2, %r32, %r26; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r40, %r28; or.pred %p5, %p3, %p4; @%p5 bra BB0_22; mad.lo.s32 %r49, %r40, %r27, %r36; mul.lo.s32 
%r1, %r49, %r26; add.s32 %r54, %r1, %r32; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r54; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r2, %r32, -1; @%p6 bra BB0_3; rem.s32 %r55, %r2, %r26; add.s32 %r56, %r55, %r26; rem.s32 %r209, %r56, %r26; bra.uni BB0_4; BB0_3: mov.u32 %r57, 0; max.s32 %r209, %r2, %r57; BB0_4: add.s32 %r58, %r209, %r1; cvt.s64.s32 %rd8, %r58; add.s64 %rd9, %rd4, %rd8; ld.global.nc.u8 %rs6, [%rd9]; setp.gt.u16 %p7, %rs6, %rs1; cvt.u32.u16 %r59, %rs6; and.b32 %r60, %r59, 255; cvt.u32.u16 %r61, %rs1; and.b32 %r62, %r61, 255; selp.b32 %r63, %r62, %r60, %p7; selp.b32 %r64, %r60, %r62, %p7; add.s32 %r65, %r64, 1; mul.lo.s32 %r66, %r65, %r64; shr.u32 %r67, %r66, 1; add.s32 %r68, %r67, %r63; cvta.to.global.u64 %rd10, %rd2; mul.wide.s32 %rd11, %r68, 4; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f1, [%rd12]; add.s32 %r6, %r32, 1; @%p6 bra BB0_6; rem.s32 %r73, %r6, %r26; add.s32 %r74, %r73, %r26; rem.s32 %r210, %r74, %r26; bra.uni BB0_7; BB0_6: add.s32 %r75, %r26, -1; min.s32 %r210, %r6, %r75; BB0_7: add.s32 %r76, %r210, %r1; cvt.s64.s32 %rd14, %r76; add.s64 %rd15, %rd4, %rd14; ld.global.nc.u8 %rs9, [%rd15]; setp.gt.u16 %p9, %rs9, %rs1; cvt.u32.u16 %r77, %rs9; and.b32 %r78, %r77, 255; selp.b32 %r81, %r62, %r78, %p9; selp.b32 %r82, %r78, %r62, %p9; add.s32 %r83, %r82, 1; mul.lo.s32 %r84, %r83, %r82; shr.u32 %r85, %r84, 1; add.s32 %r86, %r85, %r81; mul.wide.s32 %rd17, %r86, 4; add.s64 %rd18, %rd10, %rd17; ld.global.nc.f32 %f8, [%rd18]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p10, %rs3, 0; add.s32 %r10, %r36, -1; @%p10 bra BB0_9; rem.s32 %r91, %r10, %r27; add.s32 %r92, %r91, %r27; rem.s32 %r211, %r92, %r27; bra.uni BB0_10; BB0_9: mov.u32 %r93, 0; max.s32 %r211, %r10, %r93; BB0_10: mad.lo.s32 %r98, %r40, %r27, %r211; mad.lo.s32 %r103, %r98, %r26, %r32; cvt.s64.s32 %rd20, %r103; add.s64 %rd21, %rd4, %rd20; ld.global.nc.u8 %rs12, 
[%rd21]; setp.gt.u16 %p11, %rs12, %rs1; cvt.u32.u16 %r104, %rs12; and.b32 %r105, %r104, 255; selp.b32 %r108, %r62, %r105, %p11; selp.b32 %r109, %r105, %r62, %p11; add.s32 %r110, %r109, 1; mul.lo.s32 %r111, %r110, %r109; shr.u32 %r112, %r111, 1; add.s32 %r113, %r112, %r108; mul.wide.s32 %rd23, %r113, 4; add.s64 %rd24, %rd10, %rd23; ld.global.nc.f32 %f10, [%rd24]; add.f32 %f3, %f2, %f10; add.s32 %r14, %r36, 1; @%p10 bra BB0_12; rem.s32 %r118, %r14, %r27; add.s32 %r119, %r118, %r27; rem.s32 %r212, %r119, %r27; bra.uni BB0_13; BB0_12: add.s32 %r120, %r27, -1; min.s32 %r212, %r14, %r120; BB0_13: mad.lo.s32 %r125, %r40, %r27, %r212; mad.lo.s32 %r130, %r125, %r26, %r32; cvt.s64.s32 %rd26, %r130; add.s64 %rd27, %rd4, %rd26; ld.global.nc.u8 %rs16, [%rd27]; setp.gt.u16 %p13, %rs16, %rs1; cvt.u32.u16 %r131, %rs16; and.b32 %r132, %r131, 255; selp.b32 %r135, %r62, %r132, %p13; selp.b32 %r136, %r132, %r62, %p13; add.s32 %r137, %r136, 1; mul.lo.s32 %r138, %r137, %r136; shr.u32 %r139, %r138, 1; add.s32 %r140, %r139, %r135; mul.wide.s32 %rd29, %r140, 4; add.s64 %rd30, %rd10, %rd29; ld.global.nc.f32 %f11, [%rd30]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p14, %r28, 1; @%p14 bra BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p15, %rs4, 0; add.s32 %r18, %r40, -1; @%p15 bra BB0_16; rem.s32 %r145, %r18, %r28; add.s32 %r146, %r145, %r28; rem.s32 %r213, %r146, %r28; bra.uni BB0_17; BB0_16: mov.u32 %r147, 0; max.s32 %r213, %r18, %r147; BB0_17: mad.lo.s32 %r152, %r213, %r27, %r36; mad.lo.s32 %r157, %r152, %r26, %r32; cvt.s64.s32 %rd32, %r157; add.s64 %rd33, %rd4, %rd32; ld.global.nc.u8 %rs19, [%rd33]; setp.gt.u16 %p16, %rs19, %rs1; cvt.u32.u16 %r158, %rs19; and.b32 %r159, %r158, 255; selp.b32 %r162, %r62, %r159, %p16; selp.b32 %r163, %r159, %r62, %p16; add.s32 %r164, %r163, 1; mul.lo.s32 %r165, %r164, %r163; shr.u32 %r166, %r165, 1; add.s32 %r167, %r166, %r162; mul.wide.s32 %rd35, %r167, 4; add.s64 %rd36, %rd10, %rd35; ld.global.nc.f32 %f12, [%rd36]; add.f32 %f5, %f14, %f12; add.s32 %r22, %r40, 
1; @%p15 bra BB0_19; rem.s32 %r172, %r22, %r28; add.s32 %r173, %r172, %r28; rem.s32 %r214, %r173, %r28; bra.uni BB0_20; BB0_19: add.s32 %r174, %r28, -1; min.s32 %r214, %r22, %r174; BB0_20: mad.lo.s32 %r179, %r214, %r27, %r36; mad.lo.s32 %r184, %r179, %r26, %r32; cvt.s64.s32 %rd38, %r184; add.s64 %rd39, %rd4, %rd38; ld.global.nc.u8 %rs23, [%rd39]; setp.gt.u16 %p18, %rs23, %rs1; cvt.u32.u16 %r185, %rs23; and.b32 %r186, %r185, 255; selp.b32 %r189, %r62, %r186, %p18; selp.b32 %r190, %r186, %r62, %p18; add.s32 %r191, %r190, 1; mul.lo.s32 %r192, %r191, %r190; shr.u32 %r193, %r192, 1; add.s32 %r194, %r193, %r189; mul.wide.s32 %rd41, %r194, 4; add.s64 %rd42, %rd10, %rd41; ld.global.nc.f32 %f13, [%rd42]; add.f32 %f14, %f5, %f13; BB0_21: mad.lo.s32 %r208, %r49, %r26, %r32; cvta.to.global.u64 %rd43, %rd1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd43, %rd44; st.global.f32 [%rd45], %f14; BB0_22: ret; } ` exchangedecode_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<19>; .reg .b16 %rs<26>; .reg .f32 %f<15>; .reg .b32 %r<215>; .reg .b64 %rd<46>; ld.param.u64 %rd1, [exchangedecode_param_0]; ld.param.u64 %rd2, [exchangedecode_param_1]; ld.param.u64 %rd3, [exchangedecode_param_2]; ld.param.u32 %r26, [exchangedecode_param_6]; ld.param.u32 %r27, [exchangedecode_param_7]; ld.param.u32 %r28, [exchangedecode_param_8]; ld.param.u8 %rs5, [exchangedecode_param_9]; mov.u32 %r29, %ctaid.x; mov.u32 %r30, %ntid.x; mov.u32 %r31, %tid.x; mad.lo.s32 %r32, %r30, %r29, %r31; mov.u32 %r33, %ntid.y; mov.u32 %r34, %ctaid.y; mov.u32 %r35, %tid.y; mad.lo.s32 %r36, 
%r33, %r34, %r35; mov.u32 %r37, %ntid.z; mov.u32 %r38, %ctaid.z; mov.u32 %r39, %tid.z; mad.lo.s32 %r40, %r37, %r38, %r39; setp.ge.s32 %p1, %r36, %r27; setp.ge.s32 %p2, %r32, %r26; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r40, %r28; or.pred %p5, %p3, %p4; @%p5 bra BB0_22; mad.lo.s32 %r49, %r40, %r27, %r36; mul.lo.s32 %r1, %r49, %r26; add.s32 %r54, %r1, %r32; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r54; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r2, %r32, -1; @%p6 bra BB0_3; rem.s32 %r55, %r2, %r26; add.s32 %r56, %r55, %r26; rem.s32 %r209, %r56, %r26; bra.uni BB0_4; BB0_3: mov.u32 %r57, 0; max.s32 %r209, %r2, %r57; BB0_4: add.s32 %r58, %r209, %r1; cvt.s64.s32 %rd8, %r58; add.s64 %rd9, %rd4, %rd8; ld.global.nc.u8 %rs6, [%rd9]; setp.gt.u16 %p7, %rs6, %rs1; cvt.u32.u16 %r59, %rs6; and.b32 %r60, %r59, 255; cvt.u32.u16 %r61, %rs1; and.b32 %r62, %r61, 255; selp.b32 %r63, %r62, %r60, %p7; selp.b32 %r64, %r60, %r62, %p7; add.s32 %r65, %r64, 1; mul.lo.s32 %r66, %r65, %r64; shr.u32 %r67, %r66, 1; add.s32 %r68, %r67, %r63; cvta.to.global.u64 %rd10, %rd2; mul.wide.s32 %rd11, %r68, 4; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f1, [%rd12]; add.s32 %r6, %r32, 1; @%p6 bra BB0_6; rem.s32 %r73, %r6, %r26; add.s32 %r74, %r73, %r26; rem.s32 %r210, %r74, %r26; bra.uni BB0_7; BB0_6: add.s32 %r75, %r26, -1; min.s32 %r210, %r6, %r75; BB0_7: add.s32 %r76, %r210, %r1; cvt.s64.s32 %rd14, %r76; add.s64 %rd15, %rd4, %rd14; ld.global.nc.u8 %rs9, [%rd15]; setp.gt.u16 %p9, %rs9, %rs1; cvt.u32.u16 %r77, %rs9; and.b32 %r78, %r77, 255; selp.b32 %r81, %r62, %r78, %p9; selp.b32 %r82, %r78, %r62, %p9; add.s32 %r83, %r82, 1; mul.lo.s32 %r84, %r83, %r82; shr.u32 %r85, %r84, 1; add.s32 %r86, %r85, %r81; mul.wide.s32 %rd17, %r86, 4; add.s64 %rd18, %rd10, %rd17; ld.global.nc.f32 %f8, [%rd18]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p10, %rs3, 0; add.s32 %r10, %r36, -1; @%p10 
bra BB0_9; rem.s32 %r91, %r10, %r27; add.s32 %r92, %r91, %r27; rem.s32 %r211, %r92, %r27; bra.uni BB0_10; BB0_9: mov.u32 %r93, 0; max.s32 %r211, %r10, %r93; BB0_10: mad.lo.s32 %r98, %r40, %r27, %r211; mad.lo.s32 %r103, %r98, %r26, %r32; cvt.s64.s32 %rd20, %r103; add.s64 %rd21, %rd4, %rd20; ld.global.nc.u8 %rs12, [%rd21]; setp.gt.u16 %p11, %rs12, %rs1; cvt.u32.u16 %r104, %rs12; and.b32 %r105, %r104, 255; selp.b32 %r108, %r62, %r105, %p11; selp.b32 %r109, %r105, %r62, %p11; add.s32 %r110, %r109, 1; mul.lo.s32 %r111, %r110, %r109; shr.u32 %r112, %r111, 1; add.s32 %r113, %r112, %r108; mul.wide.s32 %rd23, %r113, 4; add.s64 %rd24, %rd10, %rd23; ld.global.nc.f32 %f10, [%rd24]; add.f32 %f3, %f2, %f10; add.s32 %r14, %r36, 1; @%p10 bra BB0_12; rem.s32 %r118, %r14, %r27; add.s32 %r119, %r118, %r27; rem.s32 %r212, %r119, %r27; bra.uni BB0_13; BB0_12: add.s32 %r120, %r27, -1; min.s32 %r212, %r14, %r120; BB0_13: mad.lo.s32 %r125, %r40, %r27, %r212; mad.lo.s32 %r130, %r125, %r26, %r32; cvt.s64.s32 %rd26, %r130; add.s64 %rd27, %rd4, %rd26; ld.global.nc.u8 %rs16, [%rd27]; setp.gt.u16 %p13, %rs16, %rs1; cvt.u32.u16 %r131, %rs16; and.b32 %r132, %r131, 255; selp.b32 %r135, %r62, %r132, %p13; selp.b32 %r136, %r132, %r62, %p13; add.s32 %r137, %r136, 1; mul.lo.s32 %r138, %r137, %r136; shr.u32 %r139, %r138, 1; add.s32 %r140, %r139, %r135; mul.wide.s32 %rd29, %r140, 4; add.s64 %rd30, %rd10, %rd29; ld.global.nc.f32 %f11, [%rd30]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p14, %r28, 1; @%p14 bra BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p15, %rs4, 0; add.s32 %r18, %r40, -1; @%p15 bra BB0_16; rem.s32 %r145, %r18, %r28; add.s32 %r146, %r145, %r28; rem.s32 %r213, %r146, %r28; bra.uni BB0_17; BB0_16: mov.u32 %r147, 0; max.s32 %r213, %r18, %r147; BB0_17: mad.lo.s32 %r152, %r213, %r27, %r36; mad.lo.s32 %r157, %r152, %r26, %r32; cvt.s64.s32 %rd32, %r157; add.s64 %rd33, %rd4, %rd32; ld.global.nc.u8 %rs19, [%rd33]; setp.gt.u16 %p16, %rs19, %rs1; cvt.u32.u16 %r158, %rs19; and.b32 %r159, %r158, 255; 
selp.b32 %r162, %r62, %r159, %p16; selp.b32 %r163, %r159, %r62, %p16; add.s32 %r164, %r163, 1; mul.lo.s32 %r165, %r164, %r163; shr.u32 %r166, %r165, 1; add.s32 %r167, %r166, %r162; mul.wide.s32 %rd35, %r167, 4; add.s64 %rd36, %rd10, %rd35; ld.global.nc.f32 %f12, [%rd36]; add.f32 %f5, %f14, %f12; add.s32 %r22, %r40, 1; @%p15 bra BB0_19; rem.s32 %r172, %r22, %r28; add.s32 %r173, %r172, %r28; rem.s32 %r214, %r173, %r28; bra.uni BB0_20; BB0_19: add.s32 %r174, %r28, -1; min.s32 %r214, %r22, %r174; BB0_20: mad.lo.s32 %r179, %r214, %r27, %r36; mad.lo.s32 %r184, %r179, %r26, %r32; cvt.s64.s32 %rd38, %r184; add.s64 %rd39, %rd4, %rd38; ld.global.nc.u8 %rs23, [%rd39]; setp.gt.u16 %p18, %rs23, %rs1; cvt.u32.u16 %r185, %rs23; and.b32 %r186, %r185, 255; selp.b32 %r189, %r62, %r186, %p18; selp.b32 %r190, %r186, %r62, %p18; add.s32 %r191, %r190, 1; mul.lo.s32 %r192, %r191, %r190; shr.u32 %r193, %r192, 1; add.s32 %r194, %r193, %r189; mul.wide.s32 %rd41, %r194, 4; add.s64 %rd42, %rd10, %rd41; ld.global.nc.f32 %f13, [%rd42]; add.f32 %f14, %f5, %f13; BB0_21: mad.lo.s32 %r208, %r49, %r26, %r32; cvta.to.global.u64 %rd43, %rd1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd43, %rd44; st.global.f32 [%rd45], %f14; BB0_22: ret; } ` exchangedecode_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<19>; .reg .b16 %rs<26>; .reg .f32 %f<15>; .reg .b32 %r<215>; .reg .b64 %rd<46>; ld.param.u64 %rd1, [exchangedecode_param_0]; ld.param.u64 %rd2, [exchangedecode_param_1]; ld.param.u64 %rd3, [exchangedecode_param_2]; ld.param.u32 %r26, [exchangedecode_param_6]; 
ld.param.u32 %r27, [exchangedecode_param_7]; ld.param.u32 %r28, [exchangedecode_param_8]; ld.param.u8 %rs5, [exchangedecode_param_9]; mov.u32 %r29, %ctaid.x; mov.u32 %r30, %ntid.x; mov.u32 %r31, %tid.x; mad.lo.s32 %r32, %r30, %r29, %r31; mov.u32 %r33, %ntid.y; mov.u32 %r34, %ctaid.y; mov.u32 %r35, %tid.y; mad.lo.s32 %r36, %r33, %r34, %r35; mov.u32 %r37, %ntid.z; mov.u32 %r38, %ctaid.z; mov.u32 %r39, %tid.z; mad.lo.s32 %r40, %r37, %r38, %r39; setp.ge.s32 %p1, %r36, %r27; setp.ge.s32 %p2, %r32, %r26; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r40, %r28; or.pred %p5, %p3, %p4; @%p5 bra BB0_22; mad.lo.s32 %r49, %r40, %r27, %r36; mul.lo.s32 %r1, %r49, %r26; add.s32 %r54, %r1, %r32; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r54; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r2, %r32, -1; @%p6 bra BB0_3; rem.s32 %r55, %r2, %r26; add.s32 %r56, %r55, %r26; rem.s32 %r209, %r56, %r26; bra.uni BB0_4; BB0_3: mov.u32 %r57, 0; max.s32 %r209, %r2, %r57; BB0_4: add.s32 %r58, %r209, %r1; cvt.s64.s32 %rd8, %r58; add.s64 %rd9, %rd4, %rd8; ld.global.nc.u8 %rs6, [%rd9]; setp.gt.u16 %p7, %rs6, %rs1; cvt.u32.u16 %r59, %rs6; and.b32 %r60, %r59, 255; cvt.u32.u16 %r61, %rs1; and.b32 %r62, %r61, 255; selp.b32 %r63, %r62, %r60, %p7; selp.b32 %r64, %r60, %r62, %p7; add.s32 %r65, %r64, 1; mul.lo.s32 %r66, %r65, %r64; shr.u32 %r67, %r66, 1; add.s32 %r68, %r67, %r63; cvta.to.global.u64 %rd10, %rd2; mul.wide.s32 %rd11, %r68, 4; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f1, [%rd12]; add.s32 %r6, %r32, 1; @%p6 bra BB0_6; rem.s32 %r73, %r6, %r26; add.s32 %r74, %r73, %r26; rem.s32 %r210, %r74, %r26; bra.uni BB0_7; BB0_6: add.s32 %r75, %r26, -1; min.s32 %r210, %r6, %r75; BB0_7: add.s32 %r76, %r210, %r1; cvt.s64.s32 %rd14, %r76; add.s64 %rd15, %rd4, %rd14; ld.global.nc.u8 %rs9, [%rd15]; setp.gt.u16 %p9, %rs9, %rs1; cvt.u32.u16 %r77, %rs9; and.b32 %r78, %r77, 255; selp.b32 %r81, %r62, %r78, %p9; selp.b32 %r82, %r78, %r62, %p9; 
add.s32 %r83, %r82, 1; mul.lo.s32 %r84, %r83, %r82; shr.u32 %r85, %r84, 1; add.s32 %r86, %r85, %r81; mul.wide.s32 %rd17, %r86, 4; add.s64 %rd18, %rd10, %rd17; ld.global.nc.f32 %f8, [%rd18]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p10, %rs3, 0; add.s32 %r10, %r36, -1; @%p10 bra BB0_9; rem.s32 %r91, %r10, %r27; add.s32 %r92, %r91, %r27; rem.s32 %r211, %r92, %r27; bra.uni BB0_10; BB0_9: mov.u32 %r93, 0; max.s32 %r211, %r10, %r93; BB0_10: mad.lo.s32 %r98, %r40, %r27, %r211; mad.lo.s32 %r103, %r98, %r26, %r32; cvt.s64.s32 %rd20, %r103; add.s64 %rd21, %rd4, %rd20; ld.global.nc.u8 %rs12, [%rd21]; setp.gt.u16 %p11, %rs12, %rs1; cvt.u32.u16 %r104, %rs12; and.b32 %r105, %r104, 255; selp.b32 %r108, %r62, %r105, %p11; selp.b32 %r109, %r105, %r62, %p11; add.s32 %r110, %r109, 1; mul.lo.s32 %r111, %r110, %r109; shr.u32 %r112, %r111, 1; add.s32 %r113, %r112, %r108; mul.wide.s32 %rd23, %r113, 4; add.s64 %rd24, %rd10, %rd23; ld.global.nc.f32 %f10, [%rd24]; add.f32 %f3, %f2, %f10; add.s32 %r14, %r36, 1; @%p10 bra BB0_12; rem.s32 %r118, %r14, %r27; add.s32 %r119, %r118, %r27; rem.s32 %r212, %r119, %r27; bra.uni BB0_13; BB0_12: add.s32 %r120, %r27, -1; min.s32 %r212, %r14, %r120; BB0_13: mad.lo.s32 %r125, %r40, %r27, %r212; mad.lo.s32 %r130, %r125, %r26, %r32; cvt.s64.s32 %rd26, %r130; add.s64 %rd27, %rd4, %rd26; ld.global.nc.u8 %rs16, [%rd27]; setp.gt.u16 %p13, %rs16, %rs1; cvt.u32.u16 %r131, %rs16; and.b32 %r132, %r131, 255; selp.b32 %r135, %r62, %r132, %p13; selp.b32 %r136, %r132, %r62, %p13; add.s32 %r137, %r136, 1; mul.lo.s32 %r138, %r137, %r136; shr.u32 %r139, %r138, 1; add.s32 %r140, %r139, %r135; mul.wide.s32 %rd29, %r140, 4; add.s64 %rd30, %rd10, %rd29; ld.global.nc.f32 %f11, [%rd30]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p14, %r28, 1; @%p14 bra BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p15, %rs4, 0; add.s32 %r18, %r40, -1; @%p15 bra BB0_16; rem.s32 %r145, %r18, %r28; add.s32 %r146, %r145, %r28; rem.s32 %r213, %r146, %r28; 
bra.uni BB0_17; BB0_16: mov.u32 %r147, 0; max.s32 %r213, %r18, %r147; BB0_17: mad.lo.s32 %r152, %r213, %r27, %r36; mad.lo.s32 %r157, %r152, %r26, %r32; cvt.s64.s32 %rd32, %r157; add.s64 %rd33, %rd4, %rd32; ld.global.nc.u8 %rs19, [%rd33]; setp.gt.u16 %p16, %rs19, %rs1; cvt.u32.u16 %r158, %rs19; and.b32 %r159, %r158, 255; selp.b32 %r162, %r62, %r159, %p16; selp.b32 %r163, %r159, %r62, %p16; add.s32 %r164, %r163, 1; mul.lo.s32 %r165, %r164, %r163; shr.u32 %r166, %r165, 1; add.s32 %r167, %r166, %r162; mul.wide.s32 %rd35, %r167, 4; add.s64 %rd36, %rd10, %rd35; ld.global.nc.f32 %f12, [%rd36]; add.f32 %f5, %f14, %f12; add.s32 %r22, %r40, 1; @%p15 bra BB0_19; rem.s32 %r172, %r22, %r28; add.s32 %r173, %r172, %r28; rem.s32 %r214, %r173, %r28; bra.uni BB0_20; BB0_19: add.s32 %r174, %r28, -1; min.s32 %r214, %r22, %r174; BB0_20: mad.lo.s32 %r179, %r214, %r27, %r36; mad.lo.s32 %r184, %r179, %r26, %r32; cvt.s64.s32 %rd38, %r184; add.s64 %rd39, %rd4, %rd38; ld.global.nc.u8 %rs23, [%rd39]; setp.gt.u16 %p18, %rs23, %rs1; cvt.u32.u16 %r185, %rs23; and.b32 %r186, %r185, 255; selp.b32 %r189, %r62, %r186, %p18; selp.b32 %r190, %r186, %r62, %p18; add.s32 %r191, %r190, 1; mul.lo.s32 %r192, %r191, %r190; shr.u32 %r193, %r192, 1; add.s32 %r194, %r193, %r189; mul.wide.s32 %rd41, %r194, 4; add.s64 %rd42, %rd10, %rd41; ld.global.nc.f32 %f13, [%rd42]; add.f32 %f14, %f5, %f13; BB0_21: mad.lo.s32 %r208, %r49, %r26, %r32; cvta.to.global.u64 %rd43, %rd1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd43, %rd44; st.global.f32 [%rd45], %f14; BB0_22: ret; } ` exchangedecode_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl exchangedecode .visible .entry exchangedecode( .param .u64 exchangedecode_param_0, .param .u64 exchangedecode_param_1, .param .u64 exchangedecode_param_2, .param .f32 exchangedecode_param_3, .param .f32 exchangedecode_param_4, .param .f32 exchangedecode_param_5, .param .u32 exchangedecode_param_6, .param .u32 exchangedecode_param_7, .param .u32 
exchangedecode_param_8, .param .u8 exchangedecode_param_9 ) { .reg .pred %p<19>; .reg .b16 %rs<26>; .reg .f32 %f<15>; .reg .b32 %r<215>; .reg .b64 %rd<46>; ld.param.u64 %rd1, [exchangedecode_param_0]; ld.param.u64 %rd2, [exchangedecode_param_1]; ld.param.u64 %rd3, [exchangedecode_param_2]; ld.param.u32 %r26, [exchangedecode_param_6]; ld.param.u32 %r27, [exchangedecode_param_7]; ld.param.u32 %r28, [exchangedecode_param_8]; ld.param.u8 %rs5, [exchangedecode_param_9]; mov.u32 %r29, %ctaid.x; mov.u32 %r30, %ntid.x; mov.u32 %r31, %tid.x; mad.lo.s32 %r32, %r30, %r29, %r31; mov.u32 %r33, %ntid.y; mov.u32 %r34, %ctaid.y; mov.u32 %r35, %tid.y; mad.lo.s32 %r36, %r33, %r34, %r35; mov.u32 %r37, %ntid.z; mov.u32 %r38, %ctaid.z; mov.u32 %r39, %tid.z; mad.lo.s32 %r40, %r37, %r38, %r39; setp.ge.s32 %p1, %r36, %r27; setp.ge.s32 %p2, %r32, %r26; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r40, %r28; or.pred %p5, %p3, %p4; @%p5 bra BB0_22; mad.lo.s32 %r49, %r40, %r27, %r36; mul.lo.s32 %r1, %r49, %r26; add.s32 %r54, %r1, %r32; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r54; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p6, %rs2, 0; add.s32 %r2, %r32, -1; @%p6 bra BB0_3; rem.s32 %r55, %r2, %r26; add.s32 %r56, %r55, %r26; rem.s32 %r209, %r56, %r26; bra.uni BB0_4; BB0_3: mov.u32 %r57, 0; max.s32 %r209, %r2, %r57; BB0_4: add.s32 %r58, %r209, %r1; cvt.s64.s32 %rd8, %r58; add.s64 %rd9, %rd4, %rd8; ld.global.nc.u8 %rs6, [%rd9]; setp.gt.u16 %p7, %rs6, %rs1; cvt.u32.u16 %r59, %rs6; and.b32 %r60, %r59, 255; cvt.u32.u16 %r61, %rs1; and.b32 %r62, %r61, 255; selp.b32 %r63, %r62, %r60, %p7; selp.b32 %r64, %r60, %r62, %p7; add.s32 %r65, %r64, 1; mul.lo.s32 %r66, %r65, %r64; shr.u32 %r67, %r66, 1; add.s32 %r68, %r67, %r63; cvta.to.global.u64 %rd10, %rd2; mul.wide.s32 %rd11, %r68, 4; add.s64 %rd12, %rd10, %rd11; ld.global.nc.f32 %f1, [%rd12]; add.s32 %r6, %r32, 1; @%p6 bra BB0_6; rem.s32 %r73, %r6, %r26; add.s32 %r74, %r73, %r26; rem.s32 %r210, %r74, 
%r26; bra.uni BB0_7; BB0_6: add.s32 %r75, %r26, -1; min.s32 %r210, %r6, %r75; BB0_7: add.s32 %r76, %r210, %r1; cvt.s64.s32 %rd14, %r76; add.s64 %rd15, %rd4, %rd14; ld.global.nc.u8 %rs9, [%rd15]; setp.gt.u16 %p9, %rs9, %rs1; cvt.u32.u16 %r77, %rs9; and.b32 %r78, %r77, 255; selp.b32 %r81, %r62, %r78, %p9; selp.b32 %r82, %r78, %r62, %p9; add.s32 %r83, %r82, 1; mul.lo.s32 %r84, %r83, %r82; shr.u32 %r85, %r84, 1; add.s32 %r86, %r85, %r81; mul.wide.s32 %rd17, %r86, 4; add.s64 %rd18, %rd10, %rd17; ld.global.nc.f32 %f8, [%rd18]; add.f32 %f9, %f1, 0f00000000; add.f32 %f2, %f9, %f8; and.b16 %rs3, %rs5, 2; setp.eq.s16 %p10, %rs3, 0; add.s32 %r10, %r36, -1; @%p10 bra BB0_9; rem.s32 %r91, %r10, %r27; add.s32 %r92, %r91, %r27; rem.s32 %r211, %r92, %r27; bra.uni BB0_10; BB0_9: mov.u32 %r93, 0; max.s32 %r211, %r10, %r93; BB0_10: mad.lo.s32 %r98, %r40, %r27, %r211; mad.lo.s32 %r103, %r98, %r26, %r32; cvt.s64.s32 %rd20, %r103; add.s64 %rd21, %rd4, %rd20; ld.global.nc.u8 %rs12, [%rd21]; setp.gt.u16 %p11, %rs12, %rs1; cvt.u32.u16 %r104, %rs12; and.b32 %r105, %r104, 255; selp.b32 %r108, %r62, %r105, %p11; selp.b32 %r109, %r105, %r62, %p11; add.s32 %r110, %r109, 1; mul.lo.s32 %r111, %r110, %r109; shr.u32 %r112, %r111, 1; add.s32 %r113, %r112, %r108; mul.wide.s32 %rd23, %r113, 4; add.s64 %rd24, %rd10, %rd23; ld.global.nc.f32 %f10, [%rd24]; add.f32 %f3, %f2, %f10; add.s32 %r14, %r36, 1; @%p10 bra BB0_12; rem.s32 %r118, %r14, %r27; add.s32 %r119, %r118, %r27; rem.s32 %r212, %r119, %r27; bra.uni BB0_13; BB0_12: add.s32 %r120, %r27, -1; min.s32 %r212, %r14, %r120; BB0_13: mad.lo.s32 %r125, %r40, %r27, %r212; mad.lo.s32 %r130, %r125, %r26, %r32; cvt.s64.s32 %rd26, %r130; add.s64 %rd27, %rd4, %rd26; ld.global.nc.u8 %rs16, [%rd27]; setp.gt.u16 %p13, %rs16, %rs1; cvt.u32.u16 %r131, %rs16; and.b32 %r132, %r131, 255; selp.b32 %r135, %r62, %r132, %p13; selp.b32 %r136, %r132, %r62, %p13; add.s32 %r137, %r136, 1; mul.lo.s32 %r138, %r137, %r136; shr.u32 %r139, %r138, 1; add.s32 %r140, %r139, %r135; 
mul.wide.s32 %rd29, %r140, 4; add.s64 %rd30, %rd10, %rd29; ld.global.nc.f32 %f11, [%rd30]; add.f32 %f14, %f3, %f11; setp.eq.s32 %p14, %r28, 1; @%p14 bra BB0_21; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p15, %rs4, 0; add.s32 %r18, %r40, -1; @%p15 bra BB0_16; rem.s32 %r145, %r18, %r28; add.s32 %r146, %r145, %r28; rem.s32 %r213, %r146, %r28; bra.uni BB0_17; BB0_16: mov.u32 %r147, 0; max.s32 %r213, %r18, %r147; BB0_17: mad.lo.s32 %r152, %r213, %r27, %r36; mad.lo.s32 %r157, %r152, %r26, %r32; cvt.s64.s32 %rd32, %r157; add.s64 %rd33, %rd4, %rd32; ld.global.nc.u8 %rs19, [%rd33]; setp.gt.u16 %p16, %rs19, %rs1; cvt.u32.u16 %r158, %rs19; and.b32 %r159, %r158, 255; selp.b32 %r162, %r62, %r159, %p16; selp.b32 %r163, %r159, %r62, %p16; add.s32 %r164, %r163, 1; mul.lo.s32 %r165, %r164, %r163; shr.u32 %r166, %r165, 1; add.s32 %r167, %r166, %r162; mul.wide.s32 %rd35, %r167, 4; add.s64 %rd36, %rd10, %rd35; ld.global.nc.f32 %f12, [%rd36]; add.f32 %f5, %f14, %f12; add.s32 %r22, %r40, 1; @%p15 bra BB0_19; rem.s32 %r172, %r22, %r28; add.s32 %r173, %r172, %r28; rem.s32 %r214, %r173, %r28; bra.uni BB0_20; BB0_19: add.s32 %r174, %r28, -1; min.s32 %r214, %r22, %r174; BB0_20: mad.lo.s32 %r179, %r214, %r27, %r36; mad.lo.s32 %r184, %r179, %r26, %r32; cvt.s64.s32 %rd38, %r184; add.s64 %rd39, %rd4, %rd38; ld.global.nc.u8 %rs23, [%rd39]; setp.gt.u16 %p18, %rs23, %rs1; cvt.u32.u16 %r185, %rs23; and.b32 %r186, %r185, 255; selp.b32 %r189, %r62, %r186, %p18; selp.b32 %r190, %r186, %r62, %p18; add.s32 %r191, %r190, 1; mul.lo.s32 %r192, %r191, %r190; shr.u32 %r193, %r192, 1; add.s32 %r194, %r193, %r189; mul.wide.s32 %rd41, %r194, 4; add.s64 %rd42, %rd10, %rd41; ld.global.nc.f32 %f13, [%rd42]; add.f32 %f14, %f5, %f13; BB0_21: mad.lo.s32 %r208, %r49, %r26, %r32; cvta.to.global.u64 %rd43, %rd1; mul.wide.s32 %rd44, %r208, 4; add.s64 %rd45, %rd43, %rd44; st.global.f32 [%rd45], %f14; BB0_22: ret; } ` ) 
mumax3-3.10/cuda/fatbin.go000066400000000000000000000014251371432437400153670ustar00rootroot00000000000000package cuda import ( "log" "github.com/mumax/3/cuda/cu" ) // load PTX code for function name, find highest SM that matches our card. func fatbinLoad(sm map[int]string, fn string) cu.Function { cc := determineCC() return cu.ModuleLoadData(sm[cc]).GetFunction(fn) } var UseCC = 0 func determineCC() int { if UseCC != 0 { return UseCC } for k, _ := range madd2_map { if k > UseCC && ccIsOK(k) { UseCC = k } } if UseCC == 0 { log.Fatalln("\nNo binary for GPU. Your nvidia driver may be out-of-date\n") } return UseCC } // check wheter compute capability cc works func ccIsOK(cc int) (ok bool) { defer func() { if err := recover(); err == cu.ERROR_NO_BINARY_FOR_GPU { ok = false } }() cu.ModuleLoadData(madd2_map[cc]).GetFunction("madd2") return true } mumax3-3.10/cuda/fft3dc2r.go000066400000000000000000000032371371432437400155440ustar00rootroot00000000000000package cuda import ( "fmt" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/cuda/cufft" "github.com/mumax/3/data" "github.com/mumax/3/timer" ) // 3D single-precission real-to-complex FFT plan. type fft3DC2RPlan struct { fftplan size [3]int } // 3D single-precission real-to-complex FFT plan. func newFFT3DC2R(Nx, Ny, Nz int) fft3DC2RPlan { handle := cufft.Plan3d(Nz, Ny, Nx, cufft.C2R) // new xyz swap handle.SetStream(stream0) return fft3DC2RPlan{fftplan{handle}, [3]int{Nx, Ny, Nz}} } // Execute the FFT plan, asynchronous. // src and dst are 3D arrays stored 1D arrays. 
func (p *fft3DC2RPlan) ExecAsync(src, dst *data.Slice) { if Synchronous { Sync() timer.Start("fft") } oksrclen := p.InputLenFloats() if src.Len() != oksrclen { panic(fmt.Errorf("fft size mismatch: expecting src len %v, got %v", oksrclen, src.Len())) } okdstlen := p.OutputLenFloats() if dst.Len() != okdstlen { panic(fmt.Errorf("fft size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len())) } p.handle.ExecC2R(cu.DevicePtr(uintptr(src.DevPtr(0))), cu.DevicePtr(uintptr(dst.DevPtr(0)))) if Synchronous { Sync() timer.Stop("fft") } } // 3D size of the input array. func (p *fft3DC2RPlan) InputSizeFloats() (Nx, Ny, Nz int) { return 2 * (p.size[X]/2 + 1), p.size[Y], p.size[Z] } // 3D size of the output array. func (p *fft3DC2RPlan) OutputSizeFloats() (Nx, Ny, Nz int) { return p.size[X], p.size[Y], p.size[Z] } // Required length of the (1D) input array. func (p *fft3DC2RPlan) InputLenFloats() int { return prod3(p.InputSizeFloats()) } // Required length of the (1D) output array. func (p *fft3DC2RPlan) OutputLenFloats() int { return prod3(p.OutputSizeFloats()) } mumax3-3.10/cuda/fft3dr2c.go000066400000000000000000000033111371432437400155350ustar00rootroot00000000000000package cuda import ( "log" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/cuda/cufft" "github.com/mumax/3/data" "github.com/mumax/3/timer" "github.com/mumax/3/util" ) // 3D single-precission real-to-complex FFT plan. type fft3DR2CPlan struct { fftplan size [3]int } // 3D single-precission real-to-complex FFT plan. func newFFT3DR2C(Nx, Ny, Nz int) fft3DR2CPlan { handle := cufft.Plan3d(Nz, Ny, Nx, cufft.R2C) // new xyz swap handle.SetStream(stream0) return fft3DR2CPlan{fftplan{handle}, [3]int{Nx, Ny, Nz}} } // Execute the FFT plan, asynchronous. // src and dst are 3D arrays stored 1D arrays. 
func (p *fft3DR2CPlan) ExecAsync(src, dst *data.Slice) { if Synchronous { Sync() timer.Start("fft") } util.Argument(src.NComp() == 1 && dst.NComp() == 1) oksrclen := p.InputLen() if src.Len() != oksrclen { log.Panicf("fft size mismatch: expecting src len %v, got %v", oksrclen, src.Len()) } okdstlen := p.OutputLen() if dst.Len() != okdstlen { log.Panicf("fft size mismatch: expecting dst len %v, got %v", okdstlen, dst.Len()) } p.handle.ExecR2C(cu.DevicePtr(uintptr(src.DevPtr(0))), cu.DevicePtr(uintptr(dst.DevPtr(0)))) if Synchronous { Sync() timer.Stop("fft") } } // 3D size of the input array. func (p *fft3DR2CPlan) InputSizeFloats() (Nx, Ny, Nz int) { return p.size[X], p.size[Y], p.size[Z] } // 3D size of the output array. func (p *fft3DR2CPlan) OutputSizeFloats() (Nx, Ny, Nz int) { return 2 * (p.size[X]/2 + 1), p.size[Y], p.size[Z] } // Required length of the (1D) input array. func (p *fft3DR2CPlan) InputLen() int { return prod3(p.InputSizeFloats()) } // Required length of the (1D) output array. func (p *fft3DR2CPlan) OutputLen() int { return prod3(p.OutputSizeFloats()) } mumax3-3.10/cuda/fftplan.go000066400000000000000000000010511371432437400155510ustar00rootroot00000000000000package cuda // INTERNAL // Base implementation for all FFT plans. import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/cuda/cufft" ) // Base implementation for all FFT plans. type fftplan struct { handle cufft.Handle } func prod3(x, y, z int) int { return x * y * z } // Releases all resources associated with the FFT plan. func (p *fftplan) Free() { if p.handle != 0 { p.handle.Destroy() p.handle = 0 } } // Associates a CUDA stream with the FFT plan. func (p *fftplan) setStream(stream cu.Stream) { p.handle.SetStream(stream) } mumax3-3.10/cuda/float3.h000066400000000000000000000034301371432437400151340ustar00rootroot00000000000000#ifndef _FLOAT3_H_ #define _FLOAT3_H_ // This file implements common functions on float3 (vector). 
// Author: Mykola Dvornik, Arne Vansteenkiste

// component-wise sum
inline __device__ float3 operator+(float3 a, float3 b) {
    return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
}

// in-place component-wise sum
inline __device__ void operator+=(float3 &a, float3 b) {
    a.x += b.x;
    a.y += b.y;
    a.z += b.z;
}

// component-wise difference
inline __device__ float3 operator-(float3 a, float3 b) {
    return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
}

// unary minus: negates every component
inline __device__ float3 operator-(float3 a) {
    return make_float3(-a.x, -a.y, -a.z);
}

// in-place component-wise difference
inline __device__ void operator-=(float3 &a, float3 b) {
    a.x -= b.x;
    a.y -= b.y;
    a.z -= b.z;
}

// scalar * vector
inline __device__ float3 operator*(float s, float3 a) {
    return make_float3(s*a.x, s*a.y, s*a.z);
}

// vector * scalar
inline __device__ float3 operator*(float3 a, float s) {
    return make_float3(s*a.x, s*a.y, s*a.z);
}

// in-place scaling by a scalar
inline __device__ void operator*=(float3 &a, float s) {
    a.x *= s;
    a.y *= s;
    a.z *= s;
}

// dot product
inline __device__ float dot(float3 a, float3 b) {
    return a.x * b.x + a.y * b.y + a.z * b.z;
}

// cross product
inline __device__ float3 cross(float3 a, float3 b) {
    return make_float3( a.y*b.z - a.z*b.y,
                        a.z*b.x - a.x*b.z,
                        a.x*b.y - a.y*b.x);
}

// length of the 3-component vector
inline __device__ float len(float3 a) {
    return sqrtf(dot(a,a));
}

// returns a normalized copy of the 3-component vector;
// a vector of length zero yields the zero vector instead of dividing by zero
inline __device__ float3 normalized(float3 a){
    // despite its name, veclen holds the INVERSE length (or 0 for a zero vector)
    float veclen = (len(a) != 0.0f) ? 1.0f / len(a) : 0.0f;
    return veclen * a;
}

// square
inline __device__ float pow2(float x){
    return x * x;
}

// pow(x, 3)
inline __device__ float pow3(float x){
    return x * x * x;
}

// pow(x, 4), computed as (x*x)*(x*x)
inline __device__ float pow4(float x){
    float s = x*x;
    return s*s;
}

// true when the squared norm of m compares equal to zero
#define is0(m) ( dot(m, m) == 0.0f )

#endif
mumax3-3.10/cuda/init.go000066400000000000000000000034621371432437400150720ustar00rootroot00000000000000
// Package cuda provides GPU interaction
package cuda

import (
	"fmt"
	"log"
	"runtime"

	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/util"
)

var (
	DriverVersion int        // cuda driver version
	DevName       string     // GPU name
	TotalMem      int64      // total GPU memory
	GPUInfo       string     // Human-readable GPU description
	Synchronous   bool       // for debug: synchronize stream0 at every kernel launch
	cudaCtx       cu.Context // global CUDA context
	cudaCC        int        // compute capability (used for fatbin)
)

// Locks to an OS thread and initializes CUDA for that thread.
// gpu is the device number handed to cu.Device. The call is a no-op when a
// context already exists. It fills in the package-level DriverVersion,
// DevName, TotalMem, GPUInfo and cudaCC variables, and exits the program
// when the device's major compute capability is below 2.
func Init(gpu int) {
	if cudaCtx != 0 {
		return // needed for tests
	}

	runtime.LockOSThread()
	tryCuInit()
	dev := cu.Device(gpu)
	cudaCtx = cu.CtxCreate(cu.CTX_SCHED_YIELD, dev)
	cudaCtx.SetCurrent()

	// Compute capability M.m is stored as the two-digit number 10*M+m
	// (the same form used as key in the per-kernel PTX maps).
	M, m := dev.ComputeCapability()
	cudaCC = 10*M + m
	DriverVersion = cu.Version()
	DevName = dev.Name()
	TotalMem = dev.TotalMem()
	// DriverVersion is printed as major.minor; presumably it is encoded as
	// 1000*major + 10*minor -- confirm against the CUDA driver API.
	GPUInfo = fmt.Sprintf("%s(%dMB), CUDA Driver %d.%d, cc=%d.%d", DevName, (TotalMem)/(1024*1024), DriverVersion/1000, (DriverVersion%1000)/10, M, m)

	if M < 2 {
		log.Fatalln("GPU has insufficient compute capability, need 2.0 or higher.")
	}
	if Synchronous {
		log.Println("DEBUG: synchronized CUDA calls")
	}

	// test PTX load so that we can catch CUDA_ERROR_NO_BINARY_FOR_GPU early
	fatbinLoad(madd2_map, "madd2")
}

// cu.Init(), but error is fatal and does not dump stack.
func tryCuInit() {
	defer func() {
		// A panic raised by cu.Init is captured here instead of unwinding
		// with a stack trace.
		err := recover()
		if err == cu.ERROR_UNKNOWN {
			log.Println("\n Try running: sudo nvidia-modprobe -u \n")
		}
		// NOTE(review): this is also reached when nothing panicked (err is nil);
		// presumably util.FatalErr ignores nil -- confirm in package util.
		util.FatalErr(err)
	}()
	cu.Init(0)
}

// Global stream used for everything
const stream0 = cu.Stream(0)

// Synchronize the global stream
// This is called before and after all memcopy operations between host and device.
func Sync() {
	stream0.Synchronize()
}
mumax3-3.10/cuda/init_test.go000066400000000000000000000002651371432437400161270ustar00rootroot00000000000000
package cuda

import (
	"github.com/mumax/3/cuda/cu"
)

// needed for all other tests.
// Initializes the CUDA driver, creates a context on device 0 and makes it current.
func init() {
	cu.Init(0)
	ctx := cu.CtxCreate(cu.CTX_SCHED_AUTO, 0)
	cu.CtxSetCurrent(ctx)
}
mumax3-3.10/cuda/kernmulc.cu000066400000000000000000000007631371432437400157520ustar00rootroot00000000000000
// Point-wise complex multiplication, in place: fftM[i] = fftM[i] * fftK[i].
// Both arrays hold interleaved (real, imaginary) float pairs.
// One thread handles the complex element at (ix, iy) of an Nx x Ny grid.
extern "C" __global__ void
kernmulC(float* __restrict__  fftM,  float* __restrict__  fftK, int Nx, int Ny)
{
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;

    // threads outside the Nx x Ny grid have nothing to do
    if(ix>= Nx || iy>=Ny) {
        return;
    }

    int I = iy*Nx + ix; // index of the complex element
    int e = 2 * I;      // float index of its real part; imaginary part is at e+1

    float reM = fftM[e  ];
    float imM = fftM[e+1];
    float reK = fftK[e  ];
    float imK = fftK[e+1];

    // (reM + i imM) * (reK + i imK)
    fftM[e  ] = reM * reK - imM * imK;
    fftM[e+1] = reM * imK + imM * reK;
}
mumax3-3.10/cuda/kernmulc_wrapper.go000066400000000000000000000461221371432437400175070ustar00rootroot00000000000000
package cuda

/*
 THIS FILE IS AUTO-GENERATED BY CUDA2GO.
 EDITING IS FUTILE.
*/

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// CUDA handle for kernmulC kernel
// (zero until the first call of k_kernmulC_async loads it)
var kernmulC_code cu.Function

// Stores the arguments for kernmulC kernel invocation
type kernmulC_args_t struct {
	arg_fftM unsafe.Pointer
	arg_fftK unsafe.Pointer
	arg_Nx   int
	arg_Ny   int
	argptr   [4]unsafe.Pointer // argptr[i] points at the i-th arg_ field above
	sync.Mutex                 // held by k_kernmulC_async while it fills and uses the fields
}

// Stores the arguments for kernmulC kernel invocation
var kernmulC_args kernmulC_args_t

func init() {
	// CUDA driver kernel call wants pointers to arguments, set them up once.
	kernmulC_args.argptr[0] = unsafe.Pointer(&kernmulC_args.arg_fftM)
	kernmulC_args.argptr[1] = unsafe.Pointer(&kernmulC_args.arg_fftK)
	kernmulC_args.argptr[2] = unsafe.Pointer(&kernmulC_args.arg_Nx)
	kernmulC_args.argptr[3] = unsafe.Pointer(&kernmulC_args.arg_Ny)
}

// Wrapper for kernmulC CUDA kernel, asynchronous.
// The kernel is launched on stream0 with the grid and block sizes from cfg.
// The argument struct is shared, so its mutex is held until this function
// returns (including the debug Sync below).
func k_kernmulC_async(fftM unsafe.Pointer, fftK unsafe.Pointer, Nx int, Ny int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("kernmulC")
	}

	kernmulC_args.Lock()
	defer kernmulC_args.Unlock()

	// Load the kernel on first use.
	if kernmulC_code == 0 {
		kernmulC_code = fatbinLoad(kernmulC_map, "kernmulC")
	}

	kernmulC_args.arg_fftM = fftM
	kernmulC_args.arg_fftK = fftK
	kernmulC_args.arg_Nx = Nx
	kernmulC_args.arg_Ny = Ny

	args := kernmulC_args.argptr[:]
	// NOTE(review): the literal 0 is presumably the dynamic shared-memory size -- confirm in package cu.
	cu.LaunchKernel(kernmulC_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("kernmulC")
	}
}

// maps compute capability on PTX code for kernmulC kernel.
// Keys are two-digit compute capabilities (e.g. 75); key 0 maps to the empty string.
var kernmulC_map = map[int]string{0: "",
	30: kernmulC_ptx_30,
	32: kernmulC_ptx_32,
	35: kernmulC_ptx_35,
	37: kernmulC_ptx_37,
	50: kernmulC_ptx_50,
	52: kernmulC_ptx_52,
	53: kernmulC_ptx_53,
	60: kernmulC_ptx_60,
	61: kernmulC_ptx_61,
	62: kernmulC_ptx_62,
	70: kernmulC_ptx_70,
	72: kernmulC_ptx_72,
	75: kernmulC_ptx_75}

// kernmulC PTX code for various compute capabilities.
const ( kernmulC_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; cvta.to.global.u64 %rd4, %rd1; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; add.s64 %rd7, %rd3, %rd5; ld.global.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; ld.global.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd6+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd6], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd6+4], %f9; BB0_2: ret; } ` kernmulC_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra 
BB0_2; cvta.to.global.u64 %rd3, %rd2; cvta.to.global.u64 %rd4, %rd1; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; add.s64 %rd7, %rd3, %rd5; ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd6+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd6], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd6+4], %f9; BB0_2: ret; } ` kernmulC_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; cvta.to.global.u64 %rd4, %rd1; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; add.s64 %rd7, %rd3, %rd5; ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd6+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd6], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd6+4], %f9; BB0_2: ret; } ` kernmulC_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 
kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; cvta.to.global.u64 %rd4, %rd1; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; add.s64 %rd7, %rd3, %rd5; ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd6+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd6], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd6+4], %f9; BB0_2: ret; } ` kernmulC_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; cvta.to.global.u64 %rd4, %rd1; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; add.s64 %rd7, %rd3, %rd5; 
ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd6+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd6], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd6+4], %f9; BB0_2: ret; } ` kernmulC_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; cvta.to.global.u64 %rd4, %rd1; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; add.s64 %rd7, %rd3, %rd5; ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd6+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd6], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd6+4], %f9; BB0_2: ret; } ` kernmulC_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; 
ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; cvta.to.global.u64 %rd4, %rd1; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; add.s64 %rd7, %rd3, %rd5; ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd6+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd6], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd6+4], %f9; BB0_2: ret; } ` kernmulC_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; cvta.to.global.u64 %rd4, %rd1; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; add.s64 %rd7, %rd3, %rd5; ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd6+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd6], %f7; 
mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd6+4], %f9; BB0_2: ret; } ` kernmulC_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; cvta.to.global.u64 %rd4, %rd1; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; add.s64 %rd7, %rd3, %rd5; ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd6+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd6], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd6+4], %f9; BB0_2: ret; } ` kernmulC_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, 
%r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; cvta.to.global.u64 %rd4, %rd1; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; add.s64 %rd7, %rd3, %rd5; ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd6+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd6], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd6+4], %f9; BB0_2: ret; } ` kernmulC_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; cvta.to.global.u64 %rd4, %rd1; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; add.s64 %rd7, %rd3, %rd5; ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd6+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd6], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd6+4], %f9; BB0_2: ret; } ` kernmulC_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 
kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; cvta.to.global.u64 %rd4, %rd1; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; add.s64 %rd7, %rd3, %rd5; ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd6+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd6], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd6+4], %f9; BB0_2: ret; } ` kernmulC_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl kernmulC .visible .entry kernmulC( .param .u64 kernmulC_param_0, .param .u64 kernmulC_param_1, .param .u32 kernmulC_param_2, .param .u32 kernmulC_param_3 ) { .reg .pred %p<4>; .reg .f32 %f<10>; .reg .b32 %r<13>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [kernmulC_param_0]; ld.param.u64 %rd2, [kernmulC_param_1]; ld.param.u32 %r3, [kernmulC_param_2]; ld.param.u32 %r4, [kernmulC_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; cvta.to.global.u64 %rd4, %rd1; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; 
mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd4, %rd5; add.s64 %rd7, %rd3, %rd5; ld.global.nc.f32 %f1, [%rd7]; ld.global.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd7+4]; ld.global.f32 %f5, [%rd6+4]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; st.global.f32 [%rd6], %f7; mul.f32 %f8, %f2, %f4; fma.rn.f32 %f9, %f5, %f1, %f8; st.global.f32 [%rd6+4], %f9; BB0_2: ret; } ` ) mumax3-3.10/cuda/kernmulrsymm2dxy.cu000066400000000000000000000021271371432437400175020ustar00rootroot00000000000000// 2D XY (in-plane) micromagnetic kernel multiplication: // |Mx| = |Kxx Kxy| * |Mx| // |My| |Kyx Kyy| |My| // Using the same symmetries as kernmulrsymm3d.cu extern "C" __global__ void kernmulRSymm2Dxy(float* __restrict__ fftMx, float* __restrict__ fftMy, float* __restrict__ fftKxx, float* __restrict__ fftKyy, float* __restrict__ fftKxy, int Nx, int Ny) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; if(ix>= Nx || iy>=Ny) { return; } int I = iy*Nx + ix; int e = 2 * I; float reMx = fftMx[e ]; float imMx = fftMx[e+1]; float reMy = fftMy[e ]; float imMy = fftMy[e+1]; // symmetry factor float fxy = 1.0f; if (iy > Ny/2) { iy = Ny-iy; fxy = -fxy; } I = iy*Nx + ix; float Kxx = fftKxx[I]; float Kyy = fftKyy[I]; float Kxy = fxy * fftKxy[I]; fftMx[e ] = reMx * Kxx + reMy * Kxy; fftMx[e+1] = imMx * Kxx + imMy * Kxy; fftMy[e ] = reMx * Kxy + reMy * Kyy; fftMy[e+1] = imMx * Kxy + imMy * Kyy; } mumax3-3.10/cuda/kernmulrsymm2dxy_wrapper.go000066400000000000000000001043661371432437400212500ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// CUDA handle for kernmulRSymm2Dxy kernel
// (zero until the first call of k_kernmulRSymm2Dxy_async loads it)
var kernmulRSymm2Dxy_code cu.Function

// Stores the arguments for kernmulRSymm2Dxy kernel invocation
type kernmulRSymm2Dxy_args_t struct {
	arg_fftMx  unsafe.Pointer
	arg_fftMy  unsafe.Pointer
	arg_fftKxx unsafe.Pointer
	arg_fftKyy unsafe.Pointer
	arg_fftKxy unsafe.Pointer
	arg_Nx     int
	arg_Ny     int
	argptr     [7]unsafe.Pointer // argptr[i] points at the i-th arg_ field above
	sync.Mutex                   // held by k_kernmulRSymm2Dxy_async while it fills and uses the fields
}

// Stores the arguments for kernmulRSymm2Dxy kernel invocation
var kernmulRSymm2Dxy_args kernmulRSymm2Dxy_args_t

func init() {
	// CUDA driver kernel call wants pointers to arguments, set them up once.
	kernmulRSymm2Dxy_args.argptr[0] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_fftMx)
	kernmulRSymm2Dxy_args.argptr[1] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_fftMy)
	kernmulRSymm2Dxy_args.argptr[2] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_fftKxx)
	kernmulRSymm2Dxy_args.argptr[3] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_fftKyy)
	kernmulRSymm2Dxy_args.argptr[4] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_fftKxy)
	kernmulRSymm2Dxy_args.argptr[5] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_Nx)
	kernmulRSymm2Dxy_args.argptr[6] = unsafe.Pointer(&kernmulRSymm2Dxy_args.arg_Ny)
}

// Wrapper for kernmulRSymm2Dxy CUDA kernel, asynchronous.
func k_kernmulRSymm2Dxy_async(fftMx unsafe.Pointer, fftMy unsafe.Pointer, fftKxx unsafe.Pointer, fftKyy unsafe.Pointer, fftKxy unsafe.Pointer, Nx int, Ny int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("kernmulRSymm2Dxy")
	}

	// The argument struct is shared between calls; keep it locked until
	// this function returns (including the debug Sync below).
	kernmulRSymm2Dxy_args.Lock()
	defer kernmulRSymm2Dxy_args.Unlock()

	// Load the kernel on first use.
	if kernmulRSymm2Dxy_code == 0 {
		kernmulRSymm2Dxy_code = fatbinLoad(kernmulRSymm2Dxy_map, "kernmulRSymm2Dxy")
	}

	kernmulRSymm2Dxy_args.arg_fftMx = fftMx
	kernmulRSymm2Dxy_args.arg_fftMy = fftMy
	kernmulRSymm2Dxy_args.arg_fftKxx = fftKxx
	kernmulRSymm2Dxy_args.arg_fftKyy = fftKyy
	kernmulRSymm2Dxy_args.arg_fftKxy = fftKxy
	kernmulRSymm2Dxy_args.arg_Nx = Nx
	kernmulRSymm2Dxy_args.arg_Ny = Ny

	args := kernmulRSymm2Dxy_args.argptr[:]
	// Launched on stream0 with the grid and block sizes from cfg.
	// NOTE(review): the literal 0 is presumably the dynamic shared-memory size -- confirm in package cu.
	cu.LaunchKernel(kernmulRSymm2Dxy_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("kernmulRSymm2Dxy")
	}
}

// maps compute capability on PTX code for kernmulRSymm2Dxy kernel.
// Keys are two-digit compute capabilities (e.g. 75); key 0 maps to the empty string.
var kernmulRSymm2Dxy_map = map[int]string{0: "",
	30: kernmulRSymm2Dxy_ptx_30,
	32: kernmulRSymm2Dxy_ptx_32,
	35: kernmulRSymm2Dxy_ptx_35,
	37: kernmulRSymm2Dxy_ptx_37,
	50: kernmulRSymm2Dxy_ptx_50,
	52: kernmulRSymm2Dxy_ptx_52,
	53: kernmulRSymm2Dxy_ptx_53,
	60: kernmulRSymm2Dxy_ptx_60,
	61: kernmulRSymm2Dxy_ptx_61,
	62: kernmulRSymm2Dxy_ptx_62,
	70: kernmulRSymm2Dxy_ptx_70,
	72: kernmulRSymm2Dxy_ptx_72,
	75: kernmulRSymm2Dxy_ptx_75}

// kernmulRSymm2Dxy PTX code for various compute capabilities.
const ( kernmulRSymm2Dxy_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, [kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd6, %rd2; cvta.to.global.u64 %rd7, %rd1; cvta.to.global.u64 %rd8, %rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd9, %r12, 4; add.s64 %rd10, %rd7, %rd9; ld.global.f32 %f1, [%rd10+4]; add.s64 %rd11, %rd6, %rd9; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd8, %rd12; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd10]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 
%f11, %f8, %f7, %f10; st.global.f32 [%rd10], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd10+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; BB0_2: ret; } ` kernmulRSymm2Dxy_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, [kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd6, %rd2; cvta.to.global.u64 %rd7, %rd1; cvta.to.global.u64 %rd8, %rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd9, %r12, 4; add.s64 %rd10, %rd7, %rd9; ld.global.f32 %f1, [%rd10+4]; add.s64 %rd11, %rd6, %rd9; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd8, %rd12; 
cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd10]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 %f11, %f8, %f7, %f10; st.global.f32 [%rd10], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd10+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; BB0_2: ret; } ` kernmulRSymm2Dxy_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, [kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd6, %rd2; cvta.to.global.u64 %rd7, %rd1; cvta.to.global.u64 %rd8, %rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd9, %r12, 4; add.s64 %rd10, %rd7, %rd9; ld.global.f32 %f1, [%rd10+4]; add.s64 %rd11, 
%rd6, %rd9; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd8, %rd12; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd10]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 %f11, %f8, %f7, %f10; st.global.f32 [%rd10], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd10+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; BB0_2: ret; } ` kernmulRSymm2Dxy_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, [kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 
%p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd6, %rd2; cvta.to.global.u64 %rd7, %rd1; cvta.to.global.u64 %rd8, %rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd9, %r12, 4; add.s64 %rd10, %rd7, %rd9; ld.global.f32 %f1, [%rd10+4]; add.s64 %rd11, %rd6, %rd9; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd8, %rd12; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd10]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 %f11, %f8, %f7, %f10; st.global.f32 [%rd10], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd10+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; BB0_2: ret; } ` kernmulRSymm2Dxy_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, 
[kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd6, %rd2; cvta.to.global.u64 %rd7, %rd1; cvta.to.global.u64 %rd8, %rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd9, %r12, 4; add.s64 %rd10, %rd7, %rd9; ld.global.f32 %f1, [%rd10+4]; add.s64 %rd11, %rd6, %rd9; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd8, %rd12; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd10]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 %f11, %f8, %f7, %f10; st.global.f32 [%rd10], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd10+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; BB0_2: ret; } ` kernmulRSymm2Dxy_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 
kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, [kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd6, %rd2; cvta.to.global.u64 %rd7, %rd1; cvta.to.global.u64 %rd8, %rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd9, %r12, 4; add.s64 %rd10, %rd7, %rd9; ld.global.f32 %f1, [%rd10+4]; add.s64 %rd11, %rd6, %rd9; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd8, %rd12; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd10]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 %f11, %f8, %f7, %f10; st.global.f32 [%rd10], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd10+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; BB0_2: ret; } ` 
kernmulRSymm2Dxy_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, [kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd6, %rd2; cvta.to.global.u64 %rd7, %rd1; cvta.to.global.u64 %rd8, %rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd9, %r12, 4; add.s64 %rd10, %rd7, %rd9; ld.global.f32 %f1, [%rd10+4]; add.s64 %rd11, %rd6, %rd9; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd8, %rd12; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd10]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 
%f11, %f8, %f7, %f10; st.global.f32 [%rd10], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd10+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; BB0_2: ret; } ` kernmulRSymm2Dxy_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, [kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd6, %rd2; cvta.to.global.u64 %rd7, %rd1; cvta.to.global.u64 %rd8, %rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd9, %r12, 4; add.s64 %rd10, %rd7, %rd9; ld.global.f32 %f1, [%rd10+4]; add.s64 %rd11, %rd6, %rd9; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd8, %rd12; 
cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd10]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 %f11, %f8, %f7, %f10; st.global.f32 [%rd10], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd10+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; BB0_2: ret; } ` kernmulRSymm2Dxy_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, [kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd6, %rd2; cvta.to.global.u64 %rd7, %rd1; cvta.to.global.u64 %rd8, %rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd9, %r12, 4; add.s64 %rd10, %rd7, %rd9; ld.global.f32 %f1, [%rd10+4]; add.s64 %rd11, 
%rd6, %rd9; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd8, %rd12; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd10]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 %f11, %f8, %f7, %f10; st.global.f32 [%rd10], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd10+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; BB0_2: ret; } ` kernmulRSymm2Dxy_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, [kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 
%p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd6, %rd2; cvta.to.global.u64 %rd7, %rd1; cvta.to.global.u64 %rd8, %rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd9, %r12, 4; add.s64 %rd10, %rd7, %rd9; ld.global.f32 %f1, [%rd10+4]; add.s64 %rd11, %rd6, %rd9; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd8, %rd12; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd10]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 %f11, %f8, %f7, %f10; st.global.f32 [%rd10], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd10+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; BB0_2: ret; } ` kernmulRSymm2Dxy_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, 
[kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd6, %rd2; cvta.to.global.u64 %rd7, %rd1; cvta.to.global.u64 %rd8, %rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd9, %r12, 4; add.s64 %rd10, %rd7, %rd9; ld.global.f32 %f1, [%rd10+4]; add.s64 %rd11, %rd6, %rd9; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd8, %rd12; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd10]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 %f11, %f8, %f7, %f10; st.global.f32 [%rd10], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd10+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; BB0_2: ret; } ` kernmulRSymm2Dxy_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 
kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, [kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd6, %rd2; cvta.to.global.u64 %rd7, %rd1; cvta.to.global.u64 %rd8, %rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd9, %r12, 4; add.s64 %rd10, %rd7, %rd9; ld.global.f32 %f1, [%rd10+4]; add.s64 %rd11, %rd6, %rd9; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd8, %rd12; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd10]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 %f11, %f8, %f7, %f10; st.global.f32 [%rd10], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd10+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; BB0_2: ret; } ` 
kernmulRSymm2Dxy_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl kernmulRSymm2Dxy .visible .entry kernmulRSymm2Dxy( .param .u64 kernmulRSymm2Dxy_param_0, .param .u64 kernmulRSymm2Dxy_param_1, .param .u64 kernmulRSymm2Dxy_param_2, .param .u64 kernmulRSymm2Dxy_param_3, .param .u64 kernmulRSymm2Dxy_param_4, .param .u32 kernmulRSymm2Dxy_param_5, .param .u32 kernmulRSymm2Dxy_param_6 ) { .reg .pred %p<5>; .reg .f32 %f<18>; .reg .b32 %r<19>; .reg .b64 %rd<18>; ld.param.u64 %rd1, [kernmulRSymm2Dxy_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dxy_param_1]; ld.param.u64 %rd3, [kernmulRSymm2Dxy_param_2]; ld.param.u64 %rd4, [kernmulRSymm2Dxy_param_3]; ld.param.u64 %rd5, [kernmulRSymm2Dxy_param_4]; ld.param.u32 %r3, [kernmulRSymm2Dxy_param_5]; ld.param.u32 %r4, [kernmulRSymm2Dxy_param_6]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd6, %rd2; cvta.to.global.u64 %rd7, %rd1; cvta.to.global.u64 %rd8, %rd3; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd9, %r12, 4; add.s64 %rd10, %rd7, %rd9; ld.global.f32 %f1, [%rd10+4]; add.s64 %rd11, %rd6, %rd9; ld.global.f32 %f2, [%rd11+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; selp.f32 %f3, 0fBF800000, 0f3F800000, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd12, %r18, 4; add.s64 %rd13, %rd8, %rd12; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f4, [%rd15]; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f5, [%rd17]; mul.f32 %f6, %f3, %f5; ld.global.nc.f32 %f7, [%rd13]; ld.global.f32 %f8, [%rd10]; ld.global.f32 %f9, [%rd11]; mul.f32 %f10, %f9, %f6; fma.rn.f32 
%f11, %f8, %f7, %f10; st.global.f32 [%rd10], %f11; mul.f32 %f12, %f2, %f6; fma.rn.f32 %f13, %f1, %f7, %f12; st.global.f32 [%rd10+4], %f13; mul.f32 %f14, %f8, %f6; fma.rn.f32 %f15, %f9, %f4, %f14; st.global.f32 [%rd11], %f15; mul.f32 %f16, %f1, %f6; fma.rn.f32 %f17, %f2, %f4, %f16; st.global.f32 [%rd11+4], %f17; BB0_2: ret; } ` ) mumax3-3.10/cuda/kernmulrsymm2dz.cu000066400000000000000000000012331371432437400173100ustar00rootroot00000000000000// 2D Z (out-of-plane only) micromagnetic kernel multiplication: // Mz = Kzz * Mz // Using the same symmetries as kernmulrsymm3d.cu extern "C" __global__ void kernmulRSymm2Dz(float* __restrict__ fftMz, float* __restrict__ fftKzz, int Nx, int Ny) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; if(ix>= Nx || iy>=Ny) { return; } int I = iy*Nx + ix; int e = 2 * I; float reMz = fftMz[e ]; float imMz = fftMz[e+1]; if (iy > Ny/2) { iy = Ny-iy; } I = iy*Nx + ix; float Kzz = fftKzz[I]; fftMz[e ] = reMz * Kzz; fftMz[e+1] = imMz * Kzz; } mumax3-3.10/cuda/kernmulrsymm2dz_wrapper.go000066400000000000000000000535001371432437400210520ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for kernmulRSymm2Dz kernel var kernmulRSymm2Dz_code cu.Function // Stores the arguments for kernmulRSymm2Dz kernel invocation type kernmulRSymm2Dz_args_t struct { arg_fftMz unsafe.Pointer arg_fftKzz unsafe.Pointer arg_Nx int arg_Ny int argptr [4]unsafe.Pointer sync.Mutex } // Stores the arguments for kernmulRSymm2Dz kernel invocation var kernmulRSymm2Dz_args kernmulRSymm2Dz_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
kernmulRSymm2Dz_args.argptr[0] = unsafe.Pointer(&kernmulRSymm2Dz_args.arg_fftMz)
	kernmulRSymm2Dz_args.argptr[1] = unsafe.Pointer(&kernmulRSymm2Dz_args.arg_fftKzz)
	kernmulRSymm2Dz_args.argptr[2] = unsafe.Pointer(&kernmulRSymm2Dz_args.arg_Nx)
	kernmulRSymm2Dz_args.argptr[3] = unsafe.Pointer(&kernmulRSymm2Dz_args.arg_Ny)
}

// Wrapper for kernmulRSymm2Dz CUDA kernel, asynchronous.
// k_kernmulRSymm2Dz_async locks the shared argument set, loads the kernel on
// first use, stores the given arguments and launches the kernel on stream0
// with the grid and block dimensions taken from cfg. When Synchronous is set,
// the launch is bracketed by Sync() calls and timed as "kernmulRSymm2Dz".
func k_kernmulRSymm2Dz_async(fftMz, fftKzz unsafe.Pointer, Nx, Ny int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("kernmulRSymm2Dz")
	}

	// Held until return, so the stored arguments cannot be overwritten by
	// another caller before the launch has been issued.
	shared := &kernmulRSymm2Dz_args
	shared.Lock()
	defer shared.Unlock()

	// A zero handle means the kernel has not been loaded yet.
	if kernmulRSymm2Dz_code == 0 {
		kernmulRSymm2Dz_code = fatbinLoad(kernmulRSymm2Dz_map, "kernmulRSymm2Dz")
	}

	shared.arg_fftMz, shared.arg_fftKzz = fftMz, fftKzz
	shared.arg_Nx, shared.arg_Ny = Nx, Ny

	params := shared.argptr[:]
	cu.LaunchKernel(
		kernmulRSymm2Dz_code,
		cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z,
		cfg.Block.X, cfg.Block.Y, cfg.Block.Z,
		0, stream0, params,
	)

	if Synchronous { // debug
		Sync()
		timer.Stop("kernmulRSymm2Dz")
	}
}

// maps compute capability on PTX code for kernmulRSymm2Dz kernel.
var kernmulRSymm2Dz_map = map[int]string{
	0:  "",
	30: kernmulRSymm2Dz_ptx_30,
	32: kernmulRSymm2Dz_ptx_32,
	35: kernmulRSymm2Dz_ptx_35,
	37: kernmulRSymm2Dz_ptx_37,
	50: kernmulRSymm2Dz_ptx_50,
	52: kernmulRSymm2Dz_ptx_52,
	53: kernmulRSymm2Dz_ptx_53,
	60: kernmulRSymm2Dz_ptx_60,
	61: kernmulRSymm2Dz_ptx_61,
	62: kernmulRSymm2Dz_ptx_62,
	70: kernmulRSymm2Dz_ptx_70,
	72: kernmulRSymm2Dz_ptx_72,
	75: kernmulRSymm2Dz_ptx_75,
}

// kernmulRSymm2Dz PTX code for various compute capabilities.
const ( kernmulRSymm2Dz_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd3, %rd1; cvta.to.global.u64 %rd4, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd3, %rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd4, %rd7; ld.global.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; BB0_2: ret; } ` kernmulRSymm2Dz_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; 
mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd3, %rd1; cvta.to.global.u64 %rd4, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd3, %rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd4, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; BB0_2: ret; } ` kernmulRSymm2Dz_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd3, %rd1; cvta.to.global.u64 %rd4, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd3, %rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, 
%r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd4, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; BB0_2: ret; } ` kernmulRSymm2Dz_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd3, %rd1; cvta.to.global.u64 %rd4, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd3, %rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd4, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; BB0_2: ret; } ` kernmulRSymm2Dz_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 
kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd3, %rd1; cvta.to.global.u64 %rd4, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd3, %rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd4, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; BB0_2: ret; } ` kernmulRSymm2Dz_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, 
%r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd3, %rd1; cvta.to.global.u64 %rd4, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd3, %rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd4, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; BB0_2: ret; } ` kernmulRSymm2Dz_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd3, %rd1; cvta.to.global.u64 %rd4, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd3, %rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd4, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, 
%f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; BB0_2: ret; } ` kernmulRSymm2Dz_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd3, %rd1; cvta.to.global.u64 %rd4, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd3, %rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd4, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; BB0_2: ret; } ` kernmulRSymm2Dz_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; 
ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd3, %rd1; cvta.to.global.u64 %rd4, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd3, %rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd4, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; BB0_2: ret; } ` kernmulRSymm2Dz_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd3, %rd1; cvta.to.global.u64 %rd4, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd3, %rd5; 
ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd4, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; BB0_2: ret; } ` kernmulRSymm2Dz_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd3, %rd1; cvta.to.global.u64 %rd4, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd3, %rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd4, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; BB0_2: ret; } ` kernmulRSymm2Dz_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry 
kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd3, %rd1; cvta.to.global.u64 %rd4, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd3, %rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd4, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; BB0_2: ret; } ` kernmulRSymm2Dz_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl kernmulRSymm2Dz .visible .entry kernmulRSymm2Dz( .param .u64 kernmulRSymm2Dz_param_0, .param .u64 kernmulRSymm2Dz_param_1, .param .u32 kernmulRSymm2Dz_param_2, .param .u32 kernmulRSymm2Dz_param_3 ) { .reg .pred %p<5>; .reg .f32 %f<6>; .reg .b32 %r<19>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [kernmulRSymm2Dz_param_0]; ld.param.u64 %rd2, [kernmulRSymm2Dz_param_1]; ld.param.u32 %r3, [kernmulRSymm2Dz_param_2]; ld.param.u32 %r4, [kernmulRSymm2Dz_param_3]; mov.u32 %r5, %ntid.x; mov.u32 %r6, %ctaid.x; mov.u32 %r7, %tid.x; mad.lo.s32 %r1, %r5, %r6, %r7; mov.u32 %r8, %ntid.y; mov.u32 %r9, 
%ctaid.y; mov.u32 %r10, %tid.y; mad.lo.s32 %r2, %r8, %r9, %r10; setp.ge.s32 %p1, %r2, %r4; setp.ge.s32 %p2, %r1, %r3; or.pred %p3, %p1, %p2; @%p3 bra BB0_2; cvta.to.global.u64 %rd3, %rd1; cvta.to.global.u64 %rd4, %rd2; mad.lo.s32 %r11, %r2, %r3, %r1; shl.b32 %r12, %r11, 1; mul.wide.s32 %rd5, %r12, 4; add.s64 %rd6, %rd3, %rd5; ld.global.f32 %f1, [%rd6+4]; shr.u32 %r13, %r4, 31; add.s32 %r14, %r4, %r13; shr.s32 %r15, %r14, 1; setp.gt.s32 %p4, %r2, %r15; sub.s32 %r16, %r4, %r2; selp.b32 %r17, %r16, %r2, %p4; mad.lo.s32 %r18, %r17, %r3, %r1; mul.wide.s32 %rd7, %r18, 4; add.s64 %rd8, %rd4, %rd7; ld.global.nc.f32 %f2, [%rd8]; ld.global.f32 %f3, [%rd6]; mul.f32 %f4, %f3, %f2; st.global.f32 [%rd6], %f4; mul.f32 %f5, %f1, %f2; st.global.f32 [%rd6+4], %f5; BB0_2: ret; } ` ) mumax3-3.10/cuda/kernmulrsymm3d.cu000066400000000000000000000047021371432437400171230ustar00rootroot00000000000000// 3D micromagnetic kernel multiplication: // // |Mx| |Kxx Kxy Kxz| |Mx| // |My| = |Kxy Kyy Kyz| * |My| // |Mz| |Kxz Kyz Kzz| |Mz| // // ~kernel has mirror symmetry along Y and Z-axis, // apart form first row, // and is only stored (roughly) half: // // K11, K22, K02: // xxxxx // aaaaa // bbbbb // .... // bbbbb // aaaaa // // K12: // xxxxx // aaaaa // bbbbb // ... 
// -bbbb
// -aaaa

// kernmulRSymm3D multiplies the FFT'ed magnetization by the symmetric 3x3
// kernel, point by point, and overwrites the magnetization with the result.
//
// Each thread handles one (ix, iy, iz) element; threads outside the
// Nx x Ny x Nz range return without touching memory.
//
//   fftMx, fftMy, fftMz   in/out: complex data, read and written as
//                         interleaved (re, im) float pairs at index 2*I
//   fftKxx ... fftKxy     in: kernel components, one float per element,
//                         indexed with a row count of Ny/2+1 along Y
//   Nx, Ny, Nz            extent of the magnetization index space
extern "C" __global__ void
kernmulRSymm3D(float* __restrict__ fftMx, float* __restrict__ fftMy, float* __restrict__ fftMz,
               float* __restrict__ fftKxx, float* __restrict__ fftKyy, float* __restrict__ fftKzz,
               float* __restrict__ fftKyz, float* __restrict__ fftKxz, float* __restrict__ fftKxy,
               int Nx, int Ny, int Nz) {

    // global element coordinates of this thread
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    int iz = blockIdx.z * blockDim.z + threadIdx.z;

    // the launch grid may overshoot the data: ignore surplus threads
    if(ix>= Nx || iy>= Ny || iz>=Nz) {
        return;
    }

    // fetch (complex) FFT'ed magnetization
    // I is the linear element index, e the offset of its real part;
    // the imaginary part follows at e+1.
    int I = (iz*Ny + iy)*Nx + ix;
    int e = 2 * I;
    float reMx = fftMx[e ];
    float imMx = fftMx[e+1];
    float reMy = fftMy[e ];
    float imMy = fftMy[e+1];
    float reMz = fftMz[e ];
    float imMz = fftMz[e+1];

    // fetch kernel

    // minus signs are added to some elements if
    // reconstructed from symmetry.
    float signYZ = 1.0f;
    float signXZ = 1.0f;
    float signXY = 1.0f;

    // use symmetry to fetch from redundant parts:
    // mirror index into first quadrant and set signs.
    // Mirroring in Y flips the YZ and XY components.
    if (iy > Ny/2) {
        iy = Ny-iy;
        signYZ = -signYZ;
        signXY = -signXY;
    }
    // Mirroring in Z flips the YZ and XZ components
    // (so YZ is restored when both Y and Z are mirrored).
    if (iz > Nz/2) {
        iz = Nz-iz;
        signYZ = -signYZ;
        signXZ = -signXZ;
    }

    // fetch kernel element from non-redundant part
    // and apply minus signs for mirrored parts.
    I = (iz*(Ny/2+1) + iy)*Nx + ix; // Ny/2+1: only half is stored
    float Kxx = fftKxx[I];
    float Kyy = fftKyy[I];
    float Kzz = fftKzz[I];
    float Kyz = fftKyz[I] * signYZ;
    float Kxz = fftKxz[I] * signXZ;
    float Kxy = fftKxy[I] * signXY;

    // m * K matrix multiplication, overwrite m with result.
    // The kernel values are single floats, so real and imaginary parts
    // of m are multiplied by the same K and never mix.
    fftMx[e ] = reMx * Kxx + reMy * Kxy + reMz * Kxz;
    fftMx[e+1] = imMx * Kxx + imMy * Kxy + imMz * Kxz;
    fftMy[e ] = reMx * Kxy + reMy * Kyy + reMz * Kyz;
    fftMy[e+1] = imMx * Kxy + imMy * Kyy + imMz * Kyz;
    fftMz[e ] = reMx * Kxz + reMy * Kyz + reMz * Kzz;
    fftMz[e+1] = imMx * Kxz + imMy * Kyz + imMz * Kzz;
}
mumax3-3.10/cuda/kernmulrsymm3d_wrapper.go000066400000000000000000001641371371432437400206720ustar00rootroot00000000000000package cuda

/* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// CUDA handle for kernmulRSymm3D kernel
var kernmulRSymm3D_code cu.Function

// Stores the arguments for kernmulRSymm3D kernel invocation
//
// The arg_* fields hold the values of one launch, in the order of the
// kernel's parameter list. argptr holds one pointer per arg_* field (filled
// in by init) and is what gets handed to the launch call. The embedded
// sync.Mutex gives the struct Lock/Unlock methods.
//
// NOTE(review): arg_Nx/arg_Ny/arg_Nz are Go ints whose addresses are passed
// on, while the kernel declares plain `int` parameters. Presumably this
// relies on the low 32 bits being stored first (little-endian) when Go's int
// is 64 bits wide; confirm before porting.
type kernmulRSymm3D_args_t struct {
	arg_fftMx  unsafe.Pointer
	arg_fftMy  unsafe.Pointer
	arg_fftMz  unsafe.Pointer
	arg_fftKxx unsafe.Pointer
	arg_fftKyy unsafe.Pointer
	arg_fftKzz unsafe.Pointer
	arg_fftKyz unsafe.Pointer
	arg_fftKxz unsafe.Pointer
	arg_fftKxy unsafe.Pointer
	arg_Nx     int
	arg_Ny     int
	arg_Nz     int
	argptr     [12]unsafe.Pointer
	sync.Mutex
}

// Stores the arguments for kernmulRSymm3D kernel invocation
// (single package-level instance of kernmulRSymm3D_args_t).
var kernmulRSymm3D_args kernmulRSymm3D_args_t

// init points every argptr slot at the matching arg_* field of
// kernmulRSymm3D_args, so later launches only need to overwrite the fields.
func init() {
	// CUDA driver kernel call wants pointers to arguments, set them up once.
	kernmulRSymm3D_args.argptr[0] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftMx)
	kernmulRSymm3D_args.argptr[1] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftMy)
	kernmulRSymm3D_args.argptr[2] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftMz)
	kernmulRSymm3D_args.argptr[3] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftKxx)
	kernmulRSymm3D_args.argptr[4] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftKyy)
	kernmulRSymm3D_args.argptr[5] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftKzz)
	kernmulRSymm3D_args.argptr[6] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftKyz)
	kernmulRSymm3D_args.argptr[7] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftKxz)
	kernmulRSymm3D_args.argptr[8] = unsafe.Pointer(&kernmulRSymm3D_args.arg_fftKxy)
	kernmulRSymm3D_args.argptr[9] = unsafe.Pointer(&kernmulRSymm3D_args.arg_Nx)
	kernmulRSymm3D_args.argptr[10] = unsafe.Pointer(&kernmulRSymm3D_args.arg_Ny)
	kernmulRSymm3D_args.argptr[11] = unsafe.Pointer(&kernmulRSymm3D_args.arg_Nz)
}

// Wrapper for kernmulRSymm3D CUDA kernel, asynchronous.
func k_kernmulRSymm3D_async(fftMx unsafe.Pointer, fftMy unsafe.Pointer, fftMz unsafe.Pointer, fftKxx unsafe.Pointer, fftKyy unsafe.Pointer, fftKzz unsafe.Pointer, fftKyz unsafe.Pointer, fftKxz unsafe.Pointer, fftKxy unsafe.Pointer, Nx int, Ny int, Nz int, cfg *config) { if Synchronous { // debug Sync() timer.Start("kernmulRSymm3D") } kernmulRSymm3D_args.Lock() defer kernmulRSymm3D_args.Unlock() if kernmulRSymm3D_code == 0 { kernmulRSymm3D_code = fatbinLoad(kernmulRSymm3D_map, "kernmulRSymm3D") } kernmulRSymm3D_args.arg_fftMx = fftMx kernmulRSymm3D_args.arg_fftMy = fftMy kernmulRSymm3D_args.arg_fftMz = fftMz kernmulRSymm3D_args.arg_fftKxx = fftKxx kernmulRSymm3D_args.arg_fftKyy = fftKyy kernmulRSymm3D_args.arg_fftKzz = fftKzz kernmulRSymm3D_args.arg_fftKyz = fftKyz kernmulRSymm3D_args.arg_fftKxz = fftKxz kernmulRSymm3D_args.arg_fftKxy = fftKxy kernmulRSymm3D_args.arg_Nx = Nx kernmulRSymm3D_args.arg_Ny = Ny kernmulRSymm3D_args.arg_Nz = Nz args := kernmulRSymm3D_args.argptr[:] cu.LaunchKernel(kernmulRSymm3D_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("kernmulRSymm3D") } } // maps compute capability on PTX code for kernmulRSymm3D kernel. var kernmulRSymm3D_map = map[int]string{0: "", 30: kernmulRSymm3D_ptx_30, 32: kernmulRSymm3D_ptx_32, 35: kernmulRSymm3D_ptx_35, 37: kernmulRSymm3D_ptx_37, 50: kernmulRSymm3D_ptx_50, 52: kernmulRSymm3D_ptx_52, 53: kernmulRSymm3D_ptx_53, 60: kernmulRSymm3D_ptx_60, 61: kernmulRSymm3D_ptx_61, 62: kernmulRSymm3D_ptx_62, 70: kernmulRSymm3D_ptx_70, 72: kernmulRSymm3D_ptx_72, 75: kernmulRSymm3D_ptx_75} // kernmulRSymm3D PTX code for various compute capabilities. 
const ( kernmulRSymm3D_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, %r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd10, %rd3; cvta.to.global.u64 %rd11, %rd2; cvta.to.global.u64 %rd12, %rd1; cvta.to.global.u64 %rd13, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; mul.wide.s32 %rd14, %r18, 4; add.s64 %rd15, %rd12, %rd14; ld.global.f32 %f1, [%rd15+4]; add.s64 %rd16, %rd11, %rd14; ld.global.f32 %f2, [%rd16+4]; 
add.s64 %rd17, %rd10, %rd14; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd13, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd15]; ld.global.f32 %f18, [%rd16]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 [%rd15], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd15+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 [%rd16], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd16+4], %f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; 
st.global.f32 [%rd17+4], %f37; BB0_2: ret; } ` kernmulRSymm3D_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, %r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd10, %rd3; cvta.to.global.u64 %rd11, %rd2; cvta.to.global.u64 %rd12, %rd1; cvta.to.global.u64 %rd13, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; mul.wide.s32 %rd14, %r18, 4; add.s64 %rd15, %rd12, %rd14; ld.global.f32 %f1, [%rd15+4]; add.s64 %rd16, %rd11, 
%rd14; ld.global.f32 %f2, [%rd16+4]; add.s64 %rd17, %rd10, %rd14; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd13, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd15]; ld.global.f32 %f18, [%rd16]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 [%rd15], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd15+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 [%rd16], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd16+4], %f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 
%f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; BB0_2: ret; } ` kernmulRSymm3D_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, %r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd10, %rd3; cvta.to.global.u64 %rd11, %rd2; cvta.to.global.u64 %rd12, %rd1; cvta.to.global.u64 %rd13, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; mul.wide.s32 %rd14, %r18, 4; add.s64 %rd15, %rd12, %rd14; 
ld.global.f32 %f1, [%rd15+4]; add.s64 %rd16, %rd11, %rd14; ld.global.f32 %f2, [%rd16+4]; add.s64 %rd17, %rd10, %rd14; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd13, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd15]; ld.global.f32 %f18, [%rd16]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 [%rd15], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd15+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 [%rd16], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd16+4], %f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 
[%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; BB0_2: ret; } ` kernmulRSymm3D_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, %r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd10, %rd3; cvta.to.global.u64 %rd11, %rd2; cvta.to.global.u64 %rd12, %rd1; cvta.to.global.u64 %rd13, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; 
mul.wide.s32 %rd14, %r18, 4; add.s64 %rd15, %rd12, %rd14; ld.global.f32 %f1, [%rd15+4]; add.s64 %rd16, %rd11, %rd14; ld.global.f32 %f2, [%rd16+4]; add.s64 %rd17, %rd10, %rd14; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd13, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd15]; ld.global.f32 %f18, [%rd16]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 [%rd15], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd15+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 [%rd16], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd16+4], %f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, 
%f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; BB0_2: ret; } ` kernmulRSymm3D_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, %r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd10, %rd3; cvta.to.global.u64 %rd11, %rd2; cvta.to.global.u64 %rd12, %rd1; cvta.to.global.u64 %rd13, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; 
mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; mul.wide.s32 %rd14, %r18, 4; add.s64 %rd15, %rd12, %rd14; ld.global.f32 %f1, [%rd15+4]; add.s64 %rd16, %rd11, %rd14; ld.global.f32 %f2, [%rd16+4]; add.s64 %rd17, %rd10, %rd14; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd13, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd15]; ld.global.f32 %f18, [%rd16]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 [%rd15], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd15+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 [%rd16], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd16+4], 
%f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; BB0_2: ret; } ` kernmulRSymm3D_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, %r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd10, %rd3; cvta.to.global.u64 %rd11, %rd2; cvta.to.global.u64 %rd12, %rd1; 
cvta.to.global.u64 %rd13, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; mul.wide.s32 %rd14, %r18, 4; add.s64 %rd15, %rd12, %rd14; ld.global.f32 %f1, [%rd15+4]; add.s64 %rd16, %rd11, %rd14; ld.global.f32 %f2, [%rd16+4]; add.s64 %rd17, %rd10, %rd14; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd13, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd15]; ld.global.f32 %f18, [%rd16]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 [%rd15], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd15+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 [%rd16], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, 
%f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd16+4], %f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; BB0_2: ret; } ` kernmulRSymm3D_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, %r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd10, %rd3; 
cvta.to.global.u64 %rd11, %rd2; cvta.to.global.u64 %rd12, %rd1; cvta.to.global.u64 %rd13, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; mul.wide.s32 %rd14, %r18, 4; add.s64 %rd15, %rd12, %rd14; ld.global.f32 %f1, [%rd15+4]; add.s64 %rd16, %rd11, %rd14; ld.global.f32 %f2, [%rd16+4]; add.s64 %rd17, %rd10, %rd14; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd13, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd15]; ld.global.f32 %f18, [%rd16]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 [%rd15], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd15+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 
[%rd16], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd16+4], %f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; BB0_2: ret; } ` kernmulRSymm3D_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, %r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, 
%p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd10, %rd3; cvta.to.global.u64 %rd11, %rd2; cvta.to.global.u64 %rd12, %rd1; cvta.to.global.u64 %rd13, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; mul.wide.s32 %rd14, %r18, 4; add.s64 %rd15, %rd12, %rd14; ld.global.f32 %f1, [%rd15+4]; add.s64 %rd16, %rd11, %rd14; ld.global.f32 %f2, [%rd16+4]; add.s64 %rd17, %rd10, %rd14; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd13, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd15]; ld.global.f32 %f18, [%rd16]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 [%rd15], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd15+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, 
%f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 [%rd16], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd16+4], %f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; BB0_2: ret; } ` kernmulRSymm3D_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, %r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred 
%p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd10, %rd3; cvta.to.global.u64 %rd11, %rd2; cvta.to.global.u64 %rd12, %rd1; cvta.to.global.u64 %rd13, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; mul.wide.s32 %rd14, %r18, 4; add.s64 %rd15, %rd12, %rd14; ld.global.f32 %f1, [%rd15+4]; add.s64 %rd16, %rd11, %rd14; ld.global.f32 %f2, [%rd16+4]; add.s64 %rd17, %rd10, %rd14; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd13, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd15]; ld.global.f32 %f18, [%rd16]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 [%rd15], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd15+4], %f25; 
mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 [%rd16], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd16+4], %f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; BB0_2: ret; } ` kernmulRSymm3D_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, %r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; 
setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd10, %rd3; cvta.to.global.u64 %rd11, %rd2; cvta.to.global.u64 %rd12, %rd1; cvta.to.global.u64 %rd13, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; mul.wide.s32 %rd14, %r18, 4; add.s64 %rd15, %rd12, %rd14; ld.global.f32 %f1, [%rd15+4]; add.s64 %rd16, %rd11, %rd14; ld.global.f32 %f2, [%rd16+4]; add.s64 %rd17, %rd10, %rd14; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd13, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd15]; ld.global.f32 %f18, [%rd16]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 [%rd15], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; 
fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd15+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 [%rd16], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd16+4], %f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; BB0_2: ret; } ` kernmulRSymm3D_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, %r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; 
mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd10, %rd3; cvta.to.global.u64 %rd11, %rd2; cvta.to.global.u64 %rd12, %rd1; cvta.to.global.u64 %rd13, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; mul.wide.s32 %rd14, %r18, 4; add.s64 %rd15, %rd12, %rd14; ld.global.f32 %f1, [%rd15+4]; add.s64 %rd16, %rd11, %rd14; ld.global.f32 %f2, [%rd16+4]; add.s64 %rd17, %rd10, %rd14; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd13, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd15]; ld.global.f32 %f18, [%rd16]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 [%rd15], %f22; 
mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd15+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 [%rd16], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd16+4], %f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; BB0_2: ret; } ` kernmulRSymm3D_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, 
%r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd10, %rd3; cvta.to.global.u64 %rd11, %rd2; cvta.to.global.u64 %rd12, %rd1; cvta.to.global.u64 %rd13, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; mul.wide.s32 %rd14, %r18, 4; add.s64 %rd15, %rd12, %rd14; ld.global.f32 %f1, [%rd15+4]; add.s64 %rd16, %rd11, %rd14; ld.global.f32 %f2, [%rd16+4]; add.s64 %rd17, %rd10, %rd14; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd13, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd15]; ld.global.f32 %f18, [%rd16]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, %f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 
%f22, %f21, %f13, %f20; st.global.f32 [%rd15], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd15+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 [%rd16], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd16+4], %f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; BB0_2: ret; } ` kernmulRSymm3D_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl kernmulRSymm3D .visible .entry kernmulRSymm3D( .param .u64 kernmulRSymm3D_param_0, .param .u64 kernmulRSymm3D_param_1, .param .u64 kernmulRSymm3D_param_2, .param .u64 kernmulRSymm3D_param_3, .param .u64 kernmulRSymm3D_param_4, .param .u64 kernmulRSymm3D_param_5, .param .u64 kernmulRSymm3D_param_6, .param .u64 kernmulRSymm3D_param_7, .param .u64 kernmulRSymm3D_param_8, .param .u32 kernmulRSymm3D_param_9, .param .u32 kernmulRSymm3D_param_10, .param .u32 kernmulRSymm3D_param_11 ) { .reg .pred %p<8>; .reg .f32 %f<38>; .reg .b32 %r<32>; .reg .b64 %rd<30>; ld.param.u64 %rd1, [kernmulRSymm3D_param_0]; ld.param.u64 %rd2, [kernmulRSymm3D_param_1]; ld.param.u64 %rd3, [kernmulRSymm3D_param_2]; ld.param.u64 %rd4, [kernmulRSymm3D_param_3]; ld.param.u64 %rd5, [kernmulRSymm3D_param_4]; ld.param.u64 %rd6, [kernmulRSymm3D_param_5]; ld.param.u64 %rd7, [kernmulRSymm3D_param_6]; ld.param.u64 %rd8, [kernmulRSymm3D_param_7]; ld.param.u64 %rd9, [kernmulRSymm3D_param_8]; ld.param.u32 %r4, [kernmulRSymm3D_param_9]; ld.param.u32 %r5, [kernmulRSymm3D_param_10]; ld.param.u32 %r6, [kernmulRSymm3D_param_11]; mov.u32 %r7, %ntid.x; mov.u32 %r8, %ctaid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r7, %r8, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, 
%ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, %r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd10, %rd3; cvta.to.global.u64 %rd11, %rd2; cvta.to.global.u64 %rd12, %rd1; cvta.to.global.u64 %rd13, %rd4; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; shl.b32 %r18, %r17, 1; mul.wide.s32 %rd14, %r18, 4; add.s64 %rd15, %rd12, %rd14; ld.global.f32 %f1, [%rd15+4]; add.s64 %rd16, %rd11, %rd14; ld.global.f32 %f2, [%rd16+4]; add.s64 %rd17, %rd10, %rd14; ld.global.f32 %f3, [%rd17+4]; shr.u32 %r19, %r5, 31; add.s32 %r20, %r5, %r19; shr.s32 %r21, %r20, 1; setp.gt.s32 %p6, %r2, %r21; sub.s32 %r22, %r5, %r2; selp.b32 %r23, %r22, %r2, %p6; selp.f32 %f4, 0fBF800000, 0f3F800000, %p6; shr.u32 %r24, %r6, 31; add.s32 %r25, %r6, %r24; shr.s32 %r26, %r25, 1; setp.gt.s32 %p7, %r3, %r26; neg.f32 %f5, %f4; sub.s32 %r27, %r6, %r3; selp.b32 %r28, %r27, %r3, %p7; selp.f32 %f6, %f5, %f4, %p7; selp.f32 %f7, 0fBF800000, 0f3F800000, %p7; add.s32 %r29, %r21, 1; mad.lo.s32 %r30, %r28, %r29, %r23; mad.lo.s32 %r31, %r30, %r4, %r1; mul.wide.s32 %rd18, %r31, 4; add.s64 %rd19, %rd13, %rd18; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f8, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f9, [%rd23]; cvta.to.global.u64 %rd24, %rd7; add.s64 %rd25, %rd24, %rd18; ld.global.nc.f32 %f10, [%rd25]; mul.f32 %f11, %f6, %f10; cvta.to.global.u64 %rd26, %rd8; add.s64 %rd27, %rd26, %rd18; ld.global.nc.f32 %f12, [%rd27]; mul.f32 %f13, %f7, %f12; cvta.to.global.u64 %rd28, %rd9; add.s64 %rd29, %rd28, %rd18; ld.global.nc.f32 %f14, [%rd29]; mul.f32 %f15, %f4, %f14; ld.global.nc.f32 %f16, [%rd19]; ld.global.f32 %f17, [%rd15]; ld.global.f32 %f18, [%rd16]; mul.f32 %f19, %f18, %f15; fma.rn.f32 %f20, 
%f17, %f16, %f19; ld.global.f32 %f21, [%rd17]; fma.rn.f32 %f22, %f21, %f13, %f20; st.global.f32 [%rd15], %f22; mul.f32 %f23, %f2, %f15; fma.rn.f32 %f24, %f1, %f16, %f23; fma.rn.f32 %f25, %f3, %f13, %f24; st.global.f32 [%rd15+4], %f25; mul.f32 %f26, %f17, %f15; fma.rn.f32 %f27, %f18, %f8, %f26; fma.rn.f32 %f28, %f21, %f11, %f27; st.global.f32 [%rd16], %f28; mul.f32 %f29, %f1, %f15; fma.rn.f32 %f30, %f2, %f8, %f29; fma.rn.f32 %f31, %f3, %f11, %f30; st.global.f32 [%rd16+4], %f31; mul.f32 %f32, %f17, %f13; fma.rn.f32 %f33, %f18, %f11, %f32; fma.rn.f32 %f34, %f21, %f9, %f33; st.global.f32 [%rd17], %f34; mul.f32 %f35, %f1, %f13; fma.rn.f32 %f36, %f2, %f11, %f35; fma.rn.f32 %f37, %f3, %f9, %f36; st.global.f32 [%rd17+4], %f37; BB0_2: ret; } ` ) mumax3-3.10/cuda/llnoprecess.cu000066400000000000000000000013301371432437400164520ustar00rootroot00000000000000#include #include "float3.h" // Landau-Lifshitz torque without precession extern "C" __global__ void llnoprecess(float* __restrict__ tx, float* __restrict__ ty, float* __restrict__ tz, float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz, float* __restrict__ hx, float* __restrict__ hy, float* __restrict__ hz, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { float3 m = {mx[i], my[i], mz[i]}; float3 H = {hx[i], hy[i], hz[i]}; float3 mxH = cross(m, H); float3 torque = -cross(m, mxH); tx[i] = torque.x; ty[i] = torque.y; tz[i] = torque.z; } } mumax3-3.10/cuda/llnoprecess_wrapper.go000066400000000000000000001122471371432437400202220ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for llnoprecess kernel var llnoprecess_code cu.Function // Stores the arguments for llnoprecess kernel invocation type llnoprecess_args_t struct { arg_tx unsafe.Pointer arg_ty unsafe.Pointer arg_tz unsafe.Pointer arg_mx unsafe.Pointer arg_my unsafe.Pointer arg_mz unsafe.Pointer arg_hx unsafe.Pointer arg_hy unsafe.Pointer arg_hz unsafe.Pointer arg_N int argptr [10]unsafe.Pointer sync.Mutex } // Stores the arguments for llnoprecess kernel invocation var llnoprecess_args llnoprecess_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. llnoprecess_args.argptr[0] = unsafe.Pointer(&llnoprecess_args.arg_tx) llnoprecess_args.argptr[1] = unsafe.Pointer(&llnoprecess_args.arg_ty) llnoprecess_args.argptr[2] = unsafe.Pointer(&llnoprecess_args.arg_tz) llnoprecess_args.argptr[3] = unsafe.Pointer(&llnoprecess_args.arg_mx) llnoprecess_args.argptr[4] = unsafe.Pointer(&llnoprecess_args.arg_my) llnoprecess_args.argptr[5] = unsafe.Pointer(&llnoprecess_args.arg_mz) llnoprecess_args.argptr[6] = unsafe.Pointer(&llnoprecess_args.arg_hx) llnoprecess_args.argptr[7] = unsafe.Pointer(&llnoprecess_args.arg_hy) llnoprecess_args.argptr[8] = unsafe.Pointer(&llnoprecess_args.arg_hz) llnoprecess_args.argptr[9] = unsafe.Pointer(&llnoprecess_args.arg_N) } // Wrapper for llnoprecess CUDA kernel, asynchronous. 
func k_llnoprecess_async(tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, hx unsafe.Pointer, hy unsafe.Pointer, hz unsafe.Pointer, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("llnoprecess") } llnoprecess_args.Lock() defer llnoprecess_args.Unlock() if llnoprecess_code == 0 { llnoprecess_code = fatbinLoad(llnoprecess_map, "llnoprecess") } llnoprecess_args.arg_tx = tx llnoprecess_args.arg_ty = ty llnoprecess_args.arg_tz = tz llnoprecess_args.arg_mx = mx llnoprecess_args.arg_my = my llnoprecess_args.arg_mz = mz llnoprecess_args.arg_hx = hx llnoprecess_args.arg_hy = hy llnoprecess_args.arg_hz = hz llnoprecess_args.arg_N = N args := llnoprecess_args.argptr[:] cu.LaunchKernel(llnoprecess_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("llnoprecess") } } // maps compute capability on PTX code for llnoprecess kernel. var llnoprecess_map = map[int]string{0: "", 30: llnoprecess_ptx_30, 32: llnoprecess_ptx_32, 35: llnoprecess_ptx_35, 37: llnoprecess_ptx_37, 50: llnoprecess_ptx_50, 52: llnoprecess_ptx_52, 53: llnoprecess_ptx_53, 60: llnoprecess_ptx_60, 61: llnoprecess_ptx_61, 62: llnoprecess_ptx_62, 70: llnoprecess_ptx_70, 72: llnoprecess_ptx_72, 75: llnoprecess_ptx_75} // llnoprecess PTX code for various compute capabilities. 
const ( llnoprecess_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, [llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.f32 %f1, [%rd22]; ld.global.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.f32 %f4, [%rd20]; ld.global.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; mul.f32 %f17, %f5, %f12; 
sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; BB0_2: ret; } ` llnoprecess_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, [llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 
%rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; BB0_2: ret; } ` llnoprecess_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, [llnoprecess_param_8]; ld.param.u32 %r2, 
[llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; BB0_2: ret; } ` llnoprecess_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, 
.param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, [llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 
%rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; BB0_2: ret; } ` llnoprecess_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, [llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 
%f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; BB0_2: ret; } ` llnoprecess_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, [llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, 
%rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; BB0_2: ret; } ` llnoprecess_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, 
[llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, [llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; BB0_2: ret; } ` llnoprecess_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, 
.param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, [llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 
%f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; BB0_2: ret; } ` llnoprecess_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, [llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; 
ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; BB0_2: ret; } ` llnoprecess_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, [llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, 
%r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; BB0_2: ret; } ` llnoprecess_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, 
[llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, [llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; BB0_2: ret; } ` llnoprecess_ptx_72 = ` .version 6.5 .target sm_72 
.address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, [llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, 
%f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; BB0_2: ret; } ` llnoprecess_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl llnoprecess .visible .entry llnoprecess( .param .u64 llnoprecess_param_0, .param .u64 llnoprecess_param_1, .param .u64 llnoprecess_param_2, .param .u64 llnoprecess_param_3, .param .u64 llnoprecess_param_4, .param .u64 llnoprecess_param_5, .param .u64 llnoprecess_param_6, .param .u64 llnoprecess_param_7, .param .u64 llnoprecess_param_8, .param .u32 llnoprecess_param_9 ) { .reg .pred %p<2>; .reg .f32 %f<28>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [llnoprecess_param_0]; ld.param.u64 %rd2, [llnoprecess_param_1]; ld.param.u64 %rd3, [llnoprecess_param_2]; ld.param.u64 %rd4, [llnoprecess_param_3]; ld.param.u64 %rd5, [llnoprecess_param_4]; ld.param.u64 %rd6, [llnoprecess_param_5]; ld.param.u64 %rd7, [llnoprecess_param_6]; ld.param.u64 %rd8, [llnoprecess_param_7]; ld.param.u64 %rd9, [llnoprecess_param_8]; ld.param.u32 %r2, [llnoprecess_param_9]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 
%rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f1, [%rd22]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f3, %f2, %f1; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd16]; mul.f32 %f6, %f5, %f4; sub.f32 %f7, %f3, %f6; ld.global.nc.f32 %f8, [%rd18]; mul.f32 %f9, %f5, %f8; ld.global.nc.f32 %f10, [%rd12]; mul.f32 %f11, %f10, %f1; sub.f32 %f12, %f9, %f11; mul.f32 %f13, %f10, %f4; mul.f32 %f14, %f2, %f8; sub.f32 %f15, %f13, %f14; mul.f32 %f16, %f2, %f15; mul.f32 %f17, %f5, %f12; sub.f32 %f18, %f16, %f17; mul.f32 %f19, %f5, %f7; mul.f32 %f20, %f10, %f15; sub.f32 %f21, %f19, %f20; mul.f32 %f22, %f10, %f12; mul.f32 %f23, %f2, %f7; sub.f32 %f24, %f22, %f23; neg.f32 %f25, %f18; neg.f32 %f26, %f21; neg.f32 %f27, %f24; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f25; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f26; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f27; BB0_2: ret; } ` ) mumax3-3.10/cuda/lltorque.go000066400000000000000000000015141371432437400157720ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/data" ) // Landau-Lifshitz torque divided by gamma0: // - 1/(1+α²) [ m x B + α m x (m x B) ] // torque in Tesla // m normalized // B in Tesla // see lltorque.cu func LLTorque(torque, m, B *data.Slice, alpha MSlice) { N := torque.Len() cfg := make1DConf(N) k_lltorque2_async(torque.DevPtr(X), torque.DevPtr(Y), torque.DevPtr(Z), m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z), B.DevPtr(X), B.DevPtr(Y), B.DevPtr(Z), alpha.DevPtr(0), alpha.Mul(0), N, cfg) } // Landau-Lifshitz torque with precession disabled. // Used by engine.Relax(). 
func LLNoPrecess(torque, m, B *data.Slice) { N := torque.Len() cfg := make1DConf(N) k_llnoprecess_async(torque.DevPtr(X), torque.DevPtr(Y), torque.DevPtr(Z), m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z), B.DevPtr(X), B.DevPtr(Y), B.DevPtr(Z), N, cfg) } mumax3-3.10/cuda/lltorque2.cu000066400000000000000000000016071371432437400160610ustar00rootroot00000000000000#include "amul.h" #include "float3.h" #include // Landau-Lifshitz torque. extern "C" __global__ void lltorque2(float* __restrict__ tx, float* __restrict__ ty, float* __restrict__ tz, float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz, float* __restrict__ hx, float* __restrict__ hy, float* __restrict__ hz, float* __restrict__ alpha_, float alpha_mul, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { float3 m = {mx[i], my[i], mz[i]}; float3 H = {hx[i], hy[i], hz[i]}; float alpha = amul(alpha_, alpha_mul, i); float3 mxH = cross(m, H); float gilb = -1.0f / (1.0f + alpha * alpha); float3 torque = gilb * (mxH + alpha * cross(m, mxH)); tx[i] = torque.x; ty[i] = torque.y; tz[i] = torque.z; } } mumax3-3.10/cuda/lltorque2_wrapper.go000066400000000000000000001301301371432437400176110ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for lltorque2 kernel var lltorque2_code cu.Function // Stores the arguments for lltorque2 kernel invocation type lltorque2_args_t struct { arg_tx unsafe.Pointer arg_ty unsafe.Pointer arg_tz unsafe.Pointer arg_mx unsafe.Pointer arg_my unsafe.Pointer arg_mz unsafe.Pointer arg_hx unsafe.Pointer arg_hy unsafe.Pointer arg_hz unsafe.Pointer arg_alpha_ unsafe.Pointer arg_alpha_mul float32 arg_N int argptr [12]unsafe.Pointer sync.Mutex } // Stores the arguments for lltorque2 kernel invocation var lltorque2_args lltorque2_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. lltorque2_args.argptr[0] = unsafe.Pointer(&lltorque2_args.arg_tx) lltorque2_args.argptr[1] = unsafe.Pointer(&lltorque2_args.arg_ty) lltorque2_args.argptr[2] = unsafe.Pointer(&lltorque2_args.arg_tz) lltorque2_args.argptr[3] = unsafe.Pointer(&lltorque2_args.arg_mx) lltorque2_args.argptr[4] = unsafe.Pointer(&lltorque2_args.arg_my) lltorque2_args.argptr[5] = unsafe.Pointer(&lltorque2_args.arg_mz) lltorque2_args.argptr[6] = unsafe.Pointer(&lltorque2_args.arg_hx) lltorque2_args.argptr[7] = unsafe.Pointer(&lltorque2_args.arg_hy) lltorque2_args.argptr[8] = unsafe.Pointer(&lltorque2_args.arg_hz) lltorque2_args.argptr[9] = unsafe.Pointer(&lltorque2_args.arg_alpha_) lltorque2_args.argptr[10] = unsafe.Pointer(&lltorque2_args.arg_alpha_mul) lltorque2_args.argptr[11] = unsafe.Pointer(&lltorque2_args.arg_N) } // Wrapper for lltorque2 CUDA kernel, asynchronous. 
func k_lltorque2_async(tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, hx unsafe.Pointer, hy unsafe.Pointer, hz unsafe.Pointer, alpha_ unsafe.Pointer, alpha_mul float32, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("lltorque2") } lltorque2_args.Lock() defer lltorque2_args.Unlock() if lltorque2_code == 0 { lltorque2_code = fatbinLoad(lltorque2_map, "lltorque2") } lltorque2_args.arg_tx = tx lltorque2_args.arg_ty = ty lltorque2_args.arg_tz = tz lltorque2_args.arg_mx = mx lltorque2_args.arg_my = my lltorque2_args.arg_mz = mz lltorque2_args.arg_hx = hx lltorque2_args.arg_hy = hy lltorque2_args.arg_hz = hz lltorque2_args.arg_alpha_ = alpha_ lltorque2_args.arg_alpha_mul = alpha_mul lltorque2_args.arg_N = N args := lltorque2_args.argptr[:] cu.LaunchKernel(lltorque2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("lltorque2") } } // maps compute capability on PTX code for lltorque2 kernel. var lltorque2_map = map[int]string{0: "", 30: lltorque2_ptx_30, 32: lltorque2_ptx_32, 35: lltorque2_ptx_35, 37: lltorque2_ptx_37, 50: lltorque2_ptx_50, 52: lltorque2_ptx_52, 53: lltorque2_ptx_53, 60: lltorque2_ptx_60, 61: lltorque2_ptx_61, 62: lltorque2_ptx_62, 70: lltorque2_ptx_70, 72: lltorque2_ptx_72, 75: lltorque2_ptx_75} // lltorque2 PTX code for various compute capabilities. 
const ( lltorque2_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<34>; ld.param.u64 %rd1, [lltorque2_param_0]; ld.param.u64 %rd2, [lltorque2_param_1]; ld.param.u64 %rd3, [lltorque2_param_2]; ld.param.u64 %rd4, [lltorque2_param_3]; ld.param.u64 %rd5, [lltorque2_param_4]; ld.param.u64 %rd6, [lltorque2_param_5]; ld.param.u64 %rd7, [lltorque2_param_6]; ld.param.u64 %rd8, [lltorque2_param_7]; ld.param.u64 %rd9, [lltorque2_param_8]; ld.param.u64 %rd10, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd13, %rd11, %rd12; ld.global.f32 %f1, [%rd13]; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd12; ld.global.f32 %f2, [%rd15]; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd12; ld.global.f32 %f3, [%rd17]; cvta.to.global.u64 %rd18, %rd7; add.s64 %rd19, %rd18, %rd12; ld.global.f32 %f4, [%rd19]; cvta.to.global.u64 %rd20, %rd8; add.s64 %rd21, %rd20, %rd12; ld.global.f32 %f5, [%rd21]; cvta.to.global.u64 %rd22, %rd9; add.s64 %rd23, %rd22, %rd12; ld.global.f32 %f6, [%rd23]; setp.eq.s64 %p2, %rd10, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd10; add.s64 %rd26, %rd24, %rd12; ld.global.f32 %f10, [%rd26]; mul.f32 %f38, %f10, %f38; BB0_3: 
cvta.to.global.u64 %rd27, %rd3; cvta.to.global.u64 %rd28, %rd2; mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f22, %f32; mul.f32 %f36, %f22, %f33; mul.f32 %f37, %f22, %f34; cvta.to.global.u64 %rd29, %rd1; add.s64 %rd31, %rd29, %rd12; st.global.f32 [%rd31], %f35; add.s64 %rd32, %rd28, %rd12; st.global.f32 [%rd32], %f36; add.s64 %rd33, %rd27, %rd12; st.global.f32 [%rd33], %f37; BB0_4: ret; } ` lltorque2_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<34>; ld.param.u64 %rd1, [lltorque2_param_0]; ld.param.u64 %rd2, [lltorque2_param_1]; ld.param.u64 %rd3, [lltorque2_param_2]; ld.param.u64 %rd4, [lltorque2_param_3]; ld.param.u64 %rd5, [lltorque2_param_4]; ld.param.u64 %rd6, [lltorque2_param_5]; ld.param.u64 %rd7, [lltorque2_param_6]; ld.param.u64 %rd8, [lltorque2_param_7]; ld.param.u64 %rd9, [lltorque2_param_8]; ld.param.u64 %rd10, [lltorque2_param_9]; ld.param.f32 %f38, 
[lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd13, %rd11, %rd12; ld.global.nc.f32 %f1, [%rd13]; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f2, [%rd15]; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f3, [%rd17]; cvta.to.global.u64 %rd18, %rd7; add.s64 %rd19, %rd18, %rd12; ld.global.nc.f32 %f4, [%rd19]; cvta.to.global.u64 %rd20, %rd8; add.s64 %rd21, %rd20, %rd12; ld.global.nc.f32 %f5, [%rd21]; cvta.to.global.u64 %rd22, %rd9; add.s64 %rd23, %rd22, %rd12; ld.global.nc.f32 %f6, [%rd23]; setp.eq.s64 %p2, %rd10, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd10; add.s64 %rd26, %rd24, %rd12; ld.global.nc.f32 %f10, [%rd26]; mul.f32 %f38, %f10, %f38; BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f22, %f32; mul.f32 %f36, %f22, %f33; mul.f32 %f37, %f22, %f34; cvta.to.global.u64 %rd27, %rd1; add.s64 %rd29, %rd27, %rd12; st.global.f32 [%rd29], %f35; cvta.to.global.u64 %rd30, %rd2; add.s64 %rd31, %rd30, %rd12; st.global.f32 [%rd31], %f36; cvta.to.global.u64 %rd32, %rd3; add.s64 %rd33, %rd32, %rd12; 
st.global.f32 [%rd33], %f37; BB0_4: ret; } ` lltorque2_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<34>; ld.param.u64 %rd1, [lltorque2_param_0]; ld.param.u64 %rd2, [lltorque2_param_1]; ld.param.u64 %rd3, [lltorque2_param_2]; ld.param.u64 %rd4, [lltorque2_param_3]; ld.param.u64 %rd5, [lltorque2_param_4]; ld.param.u64 %rd6, [lltorque2_param_5]; ld.param.u64 %rd7, [lltorque2_param_6]; ld.param.u64 %rd8, [lltorque2_param_7]; ld.param.u64 %rd9, [lltorque2_param_8]; ld.param.u64 %rd10, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd13, %rd11, %rd12; ld.global.nc.f32 %f1, [%rd13]; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f2, [%rd15]; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f3, [%rd17]; cvta.to.global.u64 %rd18, %rd7; add.s64 %rd19, %rd18, %rd12; ld.global.nc.f32 %f4, [%rd19]; cvta.to.global.u64 %rd20, %rd8; add.s64 %rd21, %rd20, %rd12; ld.global.nc.f32 %f5, [%rd21]; cvta.to.global.u64 %rd22, %rd9; add.s64 %rd23, %rd22, %rd12; ld.global.nc.f32 %f6, [%rd23]; setp.eq.s64 %p2, %rd10, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd10; add.s64 %rd26, %rd24, %rd12; 
ld.global.nc.f32 %f10, [%rd26]; mul.f32 %f38, %f10, %f38; BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f22, %f32; mul.f32 %f36, %f22, %f33; mul.f32 %f37, %f22, %f34; cvta.to.global.u64 %rd27, %rd1; add.s64 %rd29, %rd27, %rd12; st.global.f32 [%rd29], %f35; cvta.to.global.u64 %rd30, %rd2; add.s64 %rd31, %rd30, %rd12; st.global.f32 [%rd31], %f36; cvta.to.global.u64 %rd32, %rd3; add.s64 %rd33, %rd32, %rd12; st.global.f32 [%rd33], %f37; BB0_4: ret; } ` lltorque2_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<34>; ld.param.u64 %rd1, [lltorque2_param_0]; ld.param.u64 %rd2, [lltorque2_param_1]; ld.param.u64 %rd3, [lltorque2_param_2]; ld.param.u64 %rd4, [lltorque2_param_3]; ld.param.u64 %rd5, [lltorque2_param_4]; ld.param.u64 %rd6, [lltorque2_param_5]; ld.param.u64 %rd7, [lltorque2_param_6]; ld.param.u64 %rd8, [lltorque2_param_7]; ld.param.u64 %rd9, 
[lltorque2_param_8]; ld.param.u64 %rd10, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd13, %rd11, %rd12; ld.global.nc.f32 %f1, [%rd13]; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f2, [%rd15]; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f3, [%rd17]; cvta.to.global.u64 %rd18, %rd7; add.s64 %rd19, %rd18, %rd12; ld.global.nc.f32 %f4, [%rd19]; cvta.to.global.u64 %rd20, %rd8; add.s64 %rd21, %rd20, %rd12; ld.global.nc.f32 %f5, [%rd21]; cvta.to.global.u64 %rd22, %rd9; add.s64 %rd23, %rd22, %rd12; ld.global.nc.f32 %f6, [%rd23]; setp.eq.s64 %p2, %rd10, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd10; add.s64 %rd26, %rd24, %rd12; ld.global.nc.f32 %f10, [%rd26]; mul.f32 %f38, %f10, %f38; BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f22, %f32; mul.f32 %f36, %f22, %f33; mul.f32 %f37, %f22, %f34; cvta.to.global.u64 %rd27, %rd1; add.s64 %rd29, %rd27, %rd12; st.global.f32 [%rd29], %f35; cvta.to.global.u64 %rd30, %rd2; add.s64 %rd31, %rd30, %rd12; 
st.global.f32 [%rd31], %f36; cvta.to.global.u64 %rd32, %rd3; add.s64 %rd33, %rd32, %rd12; st.global.f32 [%rd33], %f37; BB0_4: ret; } ` lltorque2_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<34>; ld.param.u64 %rd1, [lltorque2_param_0]; ld.param.u64 %rd2, [lltorque2_param_1]; ld.param.u64 %rd3, [lltorque2_param_2]; ld.param.u64 %rd4, [lltorque2_param_3]; ld.param.u64 %rd5, [lltorque2_param_4]; ld.param.u64 %rd6, [lltorque2_param_5]; ld.param.u64 %rd7, [lltorque2_param_6]; ld.param.u64 %rd8, [lltorque2_param_7]; ld.param.u64 %rd9, [lltorque2_param_8]; ld.param.u64 %rd10, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd13, %rd11, %rd12; ld.global.nc.f32 %f1, [%rd13]; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f2, [%rd15]; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f3, [%rd17]; cvta.to.global.u64 %rd18, %rd7; add.s64 %rd19, %rd18, %rd12; ld.global.nc.f32 %f4, [%rd19]; cvta.to.global.u64 %rd20, %rd8; add.s64 %rd21, %rd20, %rd12; ld.global.nc.f32 %f5, [%rd21]; cvta.to.global.u64 %rd22, %rd9; add.s64 %rd23, %rd22, %rd12; ld.global.nc.f32 %f6, [%rd23]; setp.eq.s64 %p2, %rd10, 
0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd10; add.s64 %rd26, %rd24, %rd12; ld.global.nc.f32 %f10, [%rd26]; mul.f32 %f38, %f10, %f38; BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f22, %f32; mul.f32 %f36, %f22, %f33; mul.f32 %f37, %f22, %f34; cvta.to.global.u64 %rd27, %rd1; add.s64 %rd29, %rd27, %rd12; st.global.f32 [%rd29], %f35; cvta.to.global.u64 %rd30, %rd2; add.s64 %rd31, %rd30, %rd12; st.global.f32 [%rd31], %f36; cvta.to.global.u64 %rd32, %rd3; add.s64 %rd33, %rd32, %rd12; st.global.f32 [%rd33], %f37; BB0_4: ret; } ` lltorque2_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<34>; ld.param.u64 %rd1, [lltorque2_param_0]; ld.param.u64 %rd2, [lltorque2_param_1]; ld.param.u64 %rd3, [lltorque2_param_2]; ld.param.u64 %rd4, [lltorque2_param_3]; ld.param.u64 %rd5, [lltorque2_param_4]; ld.param.u64 %rd6, [lltorque2_param_5]; ld.param.u64 %rd7, 
[lltorque2_param_6]; ld.param.u64 %rd8, [lltorque2_param_7]; ld.param.u64 %rd9, [lltorque2_param_8]; ld.param.u64 %rd10, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd13, %rd11, %rd12; ld.global.nc.f32 %f1, [%rd13]; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f2, [%rd15]; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f3, [%rd17]; cvta.to.global.u64 %rd18, %rd7; add.s64 %rd19, %rd18, %rd12; ld.global.nc.f32 %f4, [%rd19]; cvta.to.global.u64 %rd20, %rd8; add.s64 %rd21, %rd20, %rd12; ld.global.nc.f32 %f5, [%rd21]; cvta.to.global.u64 %rd22, %rd9; add.s64 %rd23, %rd22, %rd12; ld.global.nc.f32 %f6, [%rd23]; setp.eq.s64 %p2, %rd10, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd10; add.s64 %rd26, %rd24, %rd12; ld.global.nc.f32 %f10, [%rd26]; mul.f32 %f38, %f10, %f38; BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f22, %f32; mul.f32 %f36, %f22, %f33; mul.f32 %f37, %f22, %f34; cvta.to.global.u64 %rd27, %rd1; add.s64 %rd29, %rd27, %rd12; st.global.f32 
[%rd29], %f35; cvta.to.global.u64 %rd30, %rd2; add.s64 %rd31, %rd30, %rd12; st.global.f32 [%rd31], %f36; cvta.to.global.u64 %rd32, %rd3; add.s64 %rd33, %rd32, %rd12; st.global.f32 [%rd33], %f37; BB0_4: ret; } ` lltorque2_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<34>; ld.param.u64 %rd1, [lltorque2_param_0]; ld.param.u64 %rd2, [lltorque2_param_1]; ld.param.u64 %rd3, [lltorque2_param_2]; ld.param.u64 %rd4, [lltorque2_param_3]; ld.param.u64 %rd5, [lltorque2_param_4]; ld.param.u64 %rd6, [lltorque2_param_5]; ld.param.u64 %rd7, [lltorque2_param_6]; ld.param.u64 %rd8, [lltorque2_param_7]; ld.param.u64 %rd9, [lltorque2_param_8]; ld.param.u64 %rd10, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd13, %rd11, %rd12; ld.global.nc.f32 %f1, [%rd13]; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f2, [%rd15]; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f3, [%rd17]; cvta.to.global.u64 %rd18, %rd7; add.s64 %rd19, %rd18, %rd12; ld.global.nc.f32 %f4, [%rd19]; cvta.to.global.u64 %rd20, %rd8; add.s64 %rd21, %rd20, %rd12; ld.global.nc.f32 %f5, [%rd21]; cvta.to.global.u64 %rd22, %rd9; add.s64 
%rd23, %rd22, %rd12; ld.global.nc.f32 %f6, [%rd23]; setp.eq.s64 %p2, %rd10, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd10; add.s64 %rd26, %rd24, %rd12; ld.global.nc.f32 %f10, [%rd26]; mul.f32 %f38, %f10, %f38; BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f22, %f32; mul.f32 %f36, %f22, %f33; mul.f32 %f37, %f22, %f34; cvta.to.global.u64 %rd27, %rd1; add.s64 %rd29, %rd27, %rd12; st.global.f32 [%rd29], %f35; cvta.to.global.u64 %rd30, %rd2; add.s64 %rd31, %rd30, %rd12; st.global.f32 [%rd31], %f36; cvta.to.global.u64 %rd32, %rd3; add.s64 %rd33, %rd32, %rd12; st.global.f32 [%rd33], %f37; BB0_4: ret; } ` lltorque2_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<34>; ld.param.u64 %rd1, [lltorque2_param_0]; ld.param.u64 %rd2, [lltorque2_param_1]; ld.param.u64 %rd3, [lltorque2_param_2]; ld.param.u64 %rd4, [lltorque2_param_3]; ld.param.u64 %rd5, [lltorque2_param_4]; 
ld.param.u64 %rd6, [lltorque2_param_5]; ld.param.u64 %rd7, [lltorque2_param_6]; ld.param.u64 %rd8, [lltorque2_param_7]; ld.param.u64 %rd9, [lltorque2_param_8]; ld.param.u64 %rd10, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd13, %rd11, %rd12; ld.global.nc.f32 %f1, [%rd13]; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f2, [%rd15]; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f3, [%rd17]; cvta.to.global.u64 %rd18, %rd7; add.s64 %rd19, %rd18, %rd12; ld.global.nc.f32 %f4, [%rd19]; cvta.to.global.u64 %rd20, %rd8; add.s64 %rd21, %rd20, %rd12; ld.global.nc.f32 %f5, [%rd21]; cvta.to.global.u64 %rd22, %rd9; add.s64 %rd23, %rd22, %rd12; ld.global.nc.f32 %f6, [%rd23]; setp.eq.s64 %p2, %rd10, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd10; add.s64 %rd26, %rd24, %rd12; ld.global.nc.f32 %f10, [%rd26]; mul.f32 %f38, %f10, %f38; BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f22, %f32; mul.f32 %f36, %f22, %f33; mul.f32 %f37, %f22, %f34; cvta.to.global.u64 
%rd27, %rd1; add.s64 %rd29, %rd27, %rd12; st.global.f32 [%rd29], %f35; cvta.to.global.u64 %rd30, %rd2; add.s64 %rd31, %rd30, %rd12; st.global.f32 [%rd31], %f36; cvta.to.global.u64 %rd32, %rd3; add.s64 %rd33, %rd32, %rd12; st.global.f32 [%rd33], %f37; BB0_4: ret; } ` lltorque2_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<34>; ld.param.u64 %rd1, [lltorque2_param_0]; ld.param.u64 %rd2, [lltorque2_param_1]; ld.param.u64 %rd3, [lltorque2_param_2]; ld.param.u64 %rd4, [lltorque2_param_3]; ld.param.u64 %rd5, [lltorque2_param_4]; ld.param.u64 %rd6, [lltorque2_param_5]; ld.param.u64 %rd7, [lltorque2_param_6]; ld.param.u64 %rd8, [lltorque2_param_7]; ld.param.u64 %rd9, [lltorque2_param_8]; ld.param.u64 %rd10, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd13, %rd11, %rd12; ld.global.nc.f32 %f1, [%rd13]; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f2, [%rd15]; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f3, [%rd17]; cvta.to.global.u64 %rd18, %rd7; add.s64 %rd19, %rd18, %rd12; ld.global.nc.f32 %f4, [%rd19]; cvta.to.global.u64 %rd20, %rd8; add.s64 %rd21, %rd20, %rd12; 
ld.global.nc.f32 %f5, [%rd21]; cvta.to.global.u64 %rd22, %rd9; add.s64 %rd23, %rd22, %rd12; ld.global.nc.f32 %f6, [%rd23]; setp.eq.s64 %p2, %rd10, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd10; add.s64 %rd26, %rd24, %rd12; ld.global.nc.f32 %f10, [%rd26]; mul.f32 %f38, %f10, %f38; BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f22, %f32; mul.f32 %f36, %f22, %f33; mul.f32 %f37, %f22, %f34; cvta.to.global.u64 %rd27, %rd1; add.s64 %rd29, %rd27, %rd12; st.global.f32 [%rd29], %f35; cvta.to.global.u64 %rd30, %rd2; add.s64 %rd31, %rd30, %rd12; st.global.f32 [%rd31], %f36; cvta.to.global.u64 %rd32, %rd3; add.s64 %rd33, %rd32, %rd12; st.global.f32 [%rd33], %f37; BB0_4: ret; } ` lltorque2_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<34>; ld.param.u64 %rd1, [lltorque2_param_0]; ld.param.u64 %rd2, [lltorque2_param_1]; ld.param.u64 %rd3, [lltorque2_param_2]; 
ld.param.u64 %rd4, [lltorque2_param_3]; ld.param.u64 %rd5, [lltorque2_param_4]; ld.param.u64 %rd6, [lltorque2_param_5]; ld.param.u64 %rd7, [lltorque2_param_6]; ld.param.u64 %rd8, [lltorque2_param_7]; ld.param.u64 %rd9, [lltorque2_param_8]; ld.param.u64 %rd10, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd13, %rd11, %rd12; ld.global.nc.f32 %f1, [%rd13]; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f2, [%rd15]; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f3, [%rd17]; cvta.to.global.u64 %rd18, %rd7; add.s64 %rd19, %rd18, %rd12; ld.global.nc.f32 %f4, [%rd19]; cvta.to.global.u64 %rd20, %rd8; add.s64 %rd21, %rd20, %rd12; ld.global.nc.f32 %f5, [%rd21]; cvta.to.global.u64 %rd22, %rd9; add.s64 %rd23, %rd22, %rd12; ld.global.nc.f32 %f6, [%rd23]; setp.eq.s64 %p2, %rd10, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd10; add.s64 %rd26, %rd24, %rd12; ld.global.nc.f32 %f10, [%rd26]; mul.f32 %f38, %f10, %f38; BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, 
%f22, %f32; mul.f32 %f36, %f22, %f33; mul.f32 %f37, %f22, %f34; cvta.to.global.u64 %rd27, %rd1; add.s64 %rd29, %rd27, %rd12; st.global.f32 [%rd29], %f35; cvta.to.global.u64 %rd30, %rd2; add.s64 %rd31, %rd30, %rd12; st.global.f32 [%rd31], %f36; cvta.to.global.u64 %rd32, %rd3; add.s64 %rd33, %rd32, %rd12; st.global.f32 [%rd33], %f37; BB0_4: ret; } ` lltorque2_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<34>; ld.param.u64 %rd1, [lltorque2_param_0]; ld.param.u64 %rd2, [lltorque2_param_1]; ld.param.u64 %rd3, [lltorque2_param_2]; ld.param.u64 %rd4, [lltorque2_param_3]; ld.param.u64 %rd5, [lltorque2_param_4]; ld.param.u64 %rd6, [lltorque2_param_5]; ld.param.u64 %rd7, [lltorque2_param_6]; ld.param.u64 %rd8, [lltorque2_param_7]; ld.param.u64 %rd9, [lltorque2_param_8]; ld.param.u64 %rd10, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd13, %rd11, %rd12; ld.global.nc.f32 %f1, [%rd13]; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f2, [%rd15]; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f3, [%rd17]; cvta.to.global.u64 %rd18, %rd7; add.s64 %rd19, %rd18, %rd12; ld.global.nc.f32 %f4, 
[%rd19]; cvta.to.global.u64 %rd20, %rd8; add.s64 %rd21, %rd20, %rd12; ld.global.nc.f32 %f5, [%rd21]; cvta.to.global.u64 %rd22, %rd9; add.s64 %rd23, %rd22, %rd12; ld.global.nc.f32 %f6, [%rd23]; setp.eq.s64 %p2, %rd10, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd10; add.s64 %rd26, %rd24, %rd12; ld.global.nc.f32 %f10, [%rd26]; mul.f32 %f38, %f10, %f38; BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f22, %f32; mul.f32 %f36, %f22, %f33; mul.f32 %f37, %f22, %f34; cvta.to.global.u64 %rd27, %rd1; add.s64 %rd29, %rd27, %rd12; st.global.f32 [%rd29], %f35; cvta.to.global.u64 %rd30, %rd2; add.s64 %rd31, %rd30, %rd12; st.global.f32 [%rd31], %f36; cvta.to.global.u64 %rd32, %rd3; add.s64 %rd33, %rd32, %rd12; st.global.f32 [%rd33], %f37; BB0_4: ret; } ` lltorque2_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<34>; ld.param.u64 %rd1, [lltorque2_param_0]; ld.param.u64 %rd2, 
[lltorque2_param_1]; ld.param.u64 %rd3, [lltorque2_param_2]; ld.param.u64 %rd4, [lltorque2_param_3]; ld.param.u64 %rd5, [lltorque2_param_4]; ld.param.u64 %rd6, [lltorque2_param_5]; ld.param.u64 %rd7, [lltorque2_param_6]; ld.param.u64 %rd8, [lltorque2_param_7]; ld.param.u64 %rd9, [lltorque2_param_8]; ld.param.u64 %rd10, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd13, %rd11, %rd12; ld.global.nc.f32 %f1, [%rd13]; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f2, [%rd15]; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f3, [%rd17]; cvta.to.global.u64 %rd18, %rd7; add.s64 %rd19, %rd18, %rd12; ld.global.nc.f32 %f4, [%rd19]; cvta.to.global.u64 %rd20, %rd8; add.s64 %rd21, %rd20, %rd12; ld.global.nc.f32 %f5, [%rd21]; cvta.to.global.u64 %rd22, %rd9; add.s64 %rd23, %rd22, %rd12; ld.global.nc.f32 %f6, [%rd23]; setp.eq.s64 %p2, %rd10, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd10; add.s64 %rd26, %rd24, %rd12; ld.global.nc.f32 %f10, [%rd26]; mul.f32 %f38, %f10, %f38; BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, 
%f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f22, %f32; mul.f32 %f36, %f22, %f33; mul.f32 %f37, %f22, %f34; cvta.to.global.u64 %rd27, %rd1; add.s64 %rd29, %rd27, %rd12; st.global.f32 [%rd29], %f35; cvta.to.global.u64 %rd30, %rd2; add.s64 %rd31, %rd30, %rd12; st.global.f32 [%rd31], %f36; cvta.to.global.u64 %rd32, %rd3; add.s64 %rd33, %rd32, %rd12; st.global.f32 [%rd33], %f37; BB0_4: ret; } ` lltorque2_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl lltorque2 .visible .entry lltorque2( .param .u64 lltorque2_param_0, .param .u64 lltorque2_param_1, .param .u64 lltorque2_param_2, .param .u64 lltorque2_param_3, .param .u64 lltorque2_param_4, .param .u64 lltorque2_param_5, .param .u64 lltorque2_param_6, .param .u64 lltorque2_param_7, .param .u64 lltorque2_param_8, .param .u64 lltorque2_param_9, .param .f32 lltorque2_param_10, .param .u32 lltorque2_param_11 ) { .reg .pred %p<3>; .reg .f32 %f<39>; .reg .b32 %r<9>; .reg .b64 %rd<34>; ld.param.u64 %rd1, [lltorque2_param_0]; ld.param.u64 %rd2, [lltorque2_param_1]; ld.param.u64 %rd3, [lltorque2_param_2]; ld.param.u64 %rd4, [lltorque2_param_3]; ld.param.u64 %rd5, [lltorque2_param_4]; ld.param.u64 %rd6, [lltorque2_param_5]; ld.param.u64 %rd7, [lltorque2_param_6]; ld.param.u64 %rd8, [lltorque2_param_7]; ld.param.u64 %rd9, [lltorque2_param_8]; ld.param.u64 %rd10, [lltorque2_param_9]; ld.param.f32 %f38, [lltorque2_param_10]; ld.param.u32 %r2, [lltorque2_param_11]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd11, %rd4; mul.wide.s32 %rd12, %r1, 4; add.s64 %rd13, %rd11, %rd12; ld.global.nc.f32 %f1, [%rd13]; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd12; ld.global.nc.f32 %f2, [%rd15]; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd12; ld.global.nc.f32 %f3, [%rd17]; cvta.to.global.u64 
%rd18, %rd7; add.s64 %rd19, %rd18, %rd12; ld.global.nc.f32 %f4, [%rd19]; cvta.to.global.u64 %rd20, %rd8; add.s64 %rd21, %rd20, %rd12; ld.global.nc.f32 %f5, [%rd21]; cvta.to.global.u64 %rd22, %rd9; add.s64 %rd23, %rd22, %rd12; ld.global.nc.f32 %f6, [%rd23]; setp.eq.s64 %p2, %rd10, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd10; add.s64 %rd26, %rd24, %rd12; ld.global.nc.f32 %f10, [%rd26]; mul.f32 %f38, %f10, %f38; BB0_3: mul.f32 %f11, %f3, %f5; mul.f32 %f12, %f2, %f6; sub.f32 %f13, %f12, %f11; mul.f32 %f14, %f1, %f6; mul.f32 %f15, %f3, %f4; sub.f32 %f16, %f15, %f14; mul.f32 %f17, %f2, %f4; mul.f32 %f18, %f1, %f5; sub.f32 %f19, %f18, %f17; fma.rn.f32 %f20, %f38, %f38, 0f3F800000; mov.f32 %f21, 0fBF800000; div.rn.f32 %f22, %f21, %f20; mul.f32 %f23, %f2, %f19; mul.f32 %f24, %f3, %f16; sub.f32 %f25, %f23, %f24; mul.f32 %f26, %f3, %f13; mul.f32 %f27, %f1, %f19; sub.f32 %f28, %f26, %f27; mul.f32 %f29, %f1, %f16; mul.f32 %f30, %f2, %f13; sub.f32 %f31, %f29, %f30; fma.rn.f32 %f32, %f25, %f38, %f13; fma.rn.f32 %f33, %f28, %f38, %f16; fma.rn.f32 %f34, %f31, %f38, %f19; mul.f32 %f35, %f22, %f32; mul.f32 %f36, %f22, %f33; mul.f32 %f37, %f22, %f34; cvta.to.global.u64 %rd27, %rd1; add.s64 %rd29, %rd27, %rd12; st.global.f32 [%rd29], %f35; cvta.to.global.u64 %rd30, %rd2; add.s64 %rd31, %rd30, %rd12; st.global.f32 [%rd31], %f36; cvta.to.global.u64 %rd32, %rd3; add.s64 %rd33, %rd32, %rd12; st.global.f32 [%rd33], %f37; BB0_4: ret; } ` ) mumax3-3.10/cuda/lut.go000066400000000000000000000005711371432437400147310ustar00rootroot00000000000000package cuda // Look-up tables holding per-region parameter values. // LUT[regions[cellindex]] gives parameter value for cell. import "unsafe" type LUTPtr unsafe.Pointer // points to 256 float32's type LUTPtrs []unsafe.Pointer // elements point to 256 float32's type SymmLUT unsafe.Pointer // points to 256x256 symmetric matrix, only lower half stored. 
See exchange.cu mumax3-3.10/cuda/madd.go000066400000000000000000000117431371432437400150350ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/data" "github.com/mumax/3/util" ) // multiply: dst[i] = a[i] * b[i] // a and b must have the same number of components func Mul(dst, a, b *data.Slice) { N := dst.Len() nComp := dst.NComp() util.Assert(a.Len() == N && a.NComp() == nComp && b.Len() == N && b.NComp() == nComp) cfg := make1DConf(N) for c := 0; c < nComp; c++ { k_mul_async(dst.DevPtr(c), a.DevPtr(c), b.DevPtr(c), N, cfg) } } // divide: dst[i] = a[i] / b[i] // divide-by-zero yields zero. func Div(dst, a, b *data.Slice) { N := dst.Len() nComp := dst.NComp() util.Assert(a.Len() == N && a.NComp() == nComp && b.Len() == N && b.NComp() == nComp) cfg := make1DConf(N) for c := 0; c < nComp; c++ { k_pointwise_div_async(dst.DevPtr(c), a.DevPtr(c), b.DevPtr(c), N, cfg) } } // Add: dst = src1 + src2. func Add(dst, src1, src2 *data.Slice) { Madd2(dst, src1, src2, 1, 1) } // multiply-add: dst[i] = src1[i] * factor1 + src2[i] * factor2 func Madd2(dst, src1, src2 *data.Slice, factor1, factor2 float32) { N := dst.Len() nComp := dst.NComp() util.Assert(src1.Len() == N && src2.Len() == N) util.Assert(src1.NComp() == nComp && src2.NComp() == nComp) cfg := make1DConf(N) for c := 0; c < nComp; c++ { k_madd2_async(dst.DevPtr(c), src1.DevPtr(c), factor1, src2.DevPtr(c), factor2, N, cfg) } } // multiply-add: dst[i] = src1[i] * factor1 + src2[i] * factor2 + src3[i] * factor3 func Madd3(dst, src1, src2, src3 *data.Slice, factor1, factor2, factor3 float32) { N := dst.Len() nComp := dst.NComp() util.Assert(src1.Len() == N && src2.Len() == N && src3.Len() == N) util.Assert(src1.NComp() == nComp && src2.NComp() == nComp && src3.NComp() == nComp) cfg := make1DConf(N) for c := 0; c < nComp; c++ { k_madd3_async(dst.DevPtr(c), src1.DevPtr(c), factor1, src2.DevPtr(c), factor2, src3.DevPtr(c), factor3, N, cfg) } } // multiply-add: dst[i] = src1[i] * factor1 + src2[i] * factor2 + 
src3[i] * factor3 + src4[i] * factor4 func Madd4(dst, src1, src2, src3, src4 *data.Slice, factor1, factor2, factor3, factor4 float32) { N := dst.Len() nComp := dst.NComp() util.Assert(src1.Len() == N && src2.Len() == N && src3.Len() == N && src4.Len() == N) util.Assert(src1.NComp() == nComp && src2.NComp() == nComp && src3.NComp() == nComp && src4.NComp() == nComp) cfg := make1DConf(N) for c := 0; c < nComp; c++ { k_madd4_async(dst.DevPtr(c), src1.DevPtr(c), factor1, src2.DevPtr(c), factor2, src3.DevPtr(c), factor3, src4.DevPtr(c), factor4, N, cfg) } } // multiply-add: dst[i] = src1[i] * factor1 + src2[i] * factor2 + src3[i] * factor3 + src4[i] * factor4 + src5[i] * factor5 func Madd5(dst, src1, src2, src3, src4, src5 *data.Slice, factor1, factor2, factor3, factor4, factor5 float32) { N := dst.Len() nComp := dst.NComp() util.Assert(src1.Len() == N && src2.Len() == N && src3.Len() == N && src4.Len() == N && src5.Len() == N) util.Assert(src1.NComp() == nComp && src2.NComp() == nComp && src3.NComp() == nComp && src4.NComp() == nComp && src5.NComp() == nComp) cfg := make1DConf(N) for c := 0; c < nComp; c++ { k_madd5_async(dst.DevPtr(c), src1.DevPtr(c), factor1, src2.DevPtr(c), factor2, src3.DevPtr(c), factor3, src4.DevPtr(c), factor4, src5.DevPtr(c), factor5, N, cfg) } } // multiply-add: dst[i] = src1[i] * factor1 + src2[i] * factor2 + src3[i] * factor3 + src4[i] * factor4 + src5[i] * factor5 + src6[i] * factor6 func Madd6(dst, src1, src2, src3, src4, src5, src6 *data.Slice, factor1, factor2, factor3, factor4, factor5, factor6 float32) { N := dst.Len() nComp := dst.NComp() util.Assert(src1.Len() == N && src2.Len() == N && src3.Len() == N && src4.Len() == N && src5.Len() == N && src6.Len() == N) util.Assert(src1.NComp() == nComp && src2.NComp() == nComp && src3.NComp() == nComp && src4.NComp() == nComp && src5.NComp() == nComp && src6.NComp() == nComp) cfg := make1DConf(N) for c := 0; c < nComp; c++ { k_madd6_async(dst.DevPtr(c), src1.DevPtr(c), factor1, src2.DevPtr(c), 
factor2, src3.DevPtr(c), factor3, src4.DevPtr(c), factor4, src5.DevPtr(c), factor5, src6.DevPtr(c), factor6, N, cfg) } } // multiply-add: dst[i] = src1[i] * factor1 + src2[i] * factor2 + src3[i] * factor3 + src4[i] * factor4 + src5[i] * factor5 + src6[i] * factor6 + src7[i] * factor7 func Madd7(dst, src1, src2, src3, src4, src5, src6, src7 *data.Slice, factor1, factor2, factor3, factor4, factor5, factor6, factor7 float32) { N := dst.Len() nComp := dst.NComp() util.Assert(src1.Len() == N && src2.Len() == N && src3.Len() == N && src4.Len() == N && src5.Len() == N && src6.Len() == N && src7.Len() == N) util.Assert(src1.NComp() == nComp && src2.NComp() == nComp && src3.NComp() == nComp && src4.NComp() == nComp && src5.NComp() == nComp && src6.NComp() == nComp && src7.NComp() == nComp) cfg := make1DConf(N) for c := 0; c < nComp; c++ { k_madd7_async(dst.DevPtr(c), src1.DevPtr(c), factor1, src2.DevPtr(c), factor2, src3.DevPtr(c), factor3, src4.DevPtr(c), factor4, src5.DevPtr(c), factor5, src6.DevPtr(c), factor6, src7.DevPtr(c), factor7, N, cfg) } } mumax3-3.10/cuda/madd2.cu000066400000000000000000000005401371432437400151120ustar00rootroot00000000000000 // dst[i] = fac1*src1[i] + fac2*src2[i]; extern "C" __global__ void madd2(float* __restrict__ dst, float* __restrict__ src1, float fac1, float* __restrict__ src2, float fac2, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if(i < N) { dst[i] = fac1*src1[i] + fac2*src2[i]; } } mumax3-3.10/cuda/madd2_wrapper.go000066400000000000000000000425601371432437400166600ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for madd2 kernel var madd2_code cu.Function // Stores the arguments for madd2 kernel invocation type madd2_args_t struct { arg_dst unsafe.Pointer arg_src1 unsafe.Pointer arg_fac1 float32 arg_src2 unsafe.Pointer arg_fac2 float32 arg_N int argptr [6]unsafe.Pointer sync.Mutex } // Stores the arguments for madd2 kernel invocation var madd2_args madd2_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. madd2_args.argptr[0] = unsafe.Pointer(&madd2_args.arg_dst) madd2_args.argptr[1] = unsafe.Pointer(&madd2_args.arg_src1) madd2_args.argptr[2] = unsafe.Pointer(&madd2_args.arg_fac1) madd2_args.argptr[3] = unsafe.Pointer(&madd2_args.arg_src2) madd2_args.argptr[4] = unsafe.Pointer(&madd2_args.arg_fac2) madd2_args.argptr[5] = unsafe.Pointer(&madd2_args.arg_N) } // Wrapper for madd2 CUDA kernel, asynchronous. func k_madd2_async(dst unsafe.Pointer, src1 unsafe.Pointer, fac1 float32, src2 unsafe.Pointer, fac2 float32, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("madd2") } madd2_args.Lock() defer madd2_args.Unlock() if madd2_code == 0 { madd2_code = fatbinLoad(madd2_map, "madd2") } madd2_args.arg_dst = dst madd2_args.arg_src1 = src1 madd2_args.arg_fac1 = fac1 madd2_args.arg_src2 = src2 madd2_args.arg_fac2 = fac2 madd2_args.arg_N = N args := madd2_args.argptr[:] cu.LaunchKernel(madd2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("madd2") } } // maps compute capability on PTX code for madd2 kernel. var madd2_map = map[int]string{0: "", 30: madd2_ptx_30, 32: madd2_ptx_32, 35: madd2_ptx_35, 37: madd2_ptx_37, 50: madd2_ptx_50, 52: madd2_ptx_52, 53: madd2_ptx_53, 60: madd2_ptx_60, 61: madd2_ptx_61, 62: madd2_ptx_62, 70: madd2_ptx_70, 72: madd2_ptx_72, 75: madd2_ptx_75} // madd2 PTX code for various compute capabilities. 
const ( madd2_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; BB0_2: ret; } ` madd2_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.nc.f32 %f3, 
[%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; BB0_2: ret; } ` madd2_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.nc.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; BB0_2: ret; } ` madd2_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; 
mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.nc.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; BB0_2: ret; } ` madd2_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.nc.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; BB0_2: ret; } ` madd2_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, 
[madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.nc.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; BB0_2: ret; } ` madd2_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.nc.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; BB0_2: ret; } ` madd2_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl madd2 .visible .entry madd2( .param 
.u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.nc.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; BB0_2: ret; } ` madd2_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.nc.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, 
%f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; BB0_2: ret; } ` madd2_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.nc.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; BB0_2: ret; } ` madd2_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; 
setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.nc.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; BB0_2: ret; } ` madd2_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, [madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.nc.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; BB0_2: ret; } ` madd2_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl madd2 .visible .entry madd2( .param .u64 madd2_param_0, .param .u64 madd2_param_1, .param .f32 madd2_param_2, .param .u64 madd2_param_3, .param .f32 madd2_param_4, .param .u32 madd2_param_5 ) { .reg .pred %p<2>; .reg .f32 %f<7>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [madd2_param_0]; ld.param.u64 %rd2, [madd2_param_1]; ld.param.f32 %f1, [madd2_param_2]; ld.param.u64 %rd3, 
[madd2_param_3]; ld.param.f32 %f2, [madd2_param_4]; ld.param.u32 %r2, [madd2_param_5]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; ld.global.nc.f32 %f3, [%rd6]; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f4, [%rd8]; mul.f32 %f5, %f4, %f2; fma.rn.f32 %f6, %f3, %f1, %f5; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f6; BB0_2: ret; } ` ) mumax3-3.10/cuda/madd3.cu000066400000000000000000000007511371432437400151170ustar00rootroot00000000000000 // dst[i] = fac1 * src1[i] + fac2 * src2[i] + fac3 * src3[i] extern "C" __global__ void madd3(float* __restrict__ dst, float* __restrict__ src1, float fac1, float* __restrict__ src2, float fac2, float* __restrict__ src3, float fac3, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if(i < N) { dst[i] = (fac1 * src1[i]) + (fac2 * src2[i] + fac3 * src3[i]); // parens for better accuracy heun solver. } } mumax3-3.10/cuda/madd3_wrapper.go000066400000000000000000000517251371432437400166640ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for madd3 kernel var madd3_code cu.Function // Stores the arguments for madd3 kernel invocation type madd3_args_t struct { arg_dst unsafe.Pointer arg_src1 unsafe.Pointer arg_fac1 float32 arg_src2 unsafe.Pointer arg_fac2 float32 arg_src3 unsafe.Pointer arg_fac3 float32 arg_N int argptr [8]unsafe.Pointer sync.Mutex } // Stores the arguments for madd3 kernel invocation var madd3_args madd3_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
madd3_args.argptr = [8]unsafe.Pointer{
		unsafe.Pointer(&madd3_args.arg_dst),
		unsafe.Pointer(&madd3_args.arg_src1),
		unsafe.Pointer(&madd3_args.arg_fac1),
		unsafe.Pointer(&madd3_args.arg_src2),
		unsafe.Pointer(&madd3_args.arg_fac2),
		unsafe.Pointer(&madd3_args.arg_src3),
		unsafe.Pointer(&madd3_args.arg_fac3),
		unsafe.Pointer(&madd3_args.arg_N),
	}
}

// k_madd3_async launches the madd3 CUDA kernel on stream0 without waiting
// for it to finish (asynchronous wrapper).
//
// The kernel source (madd3.cu) computes, for every i < N:
//
//	dst[i] = fac1*src1[i] + fac2*src2[i] + fac3*src3[i]
//
// cfg supplies the grid and block dimensions of the launch. The arguments
// are written into the package-level madd3_args, whose embedded mutex is
// held until this function returns. When the package-level Synchronous
// debug flag is set, the call is bracketed by Sync() and timed under the
// "madd3" timer key.
//
// NOTE(review): arg_N is a Go int, while the PTX declares the matching
// parameter as .u32; presumably only the low 32 bits are read by the
// kernel -- confirm before relying on N >= 2^31.
func k_madd3_async(dst unsafe.Pointer, src1 unsafe.Pointer, fac1 float32, src2 unsafe.Pointer, fac2 float32, src3 unsafe.Pointer, fac3 float32, N int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("madd3")
	}

	a := &madd3_args
	a.Lock()
	defer a.Unlock()

	// Load the kernel on first use; a zero handle means "not loaded yet".
	if madd3_code == 0 {
		madd3_code = fatbinLoad(madd3_map, "madd3")
	}

	// Fill the argument slots that argptr points at (wired up in init).
	a.arg_dst = dst
	a.arg_src1, a.arg_fac1 = src1, fac1
	a.arg_src2, a.arg_fac2 = src2, fac2
	a.arg_src3, a.arg_fac3 = src3, fac3
	a.arg_N = N

	cu.LaunchKernel(madd3_code,
		cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z,
		cfg.Block.X, cfg.Block.Y, cfg.Block.Z,
		0, stream0, a.argptr[:])

	if Synchronous { // debug
		Sync()
		timer.Stop("madd3")
	}
}

// madd3_map maps a compute capability (e.g. 35 for sm_35) to the PTX code
// of the madd3 kernel built for that target. Key 0 maps to the empty string.
var madd3_map = map[int]string{
	0:  "",
	30: madd3_ptx_30,
	32: madd3_ptx_32,
	35: madd3_ptx_35,
	37: madd3_ptx_37,
	50: madd3_ptx_50,
	52: madd3_ptx_52,
	53: madd3_ptx_53,
	60: madd3_ptx_60,
	61: madd3_ptx_61,
	62: madd3_ptx_62,
	70: madd3_ptx_70,
	72: madd3_ptx_72,
	75: madd3_ptx_75,
}

// madd3 PTX code for various compute capabilities.
const ( madd3_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; BB0_2: ret; } ` madd3_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, 
[madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; BB0_2: ret; } ` madd3_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; 
fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; BB0_2: ret; } ` madd3_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; BB0_2: ret; } ` madd3_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, 
[madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; BB0_2: ret; } ` madd3_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f5, [%rd9]; 
cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; BB0_2: ret; } ` madd3_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; BB0_2: ret; } ` madd3_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg 
.f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; BB0_2: ret; } ` madd3_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, 
%rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; BB0_2: ret; } ` madd3_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; BB0_2: ret; } ` madd3_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param 
.f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; BB0_2: ret; } ` madd3_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, 
%r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; BB0_2: ret; } ` madd3_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl madd3 .visible .entry madd3( .param .u64 madd3_param_0, .param .u64 madd3_param_1, .param .f32 madd3_param_2, .param .u64 madd3_param_3, .param .f32 madd3_param_4, .param .u64 madd3_param_5, .param .f32 madd3_param_6, .param .u32 madd3_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<9>; .reg .b64 %rd<14>; ld.param.u64 %rd1, [madd3_param_0]; ld.param.u64 %rd2, [madd3_param_1]; ld.param.f32 %f1, [madd3_param_2]; ld.param.u64 %rd3, [madd3_param_3]; ld.param.f32 %f2, [madd3_param_4]; ld.param.u64 %rd4, [madd3_param_5]; ld.param.f32 %f3, [madd3_param_6]; ld.param.u32 %r2, [madd3_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd5, %rd2; mul.wide.s32 %rd6, %r1, 4; add.s64 %rd7, %rd5, %rd6; ld.global.nc.f32 %f4, [%rd7]; cvta.to.global.u64 %rd8, %rd3; add.s64 %rd9, %rd8, %rd6; ld.global.nc.f32 %f5, [%rd9]; cvta.to.global.u64 %rd10, %rd4; add.s64 %rd11, %rd10, %rd6; ld.global.nc.f32 %f6, [%rd11]; mul.f32 %f7, %f6, %f3; fma.rn.f32 %f8, %f5, %f2, %f7; fma.rn.f32 %f9, %f4, %f1, %f8; cvta.to.global.u64 %rd12, %rd1; add.s64 %rd13, %rd12, %rd6; st.global.f32 [%rd13], %f9; BB0_2: ret; } ` ) 
mumax3-3.10/cuda/madd4.cu000066400000000000000000000010001371432437400151040ustar00rootroot00000000000000 // dst[i] = src1[i] * fac1 + src2[i] * fac2 + src3[i] * fac3 + src4[i] * fac4 extern "C" __global__ void madd4(float* __restrict__ dst, float* __restrict__ src1, float fac1, float* __restrict__ src2, float fac2, float* __restrict__ src3, float fac3, float* __restrict__ src4, float fac4, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if(i < N) { dst[i] = (fac1*src1[i]) + (fac2*src2[i]) + (fac3*src3[i]) + (fac4*src4[i]); } } mumax3-3.10/cuda/madd4_wrapper.go000066400000000000000000000467561371432437400166750ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for madd4 kernel var madd4_code cu.Function // Stores the arguments for madd4 kernel invocation type madd4_args_t struct { arg_dst unsafe.Pointer arg_src1 unsafe.Pointer arg_fac1 float32 arg_src2 unsafe.Pointer arg_fac2 float32 arg_src3 unsafe.Pointer arg_fac3 float32 arg_src4 unsafe.Pointer arg_fac4 float32 arg_N int argptr [10]unsafe.Pointer sync.Mutex } // Stores the arguments for madd4 kernel invocation var madd4_args madd4_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
madd4_args.argptr = [10]unsafe.Pointer{
		unsafe.Pointer(&madd4_args.arg_dst),
		unsafe.Pointer(&madd4_args.arg_src1),
		unsafe.Pointer(&madd4_args.arg_fac1),
		unsafe.Pointer(&madd4_args.arg_src2),
		unsafe.Pointer(&madd4_args.arg_fac2),
		unsafe.Pointer(&madd4_args.arg_src3),
		unsafe.Pointer(&madd4_args.arg_fac3),
		unsafe.Pointer(&madd4_args.arg_src4),
		unsafe.Pointer(&madd4_args.arg_fac4),
		unsafe.Pointer(&madd4_args.arg_N),
	}
}

// k_madd4_async launches the madd4 CUDA kernel on stream0 without waiting
// for it to finish (asynchronous wrapper).
//
// The kernel source (madd4.cu) computes, for every i < N:
//
//	dst[i] = fac1*src1[i] + fac2*src2[i] + fac3*src3[i] + fac4*src4[i]
//
// cfg supplies the grid and block dimensions of the launch. The arguments
// are written into the package-level madd4_args, whose embedded mutex is
// held until this function returns. When the package-level Synchronous
// debug flag is set, the call is bracketed by Sync() and timed under the
// "madd4" timer key.
//
// NOTE(review): arg_N is a Go int, while the PTX declares the matching
// parameter as .u32; presumably only the low 32 bits are read by the
// kernel -- confirm before relying on N >= 2^31.
func k_madd4_async(dst unsafe.Pointer, src1 unsafe.Pointer, fac1 float32, src2 unsafe.Pointer, fac2 float32, src3 unsafe.Pointer, fac3 float32, src4 unsafe.Pointer, fac4 float32, N int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("madd4")
	}

	a := &madd4_args
	a.Lock()
	defer a.Unlock()

	// Load the kernel on first use; a zero handle means "not loaded yet".
	if madd4_code == 0 {
		madd4_code = fatbinLoad(madd4_map, "madd4")
	}

	// Fill the argument slots that argptr points at (wired up in init).
	a.arg_dst = dst
	a.arg_src1, a.arg_fac1 = src1, fac1
	a.arg_src2, a.arg_fac2 = src2, fac2
	a.arg_src3, a.arg_fac3 = src3, fac3
	a.arg_src4, a.arg_fac4 = src4, fac4
	a.arg_N = N

	cu.LaunchKernel(madd4_code,
		cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z,
		cfg.Block.X, cfg.Block.Y, cfg.Block.Z,
		0, stream0, a.argptr[:])

	if Synchronous { // debug
		Sync()
		timer.Stop("madd4")
	}
}

// madd4_map maps a compute capability (e.g. 35 for sm_35) to the PTX code
// of the madd4 kernel built for that target. Key 0 maps to the empty string.
var madd4_map = map[int]string{
	0:  "",
	30: madd4_ptx_30,
	35: madd4_ptx_35,
	37: madd4_ptx_37,
	50: madd4_ptx_50,
	52: madd4_ptx_52,
	53: madd4_ptx_53,
	60: madd4_ptx_60,
	61: madd4_ptx_61,
	70: madd4_ptx_70,
	75: madd4_ptx_75,
}

// madd4 PTX code for various compute capabilities.
const (
	// PTX for .target sm_30. The four inputs are read with ld.global.f32.
	madd4_ptx_30 = `
.version 6.4
.target sm_30
.address_size 64
	// .globl madd4
.visible .entry madd4(
	.param .u64 madd4_param_0,
	.param .u64 madd4_param_1,
	.param .f32 madd4_param_2,
	.param .u64 madd4_param_3,
	.param .f32 madd4_param_4,
	.param .u64 madd4_param_5,
	.param .f32 madd4_param_6,
	.param .u64 madd4_param_7,
	.param .f32 madd4_param_8,
	.param .u32 madd4_param_9
)
{
	.reg .pred %p<2>;
	.reg .f32 %f<13>;
	.reg .b32 %r<9>;
	.reg .b64 %rd<17>;
	ld.param.u64 %rd1, [madd4_param_0];
	ld.param.u64 %rd2, [madd4_param_1];
	ld.param.f32 %f1, [madd4_param_2];
	ld.param.u64 %rd3, [madd4_param_3];
	ld.param.f32 %f2, [madd4_param_4];
	ld.param.u64 %rd4, [madd4_param_5];
	ld.param.f32 %f3, [madd4_param_6];
	ld.param.u64 %rd5, [madd4_param_7];
	ld.param.f32 %f4, [madd4_param_8];
	ld.param.u32 %r2, [madd4_param_9];
	mov.u32 %r3, %ctaid.y;
	mov.u32 %r4, %nctaid.x;
	mov.u32 %r5, %ctaid.x;
	mad.lo.s32 %r6, %r4, %r3, %r5;
	mov.u32 %r7, %ntid.x;
	mov.u32 %r8, %tid.x;
	mad.lo.s32 %r1, %r6, %r7, %r8;
	setp.ge.s32 %p1, %r1, %r2;
	@%p1 bra BB0_2;
	cvta.to.global.u64 %rd6, %rd2;
	mul.wide.s32 %rd7, %r1, 4;
	add.s64 %rd8, %rd6, %rd7;
	ld.global.f32 %f5, [%rd8];
	cvta.to.global.u64 %rd9, %rd3;
	add.s64 %rd10, %rd9, %rd7;
	ld.global.f32 %f6, [%rd10];
	mul.f32 %f7, %f6, %f2;
	fma.rn.f32 %f8, %f5, %f1, %f7;
	cvta.to.global.u64 %rd11, %rd4;
	add.s64 %rd12, %rd11, %rd7;
	ld.global.f32 %f9, [%rd12];
	fma.rn.f32 %f10, %f9, %f3, %f8;
	cvta.to.global.u64 %rd13, %rd5;
	add.s64 %rd14, %rd13, %rd7;
	ld.global.f32 %f11, [%rd14];
	fma.rn.f32 %f12, %f11, %f4, %f10;
	cvta.to.global.u64 %rd15, %rd1;
	add.s64 %rd16, %rd15, %rd7;
	st.global.f32 [%rd16], %f12;
BB0_2:
	ret;
}
`
	// PTX for .target sm_35. The four inputs are read with ld.global.nc.f32.
	madd4_ptx_35 = `
.version 6.4
.target sm_35
.address_size 64
	// .globl madd4
.visible .entry madd4(
	.param .u64 madd4_param_0,
	.param .u64 madd4_param_1,
	.param .f32 madd4_param_2,
	.param .u64 madd4_param_3,
	.param .f32 madd4_param_4,
	.param .u64 madd4_param_5,
	.param .f32 madd4_param_6,
	.param .u64 madd4_param_7,
	.param .f32 madd4_param_8,
	.param .u32 madd4_param_9
)
{
	.reg .pred %p<2>;
	.reg .f32 %f<13>;
	.reg .b32 %r<9>;
	.reg .b64 %rd<17>;
	ld.param.u64 %rd1, [madd4_param_0];
	ld.param.u64 %rd2, [madd4_param_1];
	ld.param.f32 %f1, [madd4_param_2];
	ld.param.u64 %rd3, [madd4_param_3];
	ld.param.f32 %f2, [madd4_param_4];
	ld.param.u64 %rd4, [madd4_param_5];
	ld.param.f32 %f3, [madd4_param_6];
	ld.param.u64 %rd5, [madd4_param_7];
	ld.param.f32 %f4, [madd4_param_8];
	ld.param.u32 %r2, [madd4_param_9];
	mov.u32 %r3, %ctaid.y;
	mov.u32 %r4, %nctaid.x;
	mov.u32 %r5, %ctaid.x;
	mad.lo.s32 %r6, %r4, %r3, %r5;
	mov.u32 %r7, %ntid.x;
	mov.u32 %r8, %tid.x;
	mad.lo.s32 %r1, %r6, %r7, %r8;
	setp.ge.s32 %p1, %r1, %r2;
	@%p1 bra BB0_2;
	cvta.to.global.u64 %rd6, %rd2;
	mul.wide.s32 %rd7, %r1, 4;
	add.s64 %rd8, %rd6, %rd7;
	ld.global.nc.f32 %f5, [%rd8];
	cvta.to.global.u64 %rd9, %rd3;
	add.s64 %rd10, %rd9, %rd7;
	ld.global.nc.f32 %f6, [%rd10];
	mul.f32 %f7, %f6, %f2;
	fma.rn.f32 %f8, %f5, %f1, %f7;
	cvta.to.global.u64 %rd11, %rd4;
	add.s64 %rd12, %rd11, %rd7;
	ld.global.nc.f32 %f9, [%rd12];
	fma.rn.f32 %f10, %f9, %f3, %f8;
	cvta.to.global.u64 %rd13, %rd5;
	add.s64 %rd14, %rd13, %rd7;
	ld.global.nc.f32 %f11, [%rd14];
	fma.rn.f32 %f12, %f11, %f4, %f10;
	cvta.to.global.u64 %rd15, %rd1;
	add.s64 %rd16, %rd15, %rd7;
	st.global.f32 [%rd16], %f12;
BB0_2:
	ret;
}
`
	// PTX for .target sm_37. The four inputs are read with ld.global.nc.f32.
	madd4_ptx_37 = `
.version 6.4
.target sm_37
.address_size 64
	// .globl madd4
.visible .entry madd4(
	.param .u64 madd4_param_0,
	.param .u64 madd4_param_1,
	.param .f32 madd4_param_2,
	.param .u64 madd4_param_3,
	.param .f32 madd4_param_4,
	.param .u64 madd4_param_5,
	.param .f32 madd4_param_6,
	.param .u64 madd4_param_7,
	.param .f32 madd4_param_8,
	.param .u32 madd4_param_9
)
{
	.reg .pred %p<2>;
	.reg .f32 %f<13>;
	.reg .b32 %r<9>;
	.reg .b64 %rd<17>;
	ld.param.u64 %rd1, [madd4_param_0];
	ld.param.u64 %rd2, [madd4_param_1];
	ld.param.f32 %f1, [madd4_param_2];
	ld.param.u64 %rd3, [madd4_param_3];
	ld.param.f32 %f2, [madd4_param_4];
	ld.param.u64 %rd4, [madd4_param_5];
	ld.param.f32 %f3, [madd4_param_6];
	ld.param.u64 %rd5, [madd4_param_7];
	ld.param.f32 %f4, [madd4_param_8];
	ld.param.u32 %r2, [madd4_param_9];
	mov.u32 %r3, %ctaid.y;
	mov.u32 %r4, %nctaid.x;
	mov.u32 %r5, %ctaid.x;
	mad.lo.s32 %r6, %r4, %r3, %r5;
	mov.u32 %r7, %ntid.x;
	mov.u32 %r8, %tid.x;
	mad.lo.s32 %r1, %r6, %r7, %r8;
	setp.ge.s32 %p1, %r1, %r2;
	@%p1 bra BB0_2;
	cvta.to.global.u64 %rd6, %rd2;
	mul.wide.s32 %rd7, %r1, 4;
	add.s64 %rd8, %rd6, %rd7;
	ld.global.nc.f32 %f5, [%rd8];
	cvta.to.global.u64 %rd9, %rd3;
	add.s64 %rd10, %rd9, %rd7;
	ld.global.nc.f32 %f6, [%rd10];
	mul.f32 %f7, %f6, %f2;
	fma.rn.f32 %f8, %f5, %f1, %f7;
	cvta.to.global.u64 %rd11, %rd4;
	add.s64 %rd12, %rd11, %rd7;
	ld.global.nc.f32 %f9, [%rd12];
	fma.rn.f32 %f10, %f9, %f3, %f8;
	cvta.to.global.u64 %rd13, %rd5;
	add.s64 %rd14, %rd13, %rd7;
	ld.global.nc.f32 %f11, [%rd14];
	fma.rn.f32 %f12, %f11, %f4, %f10;
	cvta.to.global.u64 %rd15, %rd1;
	add.s64 %rd16, %rd15, %rd7;
	st.global.f32 [%rd16], %f12;
BB0_2:
	ret;
}
`
	// PTX for .target sm_50. The four inputs are read with ld.global.nc.f32.
	madd4_ptx_50 = `
.version 6.4
.target sm_50
.address_size 64
	// .globl madd4
.visible .entry madd4(
	.param .u64 madd4_param_0,
	.param .u64 madd4_param_1,
	.param .f32 madd4_param_2,
	.param .u64 madd4_param_3,
	.param .f32 madd4_param_4,
	.param .u64 madd4_param_5,
	.param .f32 madd4_param_6,
	.param .u64 madd4_param_7,
	.param .f32 madd4_param_8,
	.param .u32 madd4_param_9
)
{
	.reg .pred %p<2>;
	.reg .f32 %f<13>;
	.reg .b32 %r<9>;
	.reg .b64 %rd<17>;
	ld.param.u64 %rd1, [madd4_param_0];
	ld.param.u64 %rd2, [madd4_param_1];
	ld.param.f32 %f1, [madd4_param_2];
	ld.param.u64 %rd3, [madd4_param_3];
	ld.param.f32 %f2, [madd4_param_4];
	ld.param.u64 %rd4, [madd4_param_5];
	ld.param.f32 %f3, [madd4_param_6];
	ld.param.u64 %rd5, [madd4_param_7];
	ld.param.f32 %f4, [madd4_param_8];
	ld.param.u32 %r2, [madd4_param_9];
	mov.u32 %r3, %ctaid.y;
	mov.u32 %r4, %nctaid.x;
	mov.u32 %r5, %ctaid.x;
	mad.lo.s32 %r6, %r4, %r3, %r5;
	mov.u32 %r7, %ntid.x;
	mov.u32 %r8, %tid.x;
	mad.lo.s32 %r1, %r6, %r7, %r8;
	setp.ge.s32 %p1, %r1, %r2;
	@%p1 bra BB0_2;
	cvta.to.global.u64 %rd6, %rd2;
	mul.wide.s32 %rd7, %r1, 4;
	add.s64 %rd8, %rd6, %rd7;
	ld.global.nc.f32 %f5, [%rd8];
	cvta.to.global.u64 %rd9, %rd3;
	add.s64 %rd10, %rd9, %rd7;
	ld.global.nc.f32 %f6, [%rd10];
	mul.f32 %f7, %f6, %f2;
	fma.rn.f32 %f8, %f5, %f1, %f7;
	cvta.to.global.u64 %rd11, %rd4;
	add.s64 %rd12, %rd11, %rd7;
	ld.global.nc.f32 %f9, [%rd12];
	fma.rn.f32 %f10, %f9, %f3, %f8;
	cvta.to.global.u64 %rd13, %rd5;
	add.s64 %rd14, %rd13, %rd7;
	ld.global.nc.f32 %f11, [%rd14];
	fma.rn.f32 %f12, %f11, %f4, %f10;
	cvta.to.global.u64 %rd15, %rd1;
	add.s64 %rd16, %rd15, %rd7;
	st.global.f32 [%rd16], %f12;
BB0_2:
	ret;
}
`
	// PTX for .target sm_52. The four inputs are read with ld.global.nc.f32.
	madd4_ptx_52 = `
.version 6.4
.target sm_52
.address_size 64
	// .globl madd4
.visible .entry madd4(
	.param .u64 madd4_param_0,
	.param .u64 madd4_param_1,
	.param .f32 madd4_param_2,
	.param .u64 madd4_param_3,
	.param .f32 madd4_param_4,
	.param .u64 madd4_param_5,
	.param .f32 madd4_param_6,
	.param .u64 madd4_param_7,
	.param .f32 madd4_param_8,
	.param .u32 madd4_param_9
)
{
	.reg .pred %p<2>;
	.reg .f32 %f<13>;
	.reg .b32 %r<9>;
	.reg .b64 %rd<17>;
	ld.param.u64 %rd1, [madd4_param_0];
	ld.param.u64 %rd2, [madd4_param_1];
	ld.param.f32 %f1, [madd4_param_2];
	ld.param.u64 %rd3, [madd4_param_3];
	ld.param.f32 %f2, [madd4_param_4];
	ld.param.u64 %rd4, [madd4_param_5];
	ld.param.f32 %f3, [madd4_param_6];
	ld.param.u64 %rd5, [madd4_param_7];
	ld.param.f32 %f4, [madd4_param_8];
	ld.param.u32 %r2, [madd4_param_9];
	mov.u32 %r3, %ctaid.y;
	mov.u32 %r4, %nctaid.x;
	mov.u32 %r5, %ctaid.x;
	mad.lo.s32 %r6, %r4, %r3, %r5;
	mov.u32 %r7, %ntid.x;
	mov.u32 %r8, %tid.x;
	mad.lo.s32 %r1, %r6, %r7, %r8;
	setp.ge.s32 %p1, %r1, %r2;
	@%p1 bra BB0_2;
	cvta.to.global.u64 %rd6, %rd2;
	mul.wide.s32 %rd7, %r1, 4;
	add.s64 %rd8, %rd6, %rd7;
	ld.global.nc.f32 %f5, [%rd8];
	cvta.to.global.u64 %rd9, %rd3;
	add.s64 %rd10, %rd9, %rd7;
	ld.global.nc.f32 %f6, [%rd10];
	mul.f32 %f7, %f6, %f2;
	fma.rn.f32 %f8, %f5, %f1, %f7;
	cvta.to.global.u64 %rd11, %rd4;
	add.s64 %rd12, %rd11, %rd7;
	ld.global.nc.f32 %f9, [%rd12];
	fma.rn.f32 %f10, %f9, %f3, %f8;
	cvta.to.global.u64 %rd13, %rd5;
	add.s64 %rd14, %rd13, %rd7;
	ld.global.nc.f32 %f11, [%rd14];
	fma.rn.f32 %f12, %f11, %f4, %f10;
	cvta.to.global.u64 %rd15, %rd1;
	add.s64 %rd16, %rd15, %rd7;
	st.global.f32 [%rd16], %f12;
BB0_2:
	ret;
}
`
	// PTX for .target sm_53. The four inputs are read with ld.global.nc.f32.
	madd4_ptx_53 = `
.version 6.4
.target sm_53
.address_size 64
	// .globl madd4
.visible .entry madd4(
	.param .u64 madd4_param_0,
	.param .u64 madd4_param_1,
	.param .f32 madd4_param_2,
	.param .u64 madd4_param_3,
	.param .f32 madd4_param_4,
	.param .u64 madd4_param_5,
	.param .f32 madd4_param_6,
	.param .u64 madd4_param_7,
	.param .f32 madd4_param_8,
	.param .u32 madd4_param_9
)
{
	.reg .pred %p<2>;
	.reg .f32 %f<13>;
	.reg .b32 %r<9>;
	.reg .b64 %rd<17>;
	ld.param.u64 %rd1, [madd4_param_0];
	ld.param.u64 %rd2, [madd4_param_1];
	ld.param.f32 %f1, [madd4_param_2];
	ld.param.u64 %rd3, [madd4_param_3];
	ld.param.f32 %f2, [madd4_param_4];
	ld.param.u64 %rd4, [madd4_param_5];
	ld.param.f32 %f3, [madd4_param_6];
	ld.param.u64 %rd5, [madd4_param_7];
	ld.param.f32 %f4, [madd4_param_8];
	ld.param.u32 %r2, [madd4_param_9];
	mov.u32 %r3, %ctaid.y;
	mov.u32 %r4, %nctaid.x;
	mov.u32 %r5, %ctaid.x;
	mad.lo.s32 %r6, %r4, %r3, %r5;
	mov.u32 %r7, %ntid.x;
	mov.u32 %r8, %tid.x;
	mad.lo.s32 %r1, %r6, %r7, %r8;
	setp.ge.s32 %p1, %r1, %r2;
	@%p1 bra BB0_2;
	cvta.to.global.u64 %rd6, %rd2;
	mul.wide.s32 %rd7, %r1, 4;
	add.s64 %rd8, %rd6, %rd7;
	ld.global.nc.f32 %f5, [%rd8];
	cvta.to.global.u64 %rd9, %rd3;
	add.s64 %rd10, %rd9, %rd7;
	ld.global.nc.f32 %f6, [%rd10];
	mul.f32 %f7, %f6, %f2;
	fma.rn.f32 %f8, %f5, %f1, %f7;
	cvta.to.global.u64 %rd11, %rd4;
	add.s64 %rd12, %rd11, %rd7;
	ld.global.nc.f32 %f9, [%rd12];
	fma.rn.f32 %f10, %f9, %f3, %f8;
	cvta.to.global.u64 %rd13, %rd5;
	add.s64 %rd14, %rd13, %rd7;
	ld.global.nc.f32 %f11, [%rd14];
	fma.rn.f32 %f12, %f11, %f4, %f10;
	cvta.to.global.u64 %rd15, %rd1;
	add.s64 %rd16, %rd15, %rd7;
	st.global.f32 [%rd16], %f12;
BB0_2:
	ret;
}
`
	// PTX for .target sm_60. The four inputs are read with ld.global.nc.f32.
	madd4_ptx_60 = `
.version 6.4
.target sm_60
.address_size 64
	// .globl madd4
.visible .entry madd4(
	.param .u64 madd4_param_0,
	.param .u64 madd4_param_1,
	.param .f32 madd4_param_2,
	.param .u64 madd4_param_3,
	.param .f32 madd4_param_4,
	.param .u64 madd4_param_5,
	.param .f32 madd4_param_6,
	.param .u64 madd4_param_7,
	.param .f32 madd4_param_8,
	.param .u32 madd4_param_9
)
{
	.reg .pred %p<2>;
	.reg .f32 %f<13>;
	.reg .b32 %r<9>;
	.reg .b64 %rd<17>;
	ld.param.u64 %rd1, [madd4_param_0];
	ld.param.u64 %rd2, [madd4_param_1];
	ld.param.f32 %f1, [madd4_param_2];
	ld.param.u64 %rd3, [madd4_param_3];
	ld.param.f32 %f2, [madd4_param_4];
	ld.param.u64 %rd4, [madd4_param_5];
	ld.param.f32 %f3, [madd4_param_6];
	ld.param.u64 %rd5, [madd4_param_7];
	ld.param.f32 %f4, [madd4_param_8];
	ld.param.u32 %r2, [madd4_param_9];
	mov.u32 %r3, %ctaid.y;
	mov.u32 %r4, %nctaid.x;
	mov.u32 %r5, %ctaid.x;
	mad.lo.s32 %r6, %r4, %r3, %r5;
	mov.u32 %r7, %ntid.x;
	mov.u32 %r8, %tid.x;
	mad.lo.s32 %r1, %r6, %r7, %r8;
	setp.ge.s32 %p1, %r1, %r2;
	@%p1 bra BB0_2;
	cvta.to.global.u64 %rd6, %rd2;
	mul.wide.s32 %rd7, %r1, 4;
	add.s64 %rd8, %rd6, %rd7;
	ld.global.nc.f32 %f5, [%rd8];
	cvta.to.global.u64 %rd9, %rd3;
	add.s64 %rd10, %rd9, %rd7;
	ld.global.nc.f32 %f6, [%rd10];
	mul.f32 %f7, %f6, %f2;
	fma.rn.f32 %f8, %f5, %f1, %f7;
	cvta.to.global.u64 %rd11, %rd4;
	add.s64 %rd12, %rd11, %rd7;
	ld.global.nc.f32 %f9, [%rd12];
	fma.rn.f32 %f10, %f9, %f3, %f8;
	cvta.to.global.u64 %rd13, %rd5;
	add.s64 %rd14, %rd13, %rd7;
	ld.global.nc.f32 %f11, [%rd14];
	fma.rn.f32 %f12, %f11, %f4, %f10;
	cvta.to.global.u64 %rd15, %rd1;
	add.s64 %rd16, %rd15, %rd7;
	st.global.f32 [%rd16], %f12;
BB0_2:
	ret;
}
`
	// PTX for .target sm_61. The four inputs are read with ld.global.nc.f32.
	madd4_ptx_61 = `
.version 6.4
.target sm_61
.address_size 64
	// .globl madd4
.visible .entry madd4(
	.param .u64 madd4_param_0,
	.param .u64 madd4_param_1,
	.param .f32 madd4_param_2,
	.param .u64 madd4_param_3,
	.param .f32 madd4_param_4,
	.param .u64 madd4_param_5,
	.param .f32 madd4_param_6,
	.param .u64 madd4_param_7,
	.param .f32 madd4_param_8,
	.param .u32 madd4_param_9
)
{
	.reg .pred %p<2>;
	.reg .f32 %f<13>;
	.reg .b32 %r<9>;
	.reg .b64 %rd<17>;
	ld.param.u64 %rd1, [madd4_param_0];
	ld.param.u64 %rd2, [madd4_param_1];
	ld.param.f32 %f1, [madd4_param_2];
	ld.param.u64 %rd3, [madd4_param_3];
	ld.param.f32 %f2, [madd4_param_4];
	ld.param.u64 %rd4, [madd4_param_5];
	ld.param.f32 %f3, [madd4_param_6];
	ld.param.u64 %rd5, [madd4_param_7];
	ld.param.f32 %f4, [madd4_param_8];
	ld.param.u32 %r2, [madd4_param_9];
	mov.u32 %r3, %ctaid.y;
	mov.u32 %r4, %nctaid.x;
	mov.u32 %r5, %ctaid.x;
	mad.lo.s32 %r6, %r4, %r3, %r5;
	mov.u32 %r7, %ntid.x;
	mov.u32 %r8, %tid.x;
	mad.lo.s32 %r1, %r6, %r7, %r8;
	setp.ge.s32 %p1, %r1, %r2;
	@%p1 bra BB0_2;
	cvta.to.global.u64 %rd6, %rd2;
	mul.wide.s32 %rd7, %r1, 4;
	add.s64 %rd8, %rd6, %rd7;
	ld.global.nc.f32 %f5, [%rd8];
	cvta.to.global.u64 %rd9, %rd3;
	add.s64 %rd10, %rd9, %rd7;
	ld.global.nc.f32 %f6, [%rd10];
	mul.f32 %f7, %f6, %f2;
	fma.rn.f32 %f8, %f5, %f1, %f7;
	cvta.to.global.u64 %rd11, %rd4;
	add.s64 %rd12, %rd11, %rd7;
	ld.global.nc.f32 %f9, [%rd12];
	fma.rn.f32 %f10, %f9, %f3, %f8;
	cvta.to.global.u64 %rd13, %rd5;
	add.s64 %rd14, %rd13, %rd7;
	ld.global.nc.f32 %f11, [%rd14];
	fma.rn.f32 %f12, %f11, %f4, %f10;
	cvta.to.global.u64 %rd15, %rd1;
	add.s64 %rd16, %rd15, %rd7;
	st.global.f32 [%rd16], %f12;
BB0_2:
	ret;
}
`
	// PTX for .target sm_70. The four inputs are read with ld.global.nc.f32.
	madd4_ptx_70 = `
.version 6.4
.target sm_70
.address_size 64
	// .globl madd4
.visible .entry madd4(
	.param .u64 madd4_param_0,
	.param .u64 madd4_param_1,
	.param .f32 madd4_param_2,
	.param .u64 madd4_param_3,
	.param .f32 madd4_param_4,
	.param .u64 madd4_param_5,
	.param .f32 madd4_param_6,
	.param .u64 madd4_param_7,
	.param .f32 madd4_param_8,
	.param .u32 madd4_param_9
)
{
	.reg .pred %p<2>;
	.reg .f32 %f<13>;
	.reg .b32 %r<9>;
	.reg .b64 %rd<17>;
	ld.param.u64 %rd1, [madd4_param_0];
	ld.param.u64 %rd2, [madd4_param_1];
	ld.param.f32 %f1, [madd4_param_2];
	ld.param.u64 %rd3, [madd4_param_3];
	ld.param.f32 %f2, [madd4_param_4];
	ld.param.u64 %rd4, [madd4_param_5];
	ld.param.f32 %f3, [madd4_param_6];
	ld.param.u64 %rd5, [madd4_param_7];
	ld.param.f32 %f4, [madd4_param_8];
	ld.param.u32 %r2, [madd4_param_9];
	mov.u32 %r3, %ctaid.y;
	mov.u32 %r4, %nctaid.x;
	mov.u32 %r5, %ctaid.x;
	mad.lo.s32 %r6, %r4, %r3, %r5;
	mov.u32 %r7, %ntid.x;
	mov.u32 %r8, %tid.x;
	mad.lo.s32 %r1, %r6, %r7, %r8;
	setp.ge.s32 %p1, %r1, %r2;
	@%p1 bra BB0_2;
	cvta.to.global.u64 %rd6, %rd2;
	mul.wide.s32 %rd7, %r1, 4;
	add.s64 %rd8, %rd6, %rd7;
	ld.global.nc.f32 %f5, [%rd8];
	cvta.to.global.u64 %rd9, %rd3;
	add.s64 %rd10, %rd9, %rd7;
	ld.global.nc.f32 %f6, [%rd10];
	mul.f32 %f7, %f6, %f2;
	fma.rn.f32 %f8, %f5, %f1, %f7;
	cvta.to.global.u64 %rd11, %rd4;
	add.s64 %rd12, %rd11, %rd7;
	ld.global.nc.f32 %f9, [%rd12];
	fma.rn.f32 %f10, %f9, %f3, %f8;
	cvta.to.global.u64 %rd13, %rd5;
	add.s64 %rd14, %rd13, %rd7;
	ld.global.nc.f32 %f11, [%rd14];
	fma.rn.f32 %f12, %f11, %f4, %f10;
	cvta.to.global.u64 %rd15, %rd1;
	add.s64 %rd16, %rd15, %rd7;
	st.global.f32 [%rd16], %f12;
BB0_2:
	ret;
}
`
	// PTX for .target sm_75. The four inputs are read with ld.global.nc.f32.
	madd4_ptx_75 = `
.version 6.4
.target sm_75
.address_size 64
	// .globl madd4
.visible .entry madd4(
	.param .u64 madd4_param_0,
	.param .u64 madd4_param_1,
	.param .f32 madd4_param_2,
	.param .u64 madd4_param_3,
	.param .f32 madd4_param_4,
	.param .u64 madd4_param_5,
	.param .f32 madd4_param_6,
	.param .u64 madd4_param_7,
	.param .f32 madd4_param_8,
	.param .u32 madd4_param_9
)
{
	.reg .pred %p<2>;
	.reg .f32 %f<13>;
	.reg .b32 %r<9>;
	.reg .b64 %rd<17>;
	ld.param.u64 %rd1, [madd4_param_0];
	ld.param.u64 %rd2, [madd4_param_1];
	ld.param.f32 %f1, [madd4_param_2];
	ld.param.u64 %rd3, [madd4_param_3];
	ld.param.f32 %f2, [madd4_param_4];
	ld.param.u64 %rd4, [madd4_param_5];
	ld.param.f32 %f3, [madd4_param_6];
	ld.param.u64 %rd5, [madd4_param_7];
	ld.param.f32 %f4, [madd4_param_8];
	ld.param.u32 %r2, [madd4_param_9];
	mov.u32 %r3, %ctaid.y;
	mov.u32 %r4, %nctaid.x;
	mov.u32 %r5, %ctaid.x;
	mad.lo.s32 %r6, %r4, %r3, %r5;
	mov.u32 %r7, %ntid.x;
	mov.u32 %r8, %tid.x;
	mad.lo.s32 %r1, %r6, %r7, %r8;
	setp.ge.s32 %p1, %r1, %r2;
	@%p1 bra BB0_2;
	cvta.to.global.u64 %rd6, %rd2;
	mul.wide.s32 %rd7, %r1, 4;
	add.s64 %rd8, %rd6, %rd7;
	ld.global.nc.f32 %f5, [%rd8];
	cvta.to.global.u64 %rd9, %rd3;
	add.s64 %rd10, %rd9, %rd7;
	ld.global.nc.f32 %f6, [%rd10];
	mul.f32 %f7, %f6, %f2;
	fma.rn.f32 %f8, %f5, %f1, %f7;
	cvta.to.global.u64 %rd11, %rd4;
	add.s64 %rd12, %rd11, %rd7;
	ld.global.nc.f32 %f9, [%rd12];
	fma.rn.f32 %f10, %f9, %f3, %f8;
	cvta.to.global.u64 %rd13, %rd5;
	add.s64 %rd14, %rd13, %rd7;
	ld.global.nc.f32 %f11, [%rd14];
	fma.rn.f32 %f12, %f11, %f4, %f10;
	cvta.to.global.u64 %rd15, %rd1;
	add.s64 %rd16, %rd15, %rd7;
	st.global.f32 [%rd16], %f12;
BB0_2:
	ret;
}
`
)

mumax3-3.10/cuda/madd5.cu000066400000000000000000000011161371432437400151150ustar00rootroot00000000000000

// dst[i] = src1[i] * fac1 + src2[i] * fac2 + src3[i] * fac3 + src4[i] * fac4 + src5[i] * fac5
//
// Element-wise weighted sum of five float arrays.
// Each thread handles at most one index i: it reads src1..src5 at i and
// writes dst at i. N bounds the indices that are processed.
extern "C" __global__ void
madd5(float* __restrict__ dst,
      float* __restrict__ src1, float fac1,
      float* __restrict__ src2, float fac2,
      float* __restrict__ src3, float fac3,
      float* __restrict__ src4, float fac4,
      float* __restrict__ src5, float fac5, int N) {

    // Linear element index built from the block's (x, y) position in the
    // grid and the thread's x position in the block.
    int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x;

    // Threads with i >= N neither read nor write.
    if(i < N) {
        dst[i] = (fac1*src1[i]) + (fac2*src2[i]) + (fac3*src3[i]) + (fac4*src4[i]) + (fac5*src5[i]);
    }
}

mumax3-3.10/cuda/madd5_wrapper.go000066400000000000000000000546271371432437400166720ustar00rootroot00000000000000package cuda

/*
 THIS FILE IS AUTO-GENERATED BY CUDA2GO.
 EDITING IS FUTILE.
*/

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// CUDA handle for madd5 kernel.
// It is compared against 0 and filled in by k_madd5_async on first use.
var madd5_code cu.Function

// Stores the arguments for madd5 kernel invocation.
//
// The arg_* fields mirror the kernel parameters (dst, src1, fac1, ..., N).
// argptr holds one pointer per arg_* field, in that same order; the pointers
// are set up once in init. The embedded mutex is held by k_madd5_async while
// it fills the fields and launches the kernel.
type madd5_args_t struct {
	arg_dst  unsafe.Pointer
	arg_src1 unsafe.Pointer
	arg_fac1 float32
	arg_src2 unsafe.Pointer
	arg_fac2 float32
	arg_src3 unsafe.Pointer
	arg_fac3 float32
	arg_src4 unsafe.Pointer
	arg_fac4 float32
	arg_src5 unsafe.Pointer
	arg_fac5 float32
	// NOTE(review): the CUDA kernel declares N as `int`, while this field is
	// a Go `int`, whose size is platform dependent — confirm the launch path
	// reads the intended bytes on all supported platforms.
	arg_N  int
	argptr [12]unsafe.Pointer
	sync.Mutex
}

// Stores the arguments for madd5 kernel invocation.
// A single package-level instance is shared by all calls to k_madd5_async.
var madd5_args madd5_args_t

// init points each argptr slot at the matching arg_* field of madd5_args.
// Slot order follows the kernel's parameter order.
func init() {
	// CUDA driver kernel call wants pointers to arguments, set them up once.
	madd5_args.argptr[0] = unsafe.Pointer(&madd5_args.arg_dst)
	madd5_args.argptr[1] = unsafe.Pointer(&madd5_args.arg_src1)
	madd5_args.argptr[2] = unsafe.Pointer(&madd5_args.arg_fac1)
	madd5_args.argptr[3] = unsafe.Pointer(&madd5_args.arg_src2)
	madd5_args.argptr[4] = unsafe.Pointer(&madd5_args.arg_fac2)
	madd5_args.argptr[5] = unsafe.Pointer(&madd5_args.arg_src3)
	madd5_args.argptr[6] = unsafe.Pointer(&madd5_args.arg_fac3)
	madd5_args.argptr[7] = unsafe.Pointer(&madd5_args.arg_src4)
	madd5_args.argptr[8] = unsafe.Pointer(&madd5_args.arg_fac4)
	madd5_args.argptr[9] = unsafe.Pointer(&madd5_args.arg_src5)
	madd5_args.argptr[10] = unsafe.Pointer(&madd5_args.arg_fac5)
	madd5_args.argptr[11] = unsafe.Pointer(&madd5_args.arg_N)
}

// Wrapper for madd5 CUDA kernel, asynchronous.
//
// dst, src1..src5 and fac1..fac5 and N are copied into madd5_args and passed
// to the kernel through madd5_args.argptr. cfg supplies the grid and block
// dimensions of the launch. The kernel is launched on stream0.
//
// When the package-level Synchronous flag is set, Sync() is called before and
// after the launch and the call is bracketed by timer.Start/timer.Stop under
// the name "madd5".
func k_madd5_async(dst unsafe.Pointer, src1 unsafe.Pointer, fac1 float32, src2 unsafe.Pointer, fac2 float32, src3 unsafe.Pointer, fac3 float32, src4 unsafe.Pointer, fac4 float32, src5 unsafe.Pointer, fac5 float32, N int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("madd5")
	}

	// The argument struct is shared, so hold its lock until the function
	// returns.
	madd5_args.Lock()
	defer madd5_args.Unlock()

	// Load the kernel on first use.
	if madd5_code == 0 {
		madd5_code = fatbinLoad(madd5_map, "madd5")
	}

	madd5_args.arg_dst = dst
	madd5_args.arg_src1 = src1
	madd5_args.arg_fac1 = fac1
	madd5_args.arg_src2 = src2
	madd5_args.arg_fac2 = fac2
	madd5_args.arg_src3 = src3
	madd5_args.arg_fac3 = fac3
	madd5_args.arg_src4 = src4
	madd5_args.arg_fac4 = fac4
	madd5_args.arg_src5 = src5
	madd5_args.arg_fac5 = fac5
	madd5_args.arg_N = N

	args := madd5_args.argptr[:]
	// NOTE(review): the literal 0 is presumably the dynamic shared-memory
	// size — confirm against cu.LaunchKernel.
	cu.LaunchKernel(madd5_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("madd5")
	}
}

// maps compute capability on PTX code for madd5 kernel.
var madd5_map = map[int]string{0: "", 30: madd5_ptx_30, 35: madd5_ptx_35, 37: madd5_ptx_37, 50: madd5_ptx_50, 52: madd5_ptx_52, 53: madd5_ptx_53, 60: madd5_ptx_60, 61: madd5_ptx_61, 70: madd5_ptx_70, 75: madd5_ptx_75} // madd5 PTX code for various compute capabilities. const ( madd5_ptx_30 = ` .version 6.4 .target sm_30 .address_size 64 // .globl madd5 .visible .entry madd5( .param .u64 madd5_param_0, .param .u64 madd5_param_1, .param .f32 madd5_param_2, .param .u64 madd5_param_3, .param .f32 madd5_param_4, .param .u64 madd5_param_5, .param .f32 madd5_param_6, .param .u64 madd5_param_7, .param .f32 madd5_param_8, .param .u64 madd5_param_9, .param .f32 madd5_param_10, .param .u32 madd5_param_11 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [madd5_param_0]; ld.param.u64 %rd2, [madd5_param_1]; ld.param.f32 %f1, [madd5_param_2]; ld.param.u64 %rd3, [madd5_param_3]; ld.param.f32 %f2, [madd5_param_4]; ld.param.u64 %rd4, [madd5_param_5]; ld.param.f32 %f3, [madd5_param_6]; ld.param.u64 %rd5, [madd5_param_7]; ld.param.f32 %f4, [madd5_param_8]; ld.param.u64 %rd6, [madd5_param_9]; ld.param.f32 %f5, [madd5_param_10]; ld.param.u32 %r2, [madd5_param_11]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.f32 %f6, [%rd9]; cvta.to.global.u64 %rd10, %rd3; add.s64 %rd11, %rd10, %rd8; ld.global.f32 %f7, [%rd11]; mul.f32 %f8, %f7, %f2; fma.rn.f32 %f9, %f6, %f1, %f8; cvta.to.global.u64 %rd12, %rd4; add.s64 %rd13, %rd12, %rd8; ld.global.f32 %f10, [%rd13]; fma.rn.f32 %f11, %f10, %f3, %f9; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd8; ld.global.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f4, %f11; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.f32 
%f14, [%rd17]; fma.rn.f32 %f15, %f14, %f5, %f13; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f15; BB0_2: ret; } ` madd5_ptx_35 = ` .version 6.4 .target sm_35 .address_size 64 // .globl madd5 .visible .entry madd5( .param .u64 madd5_param_0, .param .u64 madd5_param_1, .param .f32 madd5_param_2, .param .u64 madd5_param_3, .param .f32 madd5_param_4, .param .u64 madd5_param_5, .param .f32 madd5_param_6, .param .u64 madd5_param_7, .param .f32 madd5_param_8, .param .u64 madd5_param_9, .param .f32 madd5_param_10, .param .u32 madd5_param_11 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [madd5_param_0]; ld.param.u64 %rd2, [madd5_param_1]; ld.param.f32 %f1, [madd5_param_2]; ld.param.u64 %rd3, [madd5_param_3]; ld.param.f32 %f2, [madd5_param_4]; ld.param.u64 %rd4, [madd5_param_5]; ld.param.f32 %f3, [madd5_param_6]; ld.param.u64 %rd5, [madd5_param_7]; ld.param.f32 %f4, [madd5_param_8]; ld.param.u64 %rd6, [madd5_param_9]; ld.param.f32 %f5, [madd5_param_10]; ld.param.u32 %r2, [madd5_param_11]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f6, [%rd9]; cvta.to.global.u64 %rd10, %rd3; add.s64 %rd11, %rd10, %rd8; ld.global.nc.f32 %f7, [%rd11]; mul.f32 %f8, %f7, %f2; fma.rn.f32 %f9, %f6, %f1, %f8; cvta.to.global.u64 %rd12, %rd4; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f10, [%rd13]; fma.rn.f32 %f11, %f10, %f3, %f9; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd8; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f4, %f11; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f5, %f13; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd8; 
st.global.f32 [%rd19], %f15; BB0_2: ret; } ` madd5_ptx_37 = ` .version 6.4 .target sm_37 .address_size 64 // .globl madd5 .visible .entry madd5( .param .u64 madd5_param_0, .param .u64 madd5_param_1, .param .f32 madd5_param_2, .param .u64 madd5_param_3, .param .f32 madd5_param_4, .param .u64 madd5_param_5, .param .f32 madd5_param_6, .param .u64 madd5_param_7, .param .f32 madd5_param_8, .param .u64 madd5_param_9, .param .f32 madd5_param_10, .param .u32 madd5_param_11 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [madd5_param_0]; ld.param.u64 %rd2, [madd5_param_1]; ld.param.f32 %f1, [madd5_param_2]; ld.param.u64 %rd3, [madd5_param_3]; ld.param.f32 %f2, [madd5_param_4]; ld.param.u64 %rd4, [madd5_param_5]; ld.param.f32 %f3, [madd5_param_6]; ld.param.u64 %rd5, [madd5_param_7]; ld.param.f32 %f4, [madd5_param_8]; ld.param.u64 %rd6, [madd5_param_9]; ld.param.f32 %f5, [madd5_param_10]; ld.param.u32 %r2, [madd5_param_11]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f6, [%rd9]; cvta.to.global.u64 %rd10, %rd3; add.s64 %rd11, %rd10, %rd8; ld.global.nc.f32 %f7, [%rd11]; mul.f32 %f8, %f7, %f2; fma.rn.f32 %f9, %f6, %f1, %f8; cvta.to.global.u64 %rd12, %rd4; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f10, [%rd13]; fma.rn.f32 %f11, %f10, %f3, %f9; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd8; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f4, %f11; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f5, %f13; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f15; BB0_2: ret; } ` madd5_ptx_50 = ` .version 6.4 .target sm_50 .address_size 64 // 
.globl madd5 .visible .entry madd5( .param .u64 madd5_param_0, .param .u64 madd5_param_1, .param .f32 madd5_param_2, .param .u64 madd5_param_3, .param .f32 madd5_param_4, .param .u64 madd5_param_5, .param .f32 madd5_param_6, .param .u64 madd5_param_7, .param .f32 madd5_param_8, .param .u64 madd5_param_9, .param .f32 madd5_param_10, .param .u32 madd5_param_11 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [madd5_param_0]; ld.param.u64 %rd2, [madd5_param_1]; ld.param.f32 %f1, [madd5_param_2]; ld.param.u64 %rd3, [madd5_param_3]; ld.param.f32 %f2, [madd5_param_4]; ld.param.u64 %rd4, [madd5_param_5]; ld.param.f32 %f3, [madd5_param_6]; ld.param.u64 %rd5, [madd5_param_7]; ld.param.f32 %f4, [madd5_param_8]; ld.param.u64 %rd6, [madd5_param_9]; ld.param.f32 %f5, [madd5_param_10]; ld.param.u32 %r2, [madd5_param_11]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f6, [%rd9]; cvta.to.global.u64 %rd10, %rd3; add.s64 %rd11, %rd10, %rd8; ld.global.nc.f32 %f7, [%rd11]; mul.f32 %f8, %f7, %f2; fma.rn.f32 %f9, %f6, %f1, %f8; cvta.to.global.u64 %rd12, %rd4; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f10, [%rd13]; fma.rn.f32 %f11, %f10, %f3, %f9; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd8; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f4, %f11; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f5, %f13; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f15; BB0_2: ret; } ` madd5_ptx_52 = ` .version 6.4 .target sm_52 .address_size 64 // .globl madd5 .visible .entry madd5( .param .u64 madd5_param_0, .param .u64 madd5_param_1, .param .f32 
madd5_param_2, .param .u64 madd5_param_3, .param .f32 madd5_param_4, .param .u64 madd5_param_5, .param .f32 madd5_param_6, .param .u64 madd5_param_7, .param .f32 madd5_param_8, .param .u64 madd5_param_9, .param .f32 madd5_param_10, .param .u32 madd5_param_11 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [madd5_param_0]; ld.param.u64 %rd2, [madd5_param_1]; ld.param.f32 %f1, [madd5_param_2]; ld.param.u64 %rd3, [madd5_param_3]; ld.param.f32 %f2, [madd5_param_4]; ld.param.u64 %rd4, [madd5_param_5]; ld.param.f32 %f3, [madd5_param_6]; ld.param.u64 %rd5, [madd5_param_7]; ld.param.f32 %f4, [madd5_param_8]; ld.param.u64 %rd6, [madd5_param_9]; ld.param.f32 %f5, [madd5_param_10]; ld.param.u32 %r2, [madd5_param_11]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f6, [%rd9]; cvta.to.global.u64 %rd10, %rd3; add.s64 %rd11, %rd10, %rd8; ld.global.nc.f32 %f7, [%rd11]; mul.f32 %f8, %f7, %f2; fma.rn.f32 %f9, %f6, %f1, %f8; cvta.to.global.u64 %rd12, %rd4; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f10, [%rd13]; fma.rn.f32 %f11, %f10, %f3, %f9; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd8; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f4, %f11; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f5, %f13; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f15; BB0_2: ret; } ` madd5_ptx_53 = ` .version 6.4 .target sm_53 .address_size 64 // .globl madd5 .visible .entry madd5( .param .u64 madd5_param_0, .param .u64 madd5_param_1, .param .f32 madd5_param_2, .param .u64 madd5_param_3, .param .f32 madd5_param_4, .param .u64 madd5_param_5, .param .f32 
madd5_param_6, .param .u64 madd5_param_7, .param .f32 madd5_param_8, .param .u64 madd5_param_9, .param .f32 madd5_param_10, .param .u32 madd5_param_11 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [madd5_param_0]; ld.param.u64 %rd2, [madd5_param_1]; ld.param.f32 %f1, [madd5_param_2]; ld.param.u64 %rd3, [madd5_param_3]; ld.param.f32 %f2, [madd5_param_4]; ld.param.u64 %rd4, [madd5_param_5]; ld.param.f32 %f3, [madd5_param_6]; ld.param.u64 %rd5, [madd5_param_7]; ld.param.f32 %f4, [madd5_param_8]; ld.param.u64 %rd6, [madd5_param_9]; ld.param.f32 %f5, [madd5_param_10]; ld.param.u32 %r2, [madd5_param_11]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f6, [%rd9]; cvta.to.global.u64 %rd10, %rd3; add.s64 %rd11, %rd10, %rd8; ld.global.nc.f32 %f7, [%rd11]; mul.f32 %f8, %f7, %f2; fma.rn.f32 %f9, %f6, %f1, %f8; cvta.to.global.u64 %rd12, %rd4; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f10, [%rd13]; fma.rn.f32 %f11, %f10, %f3, %f9; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd8; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f4, %f11; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f5, %f13; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f15; BB0_2: ret; } ` madd5_ptx_60 = ` .version 6.4 .target sm_60 .address_size 64 // .globl madd5 .visible .entry madd5( .param .u64 madd5_param_0, .param .u64 madd5_param_1, .param .f32 madd5_param_2, .param .u64 madd5_param_3, .param .f32 madd5_param_4, .param .u64 madd5_param_5, .param .f32 madd5_param_6, .param .u64 madd5_param_7, .param .f32 madd5_param_8, .param .u64 madd5_param_9, .param .f32 
madd5_param_10, .param .u32 madd5_param_11 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [madd5_param_0]; ld.param.u64 %rd2, [madd5_param_1]; ld.param.f32 %f1, [madd5_param_2]; ld.param.u64 %rd3, [madd5_param_3]; ld.param.f32 %f2, [madd5_param_4]; ld.param.u64 %rd4, [madd5_param_5]; ld.param.f32 %f3, [madd5_param_6]; ld.param.u64 %rd5, [madd5_param_7]; ld.param.f32 %f4, [madd5_param_8]; ld.param.u64 %rd6, [madd5_param_9]; ld.param.f32 %f5, [madd5_param_10]; ld.param.u32 %r2, [madd5_param_11]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f6, [%rd9]; cvta.to.global.u64 %rd10, %rd3; add.s64 %rd11, %rd10, %rd8; ld.global.nc.f32 %f7, [%rd11]; mul.f32 %f8, %f7, %f2; fma.rn.f32 %f9, %f6, %f1, %f8; cvta.to.global.u64 %rd12, %rd4; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f10, [%rd13]; fma.rn.f32 %f11, %f10, %f3, %f9; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd8; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f4, %f11; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f5, %f13; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f15; BB0_2: ret; } ` madd5_ptx_61 = ` .version 6.4 .target sm_61 .address_size 64 // .globl madd5 .visible .entry madd5( .param .u64 madd5_param_0, .param .u64 madd5_param_1, .param .f32 madd5_param_2, .param .u64 madd5_param_3, .param .f32 madd5_param_4, .param .u64 madd5_param_5, .param .f32 madd5_param_6, .param .u64 madd5_param_7, .param .f32 madd5_param_8, .param .u64 madd5_param_9, .param .f32 madd5_param_10, .param .u32 madd5_param_11 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 
%rd<20>; ld.param.u64 %rd1, [madd5_param_0]; ld.param.u64 %rd2, [madd5_param_1]; ld.param.f32 %f1, [madd5_param_2]; ld.param.u64 %rd3, [madd5_param_3]; ld.param.f32 %f2, [madd5_param_4]; ld.param.u64 %rd4, [madd5_param_5]; ld.param.f32 %f3, [madd5_param_6]; ld.param.u64 %rd5, [madd5_param_7]; ld.param.f32 %f4, [madd5_param_8]; ld.param.u64 %rd6, [madd5_param_9]; ld.param.f32 %f5, [madd5_param_10]; ld.param.u32 %r2, [madd5_param_11]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f6, [%rd9]; cvta.to.global.u64 %rd10, %rd3; add.s64 %rd11, %rd10, %rd8; ld.global.nc.f32 %f7, [%rd11]; mul.f32 %f8, %f7, %f2; fma.rn.f32 %f9, %f6, %f1, %f8; cvta.to.global.u64 %rd12, %rd4; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f10, [%rd13]; fma.rn.f32 %f11, %f10, %f3, %f9; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd8; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f4, %f11; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f5, %f13; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f15; BB0_2: ret; } ` madd5_ptx_70 = ` .version 6.4 .target sm_70 .address_size 64 // .globl madd5 .visible .entry madd5( .param .u64 madd5_param_0, .param .u64 madd5_param_1, .param .f32 madd5_param_2, .param .u64 madd5_param_3, .param .f32 madd5_param_4, .param .u64 madd5_param_5, .param .f32 madd5_param_6, .param .u64 madd5_param_7, .param .f32 madd5_param_8, .param .u64 madd5_param_9, .param .f32 madd5_param_10, .param .u32 madd5_param_11 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [madd5_param_0]; ld.param.u64 %rd2, [madd5_param_1]; ld.param.f32 %f1, 
[madd5_param_2]; ld.param.u64 %rd3, [madd5_param_3]; ld.param.f32 %f2, [madd5_param_4]; ld.param.u64 %rd4, [madd5_param_5]; ld.param.f32 %f3, [madd5_param_6]; ld.param.u64 %rd5, [madd5_param_7]; ld.param.f32 %f4, [madd5_param_8]; ld.param.u64 %rd6, [madd5_param_9]; ld.param.f32 %f5, [madd5_param_10]; ld.param.u32 %r2, [madd5_param_11]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f6, [%rd9]; cvta.to.global.u64 %rd10, %rd3; add.s64 %rd11, %rd10, %rd8; ld.global.nc.f32 %f7, [%rd11]; mul.f32 %f8, %f7, %f2; fma.rn.f32 %f9, %f6, %f1, %f8; cvta.to.global.u64 %rd12, %rd4; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f10, [%rd13]; fma.rn.f32 %f11, %f10, %f3, %f9; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd8; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f4, %f11; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f5, %f13; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f15; BB0_2: ret; } ` madd5_ptx_75 = ` .version 6.4 .target sm_75 .address_size 64 // .globl madd5 .visible .entry madd5( .param .u64 madd5_param_0, .param .u64 madd5_param_1, .param .f32 madd5_param_2, .param .u64 madd5_param_3, .param .f32 madd5_param_4, .param .u64 madd5_param_5, .param .f32 madd5_param_6, .param .u64 madd5_param_7, .param .f32 madd5_param_8, .param .u64 madd5_param_9, .param .f32 madd5_param_10, .param .u32 madd5_param_11 ) { .reg .pred %p<2>; .reg .f32 %f<16>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [madd5_param_0]; ld.param.u64 %rd2, [madd5_param_1]; ld.param.f32 %f1, [madd5_param_2]; ld.param.u64 %rd3, [madd5_param_3]; ld.param.f32 %f2, [madd5_param_4]; ld.param.u64 %rd4, 
[madd5_param_5]; ld.param.f32 %f3, [madd5_param_6]; ld.param.u64 %rd5, [madd5_param_7]; ld.param.f32 %f4, [madd5_param_8]; ld.param.u64 %rd6, [madd5_param_9]; ld.param.f32 %f5, [madd5_param_10]; ld.param.u32 %r2, [madd5_param_11]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f6, [%rd9]; cvta.to.global.u64 %rd10, %rd3; add.s64 %rd11, %rd10, %rd8; ld.global.nc.f32 %f7, [%rd11]; mul.f32 %f8, %f7, %f2; fma.rn.f32 %f9, %f6, %f1, %f8; cvta.to.global.u64 %rd12, %rd4; add.s64 %rd13, %rd12, %rd8; ld.global.nc.f32 %f10, [%rd13]; fma.rn.f32 %f11, %f10, %f3, %f9; cvta.to.global.u64 %rd14, %rd5; add.s64 %rd15, %rd14, %rd8; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f4, %f11; cvta.to.global.u64 %rd16, %rd6; add.s64 %rd17, %rd16, %rd8; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f5, %f13; cvta.to.global.u64 %rd18, %rd1; add.s64 %rd19, %rd18, %rd8; st.global.f32 [%rd19], %f15; BB0_2: ret; } ` ) mumax3-3.10/cuda/madd6.cu000066400000000000000000000012341371432437400151170ustar00rootroot00000000000000 // dst[i] = src1[i] * fac1 + src2[i] * fac2 + src3[i] * fac3 + src4[i] * fac4 + src5[i] * fac5 + src6[i] * fac6 extern "C" __global__ void madd6(float* __restrict__ dst, float* __restrict__ src1, float fac1, float* __restrict__ src2, float fac2, float* __restrict__ src3, float fac3, float* __restrict__ src4, float fac4, float* __restrict__ src5, float fac5, float* __restrict__ src6, float fac6, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if(i < N) { dst[i] = (fac1*src1[i]) + (fac2*src2[i]) + (fac3*src3[i]) + (fac4*src4[i]) + (fac5*src5[i]) + (fac6*src6[i]); } } 
mumax3-3.10/cuda/madd6_wrapper.go000066400000000000000000000625001371432437400166600ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for madd6 kernel var madd6_code cu.Function // Stores the arguments for madd6 kernel invocation type madd6_args_t struct { arg_dst unsafe.Pointer arg_src1 unsafe.Pointer arg_fac1 float32 arg_src2 unsafe.Pointer arg_fac2 float32 arg_src3 unsafe.Pointer arg_fac3 float32 arg_src4 unsafe.Pointer arg_fac4 float32 arg_src5 unsafe.Pointer arg_fac5 float32 arg_src6 unsafe.Pointer arg_fac6 float32 arg_N int argptr [14]unsafe.Pointer sync.Mutex } // Stores the arguments for madd6 kernel invocation var madd6_args madd6_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. madd6_args.argptr[0] = unsafe.Pointer(&madd6_args.arg_dst) madd6_args.argptr[1] = unsafe.Pointer(&madd6_args.arg_src1) madd6_args.argptr[2] = unsafe.Pointer(&madd6_args.arg_fac1) madd6_args.argptr[3] = unsafe.Pointer(&madd6_args.arg_src2) madd6_args.argptr[4] = unsafe.Pointer(&madd6_args.arg_fac2) madd6_args.argptr[5] = unsafe.Pointer(&madd6_args.arg_src3) madd6_args.argptr[6] = unsafe.Pointer(&madd6_args.arg_fac3) madd6_args.argptr[7] = unsafe.Pointer(&madd6_args.arg_src4) madd6_args.argptr[8] = unsafe.Pointer(&madd6_args.arg_fac4) madd6_args.argptr[9] = unsafe.Pointer(&madd6_args.arg_src5) madd6_args.argptr[10] = unsafe.Pointer(&madd6_args.arg_fac5) madd6_args.argptr[11] = unsafe.Pointer(&madd6_args.arg_src6) madd6_args.argptr[12] = unsafe.Pointer(&madd6_args.arg_fac6) madd6_args.argptr[13] = unsafe.Pointer(&madd6_args.arg_N) } // Wrapper for madd6 CUDA kernel, asynchronous. 
func k_madd6_async(dst unsafe.Pointer, src1 unsafe.Pointer, fac1 float32, src2 unsafe.Pointer, fac2 float32, src3 unsafe.Pointer, fac3 float32, src4 unsafe.Pointer, fac4 float32, src5 unsafe.Pointer, fac5 float32, src6 unsafe.Pointer, fac6 float32, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("madd6") } madd6_args.Lock() defer madd6_args.Unlock() if madd6_code == 0 { madd6_code = fatbinLoad(madd6_map, "madd6") } madd6_args.arg_dst = dst madd6_args.arg_src1 = src1 madd6_args.arg_fac1 = fac1 madd6_args.arg_src2 = src2 madd6_args.arg_fac2 = fac2 madd6_args.arg_src3 = src3 madd6_args.arg_fac3 = fac3 madd6_args.arg_src4 = src4 madd6_args.arg_fac4 = fac4 madd6_args.arg_src5 = src5 madd6_args.arg_fac5 = fac5 madd6_args.arg_src6 = src6 madd6_args.arg_fac6 = fac6 madd6_args.arg_N = N args := madd6_args.argptr[:] cu.LaunchKernel(madd6_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("madd6") } } // maps compute capability on PTX code for madd6 kernel. var madd6_map = map[int]string{0: "", 30: madd6_ptx_30, 35: madd6_ptx_35, 37: madd6_ptx_37, 50: madd6_ptx_50, 52: madd6_ptx_52, 53: madd6_ptx_53, 60: madd6_ptx_60, 61: madd6_ptx_61, 70: madd6_ptx_70, 75: madd6_ptx_75} // madd6 PTX code for various compute capabilities. 
const ( madd6_ptx_30 = ` .version 6.4 .target sm_30 .address_size 64 // .globl madd6 .visible .entry madd6( .param .u64 madd6_param_0, .param .u64 madd6_param_1, .param .f32 madd6_param_2, .param .u64 madd6_param_3, .param .f32 madd6_param_4, .param .u64 madd6_param_5, .param .f32 madd6_param_6, .param .u64 madd6_param_7, .param .f32 madd6_param_8, .param .u64 madd6_param_9, .param .f32 madd6_param_10, .param .u64 madd6_param_11, .param .f32 madd6_param_12, .param .u32 madd6_param_13 ) { .reg .pred %p<2>; .reg .f32 %f<19>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [madd6_param_0]; ld.param.u64 %rd2, [madd6_param_1]; ld.param.f32 %f1, [madd6_param_2]; ld.param.u64 %rd3, [madd6_param_3]; ld.param.f32 %f2, [madd6_param_4]; ld.param.u64 %rd4, [madd6_param_5]; ld.param.f32 %f3, [madd6_param_6]; ld.param.u64 %rd5, [madd6_param_7]; ld.param.f32 %f4, [madd6_param_8]; ld.param.u64 %rd6, [madd6_param_9]; ld.param.f32 %f5, [madd6_param_10]; ld.param.u64 %rd7, [madd6_param_11]; ld.param.f32 %f6, [madd6_param_12]; ld.param.u32 %r2, [madd6_param_13]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.f32 %f7, [%rd10]; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; ld.global.f32 %f8, [%rd12]; mul.f32 %f9, %f8, %f2; fma.rn.f32 %f10, %f7, %f1, %f9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; ld.global.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f3, %f10; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; ld.global.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f4, %f12; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; ld.global.f32 %f15, [%rd18]; fma.rn.f32 %f16, %f15, %f5, %f14; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.f32 %f17, [%rd20]; 
fma.rn.f32 %f18, %f17, %f6, %f16; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; st.global.f32 [%rd22], %f18; BB0_2: ret; } ` madd6_ptx_35 = ` .version 6.4 .target sm_35 .address_size 64 // .globl madd6 .visible .entry madd6( .param .u64 madd6_param_0, .param .u64 madd6_param_1, .param .f32 madd6_param_2, .param .u64 madd6_param_3, .param .f32 madd6_param_4, .param .u64 madd6_param_5, .param .f32 madd6_param_6, .param .u64 madd6_param_7, .param .f32 madd6_param_8, .param .u64 madd6_param_9, .param .f32 madd6_param_10, .param .u64 madd6_param_11, .param .f32 madd6_param_12, .param .u32 madd6_param_13 ) { .reg .pred %p<2>; .reg .f32 %f<19>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [madd6_param_0]; ld.param.u64 %rd2, [madd6_param_1]; ld.param.f32 %f1, [madd6_param_2]; ld.param.u64 %rd3, [madd6_param_3]; ld.param.f32 %f2, [madd6_param_4]; ld.param.u64 %rd4, [madd6_param_5]; ld.param.f32 %f3, [madd6_param_6]; ld.param.u64 %rd5, [madd6_param_7]; ld.param.f32 %f4, [madd6_param_8]; ld.param.u64 %rd6, [madd6_param_9]; ld.param.f32 %f5, [madd6_param_10]; ld.param.u64 %rd7, [madd6_param_11]; ld.param.f32 %f6, [madd6_param_12]; ld.param.u32 %r2, [madd6_param_13]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f7, [%rd10]; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; ld.global.nc.f32 %f8, [%rd12]; mul.f32 %f9, %f8, %f2; fma.rn.f32 %f10, %f7, %f1, %f9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f3, %f10; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f4, %f12; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; 
ld.global.nc.f32 %f15, [%rd18]; fma.rn.f32 %f16, %f15, %f5, %f14; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f17, [%rd20]; fma.rn.f32 %f18, %f17, %f6, %f16; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; st.global.f32 [%rd22], %f18; BB0_2: ret; } ` madd6_ptx_37 = ` .version 6.4 .target sm_37 .address_size 64 // .globl madd6 .visible .entry madd6( .param .u64 madd6_param_0, .param .u64 madd6_param_1, .param .f32 madd6_param_2, .param .u64 madd6_param_3, .param .f32 madd6_param_4, .param .u64 madd6_param_5, .param .f32 madd6_param_6, .param .u64 madd6_param_7, .param .f32 madd6_param_8, .param .u64 madd6_param_9, .param .f32 madd6_param_10, .param .u64 madd6_param_11, .param .f32 madd6_param_12, .param .u32 madd6_param_13 ) { .reg .pred %p<2>; .reg .f32 %f<19>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [madd6_param_0]; ld.param.u64 %rd2, [madd6_param_1]; ld.param.f32 %f1, [madd6_param_2]; ld.param.u64 %rd3, [madd6_param_3]; ld.param.f32 %f2, [madd6_param_4]; ld.param.u64 %rd4, [madd6_param_5]; ld.param.f32 %f3, [madd6_param_6]; ld.param.u64 %rd5, [madd6_param_7]; ld.param.f32 %f4, [madd6_param_8]; ld.param.u64 %rd6, [madd6_param_9]; ld.param.f32 %f5, [madd6_param_10]; ld.param.u64 %rd7, [madd6_param_11]; ld.param.f32 %f6, [madd6_param_12]; ld.param.u32 %r2, [madd6_param_13]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f7, [%rd10]; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; ld.global.nc.f32 %f8, [%rd12]; mul.f32 %f9, %f8, %f2; fma.rn.f32 %f10, %f7, %f1, %f9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f3, %f10; cvta.to.global.u64 %rd15, %rd5; 
add.s64 %rd16, %rd15, %rd9; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f4, %f12; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; ld.global.nc.f32 %f15, [%rd18]; fma.rn.f32 %f16, %f15, %f5, %f14; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f17, [%rd20]; fma.rn.f32 %f18, %f17, %f6, %f16; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; st.global.f32 [%rd22], %f18; BB0_2: ret; } ` madd6_ptx_50 = ` .version 6.4 .target sm_50 .address_size 64 // .globl madd6 .visible .entry madd6( .param .u64 madd6_param_0, .param .u64 madd6_param_1, .param .f32 madd6_param_2, .param .u64 madd6_param_3, .param .f32 madd6_param_4, .param .u64 madd6_param_5, .param .f32 madd6_param_6, .param .u64 madd6_param_7, .param .f32 madd6_param_8, .param .u64 madd6_param_9, .param .f32 madd6_param_10, .param .u64 madd6_param_11, .param .f32 madd6_param_12, .param .u32 madd6_param_13 ) { .reg .pred %p<2>; .reg .f32 %f<19>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [madd6_param_0]; ld.param.u64 %rd2, [madd6_param_1]; ld.param.f32 %f1, [madd6_param_2]; ld.param.u64 %rd3, [madd6_param_3]; ld.param.f32 %f2, [madd6_param_4]; ld.param.u64 %rd4, [madd6_param_5]; ld.param.f32 %f3, [madd6_param_6]; ld.param.u64 %rd5, [madd6_param_7]; ld.param.f32 %f4, [madd6_param_8]; ld.param.u64 %rd6, [madd6_param_9]; ld.param.f32 %f5, [madd6_param_10]; ld.param.u64 %rd7, [madd6_param_11]; ld.param.f32 %f6, [madd6_param_12]; ld.param.u32 %r2, [madd6_param_13]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f7, [%rd10]; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; ld.global.nc.f32 %f8, [%rd12]; mul.f32 %f9, %f8, %f2; fma.rn.f32 %f10, %f7, %f1, %f9; 
cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f3, %f10; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f4, %f12; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; ld.global.nc.f32 %f15, [%rd18]; fma.rn.f32 %f16, %f15, %f5, %f14; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f17, [%rd20]; fma.rn.f32 %f18, %f17, %f6, %f16; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; st.global.f32 [%rd22], %f18; BB0_2: ret; } ` madd6_ptx_52 = ` .version 6.4 .target sm_52 .address_size 64 // .globl madd6 .visible .entry madd6( .param .u64 madd6_param_0, .param .u64 madd6_param_1, .param .f32 madd6_param_2, .param .u64 madd6_param_3, .param .f32 madd6_param_4, .param .u64 madd6_param_5, .param .f32 madd6_param_6, .param .u64 madd6_param_7, .param .f32 madd6_param_8, .param .u64 madd6_param_9, .param .f32 madd6_param_10, .param .u64 madd6_param_11, .param .f32 madd6_param_12, .param .u32 madd6_param_13 ) { .reg .pred %p<2>; .reg .f32 %f<19>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [madd6_param_0]; ld.param.u64 %rd2, [madd6_param_1]; ld.param.f32 %f1, [madd6_param_2]; ld.param.u64 %rd3, [madd6_param_3]; ld.param.f32 %f2, [madd6_param_4]; ld.param.u64 %rd4, [madd6_param_5]; ld.param.f32 %f3, [madd6_param_6]; ld.param.u64 %rd5, [madd6_param_7]; ld.param.f32 %f4, [madd6_param_8]; ld.param.u64 %rd6, [madd6_param_9]; ld.param.f32 %f5, [madd6_param_10]; ld.param.u64 %rd7, [madd6_param_11]; ld.param.f32 %f6, [madd6_param_12]; ld.param.u32 %r2, [madd6_param_13]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f7, 
[%rd10]; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; ld.global.nc.f32 %f8, [%rd12]; mul.f32 %f9, %f8, %f2; fma.rn.f32 %f10, %f7, %f1, %f9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f3, %f10; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f4, %f12; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; ld.global.nc.f32 %f15, [%rd18]; fma.rn.f32 %f16, %f15, %f5, %f14; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f17, [%rd20]; fma.rn.f32 %f18, %f17, %f6, %f16; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; st.global.f32 [%rd22], %f18; BB0_2: ret; } ` madd6_ptx_53 = ` .version 6.4 .target sm_53 .address_size 64 // .globl madd6 .visible .entry madd6( .param .u64 madd6_param_0, .param .u64 madd6_param_1, .param .f32 madd6_param_2, .param .u64 madd6_param_3, .param .f32 madd6_param_4, .param .u64 madd6_param_5, .param .f32 madd6_param_6, .param .u64 madd6_param_7, .param .f32 madd6_param_8, .param .u64 madd6_param_9, .param .f32 madd6_param_10, .param .u64 madd6_param_11, .param .f32 madd6_param_12, .param .u32 madd6_param_13 ) { .reg .pred %p<2>; .reg .f32 %f<19>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [madd6_param_0]; ld.param.u64 %rd2, [madd6_param_1]; ld.param.f32 %f1, [madd6_param_2]; ld.param.u64 %rd3, [madd6_param_3]; ld.param.f32 %f2, [madd6_param_4]; ld.param.u64 %rd4, [madd6_param_5]; ld.param.f32 %f3, [madd6_param_6]; ld.param.u64 %rd5, [madd6_param_7]; ld.param.f32 %f4, [madd6_param_8]; ld.param.u64 %rd6, [madd6_param_9]; ld.param.f32 %f5, [madd6_param_10]; ld.param.u64 %rd7, [madd6_param_11]; ld.param.f32 %f6, [madd6_param_12]; ld.param.u32 %r2, [madd6_param_13]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; 
setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f7, [%rd10]; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; ld.global.nc.f32 %f8, [%rd12]; mul.f32 %f9, %f8, %f2; fma.rn.f32 %f10, %f7, %f1, %f9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f3, %f10; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f4, %f12; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; ld.global.nc.f32 %f15, [%rd18]; fma.rn.f32 %f16, %f15, %f5, %f14; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f17, [%rd20]; fma.rn.f32 %f18, %f17, %f6, %f16; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; st.global.f32 [%rd22], %f18; BB0_2: ret; } ` madd6_ptx_60 = ` .version 6.4 .target sm_60 .address_size 64 // .globl madd6 .visible .entry madd6( .param .u64 madd6_param_0, .param .u64 madd6_param_1, .param .f32 madd6_param_2, .param .u64 madd6_param_3, .param .f32 madd6_param_4, .param .u64 madd6_param_5, .param .f32 madd6_param_6, .param .u64 madd6_param_7, .param .f32 madd6_param_8, .param .u64 madd6_param_9, .param .f32 madd6_param_10, .param .u64 madd6_param_11, .param .f32 madd6_param_12, .param .u32 madd6_param_13 ) { .reg .pred %p<2>; .reg .f32 %f<19>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [madd6_param_0]; ld.param.u64 %rd2, [madd6_param_1]; ld.param.f32 %f1, [madd6_param_2]; ld.param.u64 %rd3, [madd6_param_3]; ld.param.f32 %f2, [madd6_param_4]; ld.param.u64 %rd4, [madd6_param_5]; ld.param.f32 %f3, [madd6_param_6]; ld.param.u64 %rd5, [madd6_param_7]; ld.param.f32 %f4, [madd6_param_8]; ld.param.u64 %rd6, [madd6_param_9]; ld.param.f32 %f5, [madd6_param_10]; ld.param.u64 %rd7, [madd6_param_11]; ld.param.f32 %f6, [madd6_param_12]; ld.param.u32 %r2, [madd6_param_13]; mov.u32 %r3, %ctaid.y; 
mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f7, [%rd10]; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; ld.global.nc.f32 %f8, [%rd12]; mul.f32 %f9, %f8, %f2; fma.rn.f32 %f10, %f7, %f1, %f9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f3, %f10; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f4, %f12; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; ld.global.nc.f32 %f15, [%rd18]; fma.rn.f32 %f16, %f15, %f5, %f14; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f17, [%rd20]; fma.rn.f32 %f18, %f17, %f6, %f16; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; st.global.f32 [%rd22], %f18; BB0_2: ret; } ` madd6_ptx_61 = ` .version 6.4 .target sm_61 .address_size 64 // .globl madd6 .visible .entry madd6( .param .u64 madd6_param_0, .param .u64 madd6_param_1, .param .f32 madd6_param_2, .param .u64 madd6_param_3, .param .f32 madd6_param_4, .param .u64 madd6_param_5, .param .f32 madd6_param_6, .param .u64 madd6_param_7, .param .f32 madd6_param_8, .param .u64 madd6_param_9, .param .f32 madd6_param_10, .param .u64 madd6_param_11, .param .f32 madd6_param_12, .param .u32 madd6_param_13 ) { .reg .pred %p<2>; .reg .f32 %f<19>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [madd6_param_0]; ld.param.u64 %rd2, [madd6_param_1]; ld.param.f32 %f1, [madd6_param_2]; ld.param.u64 %rd3, [madd6_param_3]; ld.param.f32 %f2, [madd6_param_4]; ld.param.u64 %rd4, [madd6_param_5]; ld.param.f32 %f3, [madd6_param_6]; ld.param.u64 %rd5, [madd6_param_7]; ld.param.f32 %f4, [madd6_param_8]; ld.param.u64 %rd6, [madd6_param_9]; ld.param.f32 %f5, 
[madd6_param_10]; ld.param.u64 %rd7, [madd6_param_11]; ld.param.f32 %f6, [madd6_param_12]; ld.param.u32 %r2, [madd6_param_13]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f7, [%rd10]; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; ld.global.nc.f32 %f8, [%rd12]; mul.f32 %f9, %f8, %f2; fma.rn.f32 %f10, %f7, %f1, %f9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f3, %f10; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f4, %f12; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; ld.global.nc.f32 %f15, [%rd18]; fma.rn.f32 %f16, %f15, %f5, %f14; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f17, [%rd20]; fma.rn.f32 %f18, %f17, %f6, %f16; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; st.global.f32 [%rd22], %f18; BB0_2: ret; } ` madd6_ptx_70 = ` .version 6.4 .target sm_70 .address_size 64 // .globl madd6 .visible .entry madd6( .param .u64 madd6_param_0, .param .u64 madd6_param_1, .param .f32 madd6_param_2, .param .u64 madd6_param_3, .param .f32 madd6_param_4, .param .u64 madd6_param_5, .param .f32 madd6_param_6, .param .u64 madd6_param_7, .param .f32 madd6_param_8, .param .u64 madd6_param_9, .param .f32 madd6_param_10, .param .u64 madd6_param_11, .param .f32 madd6_param_12, .param .u32 madd6_param_13 ) { .reg .pred %p<2>; .reg .f32 %f<19>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [madd6_param_0]; ld.param.u64 %rd2, [madd6_param_1]; ld.param.f32 %f1, [madd6_param_2]; ld.param.u64 %rd3, [madd6_param_3]; ld.param.f32 %f2, [madd6_param_4]; ld.param.u64 %rd4, [madd6_param_5]; ld.param.f32 
%f3, [madd6_param_6]; ld.param.u64 %rd5, [madd6_param_7]; ld.param.f32 %f4, [madd6_param_8]; ld.param.u64 %rd6, [madd6_param_9]; ld.param.f32 %f5, [madd6_param_10]; ld.param.u64 %rd7, [madd6_param_11]; ld.param.f32 %f6, [madd6_param_12]; ld.param.u32 %r2, [madd6_param_13]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f7, [%rd10]; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; ld.global.nc.f32 %f8, [%rd12]; mul.f32 %f9, %f8, %f2; fma.rn.f32 %f10, %f7, %f1, %f9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f3, %f10; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f4, %f12; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; ld.global.nc.f32 %f15, [%rd18]; fma.rn.f32 %f16, %f15, %f5, %f14; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f17, [%rd20]; fma.rn.f32 %f18, %f17, %f6, %f16; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; st.global.f32 [%rd22], %f18; BB0_2: ret; } ` madd6_ptx_75 = ` .version 6.4 .target sm_75 .address_size 64 // .globl madd6 .visible .entry madd6( .param .u64 madd6_param_0, .param .u64 madd6_param_1, .param .f32 madd6_param_2, .param .u64 madd6_param_3, .param .f32 madd6_param_4, .param .u64 madd6_param_5, .param .f32 madd6_param_6, .param .u64 madd6_param_7, .param .f32 madd6_param_8, .param .u64 madd6_param_9, .param .f32 madd6_param_10, .param .u64 madd6_param_11, .param .f32 madd6_param_12, .param .u32 madd6_param_13 ) { .reg .pred %p<2>; .reg .f32 %f<19>; .reg .b32 %r<9>; .reg .b64 %rd<23>; ld.param.u64 %rd1, [madd6_param_0]; ld.param.u64 %rd2, [madd6_param_1]; 
ld.param.f32 %f1, [madd6_param_2]; ld.param.u64 %rd3, [madd6_param_3]; ld.param.f32 %f2, [madd6_param_4]; ld.param.u64 %rd4, [madd6_param_5]; ld.param.f32 %f3, [madd6_param_6]; ld.param.u64 %rd5, [madd6_param_7]; ld.param.f32 %f4, [madd6_param_8]; ld.param.u64 %rd6, [madd6_param_9]; ld.param.f32 %f5, [madd6_param_10]; ld.param.u64 %rd7, [madd6_param_11]; ld.param.f32 %f6, [madd6_param_12]; ld.param.u32 %r2, [madd6_param_13]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd2; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f7, [%rd10]; cvta.to.global.u64 %rd11, %rd3; add.s64 %rd12, %rd11, %rd9; ld.global.nc.f32 %f8, [%rd12]; mul.f32 %f9, %f8, %f2; fma.rn.f32 %f10, %f7, %f1, %f9; cvta.to.global.u64 %rd13, %rd4; add.s64 %rd14, %rd13, %rd9; ld.global.nc.f32 %f11, [%rd14]; fma.rn.f32 %f12, %f11, %f3, %f10; cvta.to.global.u64 %rd15, %rd5; add.s64 %rd16, %rd15, %rd9; ld.global.nc.f32 %f13, [%rd16]; fma.rn.f32 %f14, %f13, %f4, %f12; cvta.to.global.u64 %rd17, %rd6; add.s64 %rd18, %rd17, %rd9; ld.global.nc.f32 %f15, [%rd18]; fma.rn.f32 %f16, %f15, %f5, %f14; cvta.to.global.u64 %rd19, %rd7; add.s64 %rd20, %rd19, %rd9; ld.global.nc.f32 %f17, [%rd20]; fma.rn.f32 %f18, %f17, %f6, %f16; cvta.to.global.u64 %rd21, %rd1; add.s64 %rd22, %rd21, %rd9; st.global.f32 [%rd22], %f18; BB0_2: ret; } ` ) mumax3-3.10/cuda/madd7.cu000066400000000000000000000013521371432437400151210ustar00rootroot00000000000000 // dst[i] = src1[i] * fac1 + src2[i] * fac2 + src3[i] * fac3 + src4[i] * fac4 + src5[i] * fac5 + src6[i] * fac6 + src7[i] * fac7 extern "C" __global__ void madd7(float* __restrict__ dst, float* __restrict__ src1, float fac1, float* __restrict__ src2, float fac2, float* __restrict__ src3, float fac3, float* __restrict__ src4, float fac4, float* __restrict__ src5, float 
fac5, float* __restrict__ src6, float fac6, float* __restrict__ src7, float fac7, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if(i < N) { dst[i] = (fac1*src1[i]) + (fac2*src2[i]) + (fac3*src3[i]) + (fac4*src4[i]) + (fac5*src5[i]) + (fac6*src6[i]) + (fac7*src7[i]); } } mumax3-3.10/cuda/madd7_wrapper.go000066400000000000000000000704571371432437400166730ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for madd7 kernel var madd7_code cu.Function // Stores the arguments for madd7 kernel invocation type madd7_args_t struct { arg_dst unsafe.Pointer arg_src1 unsafe.Pointer arg_fac1 float32 arg_src2 unsafe.Pointer arg_fac2 float32 arg_src3 unsafe.Pointer arg_fac3 float32 arg_src4 unsafe.Pointer arg_fac4 float32 arg_src5 unsafe.Pointer arg_fac5 float32 arg_src6 unsafe.Pointer arg_fac6 float32 arg_src7 unsafe.Pointer arg_fac7 float32 arg_N int argptr [16]unsafe.Pointer sync.Mutex } // Stores the arguments for madd7 kernel invocation var madd7_args madd7_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
madd7_args.argptr[0] = unsafe.Pointer(&madd7_args.arg_dst) madd7_args.argptr[1] = unsafe.Pointer(&madd7_args.arg_src1) madd7_args.argptr[2] = unsafe.Pointer(&madd7_args.arg_fac1) madd7_args.argptr[3] = unsafe.Pointer(&madd7_args.arg_src2) madd7_args.argptr[4] = unsafe.Pointer(&madd7_args.arg_fac2) madd7_args.argptr[5] = unsafe.Pointer(&madd7_args.arg_src3) madd7_args.argptr[6] = unsafe.Pointer(&madd7_args.arg_fac3) madd7_args.argptr[7] = unsafe.Pointer(&madd7_args.arg_src4) madd7_args.argptr[8] = unsafe.Pointer(&madd7_args.arg_fac4) madd7_args.argptr[9] = unsafe.Pointer(&madd7_args.arg_src5) madd7_args.argptr[10] = unsafe.Pointer(&madd7_args.arg_fac5) madd7_args.argptr[11] = unsafe.Pointer(&madd7_args.arg_src6) madd7_args.argptr[12] = unsafe.Pointer(&madd7_args.arg_fac6) madd7_args.argptr[13] = unsafe.Pointer(&madd7_args.arg_src7) madd7_args.argptr[14] = unsafe.Pointer(&madd7_args.arg_fac7) madd7_args.argptr[15] = unsafe.Pointer(&madd7_args.arg_N) } // Wrapper for madd7 CUDA kernel, asynchronous. 
func k_madd7_async(dst unsafe.Pointer, src1 unsafe.Pointer, fac1 float32, src2 unsafe.Pointer, fac2 float32, src3 unsafe.Pointer, fac3 float32, src4 unsafe.Pointer, fac4 float32, src5 unsafe.Pointer, fac5 float32, src6 unsafe.Pointer, fac6 float32, src7 unsafe.Pointer, fac7 float32, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("madd7") } madd7_args.Lock() defer madd7_args.Unlock() if madd7_code == 0 { madd7_code = fatbinLoad(madd7_map, "madd7") } madd7_args.arg_dst = dst madd7_args.arg_src1 = src1 madd7_args.arg_fac1 = fac1 madd7_args.arg_src2 = src2 madd7_args.arg_fac2 = fac2 madd7_args.arg_src3 = src3 madd7_args.arg_fac3 = fac3 madd7_args.arg_src4 = src4 madd7_args.arg_fac4 = fac4 madd7_args.arg_src5 = src5 madd7_args.arg_fac5 = fac5 madd7_args.arg_src6 = src6 madd7_args.arg_fac6 = fac6 madd7_args.arg_src7 = src7 madd7_args.arg_fac7 = fac7 madd7_args.arg_N = N args := madd7_args.argptr[:] cu.LaunchKernel(madd7_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("madd7") } } // maps compute capability on PTX code for madd7 kernel. var madd7_map = map[int]string{0: "", 30: madd7_ptx_30, 35: madd7_ptx_35, 37: madd7_ptx_37, 50: madd7_ptx_50, 52: madd7_ptx_52, 53: madd7_ptx_53, 60: madd7_ptx_60, 61: madd7_ptx_61, 70: madd7_ptx_70, 75: madd7_ptx_75} // madd7 PTX code for various compute capabilities. 
const ( madd7_ptx_30 = ` .version 6.4 .target sm_30 .address_size 64 // .globl madd7 .visible .entry madd7( .param .u64 madd7_param_0, .param .u64 madd7_param_1, .param .f32 madd7_param_2, .param .u64 madd7_param_3, .param .f32 madd7_param_4, .param .u64 madd7_param_5, .param .f32 madd7_param_6, .param .u64 madd7_param_7, .param .f32 madd7_param_8, .param .u64 madd7_param_9, .param .f32 madd7_param_10, .param .u64 madd7_param_11, .param .f32 madd7_param_12, .param .u64 madd7_param_13, .param .f32 madd7_param_14, .param .u32 madd7_param_15 ) { .reg .pred %p<2>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<26>; ld.param.u64 %rd1, [madd7_param_0]; ld.param.u64 %rd2, [madd7_param_1]; ld.param.f32 %f1, [madd7_param_2]; ld.param.u64 %rd3, [madd7_param_3]; ld.param.f32 %f2, [madd7_param_4]; ld.param.u64 %rd4, [madd7_param_5]; ld.param.f32 %f3, [madd7_param_6]; ld.param.u64 %rd5, [madd7_param_7]; ld.param.f32 %f4, [madd7_param_8]; ld.param.u64 %rd6, [madd7_param_9]; ld.param.f32 %f5, [madd7_param_10]; ld.param.u64 %rd7, [madd7_param_11]; ld.param.f32 %f6, [madd7_param_12]; ld.param.u64 %rd8, [madd7_param_13]; ld.param.f32 %f7, [madd7_param_14]; ld.param.u32 %r2, [madd7_param_15]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd9, %rd2; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.f32 %f8, [%rd11]; cvta.to.global.u64 %rd12, %rd3; add.s64 %rd13, %rd12, %rd10; ld.global.f32 %f9, [%rd13]; mul.f32 %f10, %f9, %f2; fma.rn.f32 %f11, %f8, %f1, %f10; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd10; ld.global.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f3, %f11; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd10; ld.global.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f4, %f13; cvta.to.global.u64 %rd18, %rd6; add.s64 %rd19, %rd18, %rd10; ld.global.f32 
%f16, [%rd19]; fma.rn.f32 %f17, %f16, %f5, %f15; cvta.to.global.u64 %rd20, %rd7; add.s64 %rd21, %rd20, %rd10; ld.global.f32 %f18, [%rd21]; fma.rn.f32 %f19, %f18, %f6, %f17; cvta.to.global.u64 %rd22, %rd8; add.s64 %rd23, %rd22, %rd10; ld.global.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f7, %f19; cvta.to.global.u64 %rd24, %rd1; add.s64 %rd25, %rd24, %rd10; st.global.f32 [%rd25], %f21; BB0_2: ret; } ` madd7_ptx_35 = ` .version 6.4 .target sm_35 .address_size 64 // .globl madd7 .visible .entry madd7( .param .u64 madd7_param_0, .param .u64 madd7_param_1, .param .f32 madd7_param_2, .param .u64 madd7_param_3, .param .f32 madd7_param_4, .param .u64 madd7_param_5, .param .f32 madd7_param_6, .param .u64 madd7_param_7, .param .f32 madd7_param_8, .param .u64 madd7_param_9, .param .f32 madd7_param_10, .param .u64 madd7_param_11, .param .f32 madd7_param_12, .param .u64 madd7_param_13, .param .f32 madd7_param_14, .param .u32 madd7_param_15 ) { .reg .pred %p<2>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<26>; ld.param.u64 %rd1, [madd7_param_0]; ld.param.u64 %rd2, [madd7_param_1]; ld.param.f32 %f1, [madd7_param_2]; ld.param.u64 %rd3, [madd7_param_3]; ld.param.f32 %f2, [madd7_param_4]; ld.param.u64 %rd4, [madd7_param_5]; ld.param.f32 %f3, [madd7_param_6]; ld.param.u64 %rd5, [madd7_param_7]; ld.param.f32 %f4, [madd7_param_8]; ld.param.u64 %rd6, [madd7_param_9]; ld.param.f32 %f5, [madd7_param_10]; ld.param.u64 %rd7, [madd7_param_11]; ld.param.f32 %f6, [madd7_param_12]; ld.param.u64 %rd8, [madd7_param_13]; ld.param.f32 %f7, [madd7_param_14]; ld.param.u32 %r2, [madd7_param_15]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd9, %rd2; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f8, [%rd11]; cvta.to.global.u64 %rd12, %rd3; add.s64 %rd13, %rd12, %rd10; 
ld.global.nc.f32 %f9, [%rd13]; mul.f32 %f10, %f9, %f2; fma.rn.f32 %f11, %f8, %f1, %f10; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd10; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f3, %f11; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd10; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f4, %f13; cvta.to.global.u64 %rd18, %rd6; add.s64 %rd19, %rd18, %rd10; ld.global.nc.f32 %f16, [%rd19]; fma.rn.f32 %f17, %f16, %f5, %f15; cvta.to.global.u64 %rd20, %rd7; add.s64 %rd21, %rd20, %rd10; ld.global.nc.f32 %f18, [%rd21]; fma.rn.f32 %f19, %f18, %f6, %f17; cvta.to.global.u64 %rd22, %rd8; add.s64 %rd23, %rd22, %rd10; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f7, %f19; cvta.to.global.u64 %rd24, %rd1; add.s64 %rd25, %rd24, %rd10; st.global.f32 [%rd25], %f21; BB0_2: ret; } ` madd7_ptx_37 = ` .version 6.4 .target sm_37 .address_size 64 // .globl madd7 .visible .entry madd7( .param .u64 madd7_param_0, .param .u64 madd7_param_1, .param .f32 madd7_param_2, .param .u64 madd7_param_3, .param .f32 madd7_param_4, .param .u64 madd7_param_5, .param .f32 madd7_param_6, .param .u64 madd7_param_7, .param .f32 madd7_param_8, .param .u64 madd7_param_9, .param .f32 madd7_param_10, .param .u64 madd7_param_11, .param .f32 madd7_param_12, .param .u64 madd7_param_13, .param .f32 madd7_param_14, .param .u32 madd7_param_15 ) { .reg .pred %p<2>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<26>; ld.param.u64 %rd1, [madd7_param_0]; ld.param.u64 %rd2, [madd7_param_1]; ld.param.f32 %f1, [madd7_param_2]; ld.param.u64 %rd3, [madd7_param_3]; ld.param.f32 %f2, [madd7_param_4]; ld.param.u64 %rd4, [madd7_param_5]; ld.param.f32 %f3, [madd7_param_6]; ld.param.u64 %rd5, [madd7_param_7]; ld.param.f32 %f4, [madd7_param_8]; ld.param.u64 %rd6, [madd7_param_9]; ld.param.f32 %f5, [madd7_param_10]; ld.param.u64 %rd7, [madd7_param_11]; ld.param.f32 %f6, [madd7_param_12]; ld.param.u64 %rd8, [madd7_param_13]; ld.param.f32 %f7, [madd7_param_14]; ld.param.u32 %r2, 
[madd7_param_15]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd9, %rd2; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f8, [%rd11]; cvta.to.global.u64 %rd12, %rd3; add.s64 %rd13, %rd12, %rd10; ld.global.nc.f32 %f9, [%rd13]; mul.f32 %f10, %f9, %f2; fma.rn.f32 %f11, %f8, %f1, %f10; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd10; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f3, %f11; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd10; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f4, %f13; cvta.to.global.u64 %rd18, %rd6; add.s64 %rd19, %rd18, %rd10; ld.global.nc.f32 %f16, [%rd19]; fma.rn.f32 %f17, %f16, %f5, %f15; cvta.to.global.u64 %rd20, %rd7; add.s64 %rd21, %rd20, %rd10; ld.global.nc.f32 %f18, [%rd21]; fma.rn.f32 %f19, %f18, %f6, %f17; cvta.to.global.u64 %rd22, %rd8; add.s64 %rd23, %rd22, %rd10; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f7, %f19; cvta.to.global.u64 %rd24, %rd1; add.s64 %rd25, %rd24, %rd10; st.global.f32 [%rd25], %f21; BB0_2: ret; } ` madd7_ptx_50 = ` .version 6.4 .target sm_50 .address_size 64 // .globl madd7 .visible .entry madd7( .param .u64 madd7_param_0, .param .u64 madd7_param_1, .param .f32 madd7_param_2, .param .u64 madd7_param_3, .param .f32 madd7_param_4, .param .u64 madd7_param_5, .param .f32 madd7_param_6, .param .u64 madd7_param_7, .param .f32 madd7_param_8, .param .u64 madd7_param_9, .param .f32 madd7_param_10, .param .u64 madd7_param_11, .param .f32 madd7_param_12, .param .u64 madd7_param_13, .param .f32 madd7_param_14, .param .u32 madd7_param_15 ) { .reg .pred %p<2>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<26>; ld.param.u64 %rd1, [madd7_param_0]; ld.param.u64 %rd2, [madd7_param_1]; ld.param.f32 %f1, [madd7_param_2]; ld.param.u64 %rd3, [madd7_param_3]; 
ld.param.f32 %f2, [madd7_param_4]; ld.param.u64 %rd4, [madd7_param_5]; ld.param.f32 %f3, [madd7_param_6]; ld.param.u64 %rd5, [madd7_param_7]; ld.param.f32 %f4, [madd7_param_8]; ld.param.u64 %rd6, [madd7_param_9]; ld.param.f32 %f5, [madd7_param_10]; ld.param.u64 %rd7, [madd7_param_11]; ld.param.f32 %f6, [madd7_param_12]; ld.param.u64 %rd8, [madd7_param_13]; ld.param.f32 %f7, [madd7_param_14]; ld.param.u32 %r2, [madd7_param_15]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd9, %rd2; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f8, [%rd11]; cvta.to.global.u64 %rd12, %rd3; add.s64 %rd13, %rd12, %rd10; ld.global.nc.f32 %f9, [%rd13]; mul.f32 %f10, %f9, %f2; fma.rn.f32 %f11, %f8, %f1, %f10; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd10; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f3, %f11; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd10; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f4, %f13; cvta.to.global.u64 %rd18, %rd6; add.s64 %rd19, %rd18, %rd10; ld.global.nc.f32 %f16, [%rd19]; fma.rn.f32 %f17, %f16, %f5, %f15; cvta.to.global.u64 %rd20, %rd7; add.s64 %rd21, %rd20, %rd10; ld.global.nc.f32 %f18, [%rd21]; fma.rn.f32 %f19, %f18, %f6, %f17; cvta.to.global.u64 %rd22, %rd8; add.s64 %rd23, %rd22, %rd10; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f7, %f19; cvta.to.global.u64 %rd24, %rd1; add.s64 %rd25, %rd24, %rd10; st.global.f32 [%rd25], %f21; BB0_2: ret; } ` madd7_ptx_52 = ` .version 6.4 .target sm_52 .address_size 64 // .globl madd7 .visible .entry madd7( .param .u64 madd7_param_0, .param .u64 madd7_param_1, .param .f32 madd7_param_2, .param .u64 madd7_param_3, .param .f32 madd7_param_4, .param .u64 madd7_param_5, .param .f32 madd7_param_6, .param .u64 madd7_param_7, .param .f32 madd7_param_8, 
.param .u64 madd7_param_9, .param .f32 madd7_param_10, .param .u64 madd7_param_11, .param .f32 madd7_param_12, .param .u64 madd7_param_13, .param .f32 madd7_param_14, .param .u32 madd7_param_15 ) { .reg .pred %p<2>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<26>; ld.param.u64 %rd1, [madd7_param_0]; ld.param.u64 %rd2, [madd7_param_1]; ld.param.f32 %f1, [madd7_param_2]; ld.param.u64 %rd3, [madd7_param_3]; ld.param.f32 %f2, [madd7_param_4]; ld.param.u64 %rd4, [madd7_param_5]; ld.param.f32 %f3, [madd7_param_6]; ld.param.u64 %rd5, [madd7_param_7]; ld.param.f32 %f4, [madd7_param_8]; ld.param.u64 %rd6, [madd7_param_9]; ld.param.f32 %f5, [madd7_param_10]; ld.param.u64 %rd7, [madd7_param_11]; ld.param.f32 %f6, [madd7_param_12]; ld.param.u64 %rd8, [madd7_param_13]; ld.param.f32 %f7, [madd7_param_14]; ld.param.u32 %r2, [madd7_param_15]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd9, %rd2; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f8, [%rd11]; cvta.to.global.u64 %rd12, %rd3; add.s64 %rd13, %rd12, %rd10; ld.global.nc.f32 %f9, [%rd13]; mul.f32 %f10, %f9, %f2; fma.rn.f32 %f11, %f8, %f1, %f10; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd10; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f3, %f11; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd10; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f4, %f13; cvta.to.global.u64 %rd18, %rd6; add.s64 %rd19, %rd18, %rd10; ld.global.nc.f32 %f16, [%rd19]; fma.rn.f32 %f17, %f16, %f5, %f15; cvta.to.global.u64 %rd20, %rd7; add.s64 %rd21, %rd20, %rd10; ld.global.nc.f32 %f18, [%rd21]; fma.rn.f32 %f19, %f18, %f6, %f17; cvta.to.global.u64 %rd22, %rd8; add.s64 %rd23, %rd22, %rd10; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f7, %f19; cvta.to.global.u64 %rd24, %rd1; 
add.s64 %rd25, %rd24, %rd10; st.global.f32 [%rd25], %f21; BB0_2: ret; } ` madd7_ptx_53 = ` .version 6.4 .target sm_53 .address_size 64 // .globl madd7 .visible .entry madd7( .param .u64 madd7_param_0, .param .u64 madd7_param_1, .param .f32 madd7_param_2, .param .u64 madd7_param_3, .param .f32 madd7_param_4, .param .u64 madd7_param_5, .param .f32 madd7_param_6, .param .u64 madd7_param_7, .param .f32 madd7_param_8, .param .u64 madd7_param_9, .param .f32 madd7_param_10, .param .u64 madd7_param_11, .param .f32 madd7_param_12, .param .u64 madd7_param_13, .param .f32 madd7_param_14, .param .u32 madd7_param_15 ) { .reg .pred %p<2>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<26>; ld.param.u64 %rd1, [madd7_param_0]; ld.param.u64 %rd2, [madd7_param_1]; ld.param.f32 %f1, [madd7_param_2]; ld.param.u64 %rd3, [madd7_param_3]; ld.param.f32 %f2, [madd7_param_4]; ld.param.u64 %rd4, [madd7_param_5]; ld.param.f32 %f3, [madd7_param_6]; ld.param.u64 %rd5, [madd7_param_7]; ld.param.f32 %f4, [madd7_param_8]; ld.param.u64 %rd6, [madd7_param_9]; ld.param.f32 %f5, [madd7_param_10]; ld.param.u64 %rd7, [madd7_param_11]; ld.param.f32 %f6, [madd7_param_12]; ld.param.u64 %rd8, [madd7_param_13]; ld.param.f32 %f7, [madd7_param_14]; ld.param.u32 %r2, [madd7_param_15]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd9, %rd2; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f8, [%rd11]; cvta.to.global.u64 %rd12, %rd3; add.s64 %rd13, %rd12, %rd10; ld.global.nc.f32 %f9, [%rd13]; mul.f32 %f10, %f9, %f2; fma.rn.f32 %f11, %f8, %f1, %f10; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd10; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f3, %f11; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd10; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f4, %f13; 
cvta.to.global.u64 %rd18, %rd6; add.s64 %rd19, %rd18, %rd10; ld.global.nc.f32 %f16, [%rd19]; fma.rn.f32 %f17, %f16, %f5, %f15; cvta.to.global.u64 %rd20, %rd7; add.s64 %rd21, %rd20, %rd10; ld.global.nc.f32 %f18, [%rd21]; fma.rn.f32 %f19, %f18, %f6, %f17; cvta.to.global.u64 %rd22, %rd8; add.s64 %rd23, %rd22, %rd10; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f7, %f19; cvta.to.global.u64 %rd24, %rd1; add.s64 %rd25, %rd24, %rd10; st.global.f32 [%rd25], %f21; BB0_2: ret; } ` madd7_ptx_60 = ` .version 6.4 .target sm_60 .address_size 64 // .globl madd7 .visible .entry madd7( .param .u64 madd7_param_0, .param .u64 madd7_param_1, .param .f32 madd7_param_2, .param .u64 madd7_param_3, .param .f32 madd7_param_4, .param .u64 madd7_param_5, .param .f32 madd7_param_6, .param .u64 madd7_param_7, .param .f32 madd7_param_8, .param .u64 madd7_param_9, .param .f32 madd7_param_10, .param .u64 madd7_param_11, .param .f32 madd7_param_12, .param .u64 madd7_param_13, .param .f32 madd7_param_14, .param .u32 madd7_param_15 ) { .reg .pred %p<2>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<26>; ld.param.u64 %rd1, [madd7_param_0]; ld.param.u64 %rd2, [madd7_param_1]; ld.param.f32 %f1, [madd7_param_2]; ld.param.u64 %rd3, [madd7_param_3]; ld.param.f32 %f2, [madd7_param_4]; ld.param.u64 %rd4, [madd7_param_5]; ld.param.f32 %f3, [madd7_param_6]; ld.param.u64 %rd5, [madd7_param_7]; ld.param.f32 %f4, [madd7_param_8]; ld.param.u64 %rd6, [madd7_param_9]; ld.param.f32 %f5, [madd7_param_10]; ld.param.u64 %rd7, [madd7_param_11]; ld.param.f32 %f6, [madd7_param_12]; ld.param.u64 %rd8, [madd7_param_13]; ld.param.f32 %f7, [madd7_param_14]; ld.param.u32 %r2, [madd7_param_15]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd9, %rd2; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 
%f8, [%rd11]; cvta.to.global.u64 %rd12, %rd3; add.s64 %rd13, %rd12, %rd10; ld.global.nc.f32 %f9, [%rd13]; mul.f32 %f10, %f9, %f2; fma.rn.f32 %f11, %f8, %f1, %f10; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd10; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f3, %f11; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd10; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f4, %f13; cvta.to.global.u64 %rd18, %rd6; add.s64 %rd19, %rd18, %rd10; ld.global.nc.f32 %f16, [%rd19]; fma.rn.f32 %f17, %f16, %f5, %f15; cvta.to.global.u64 %rd20, %rd7; add.s64 %rd21, %rd20, %rd10; ld.global.nc.f32 %f18, [%rd21]; fma.rn.f32 %f19, %f18, %f6, %f17; cvta.to.global.u64 %rd22, %rd8; add.s64 %rd23, %rd22, %rd10; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f7, %f19; cvta.to.global.u64 %rd24, %rd1; add.s64 %rd25, %rd24, %rd10; st.global.f32 [%rd25], %f21; BB0_2: ret; } ` madd7_ptx_61 = ` .version 6.4 .target sm_61 .address_size 64 // .globl madd7 .visible .entry madd7( .param .u64 madd7_param_0, .param .u64 madd7_param_1, .param .f32 madd7_param_2, .param .u64 madd7_param_3, .param .f32 madd7_param_4, .param .u64 madd7_param_5, .param .f32 madd7_param_6, .param .u64 madd7_param_7, .param .f32 madd7_param_8, .param .u64 madd7_param_9, .param .f32 madd7_param_10, .param .u64 madd7_param_11, .param .f32 madd7_param_12, .param .u64 madd7_param_13, .param .f32 madd7_param_14, .param .u32 madd7_param_15 ) { .reg .pred %p<2>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<26>; ld.param.u64 %rd1, [madd7_param_0]; ld.param.u64 %rd2, [madd7_param_1]; ld.param.f32 %f1, [madd7_param_2]; ld.param.u64 %rd3, [madd7_param_3]; ld.param.f32 %f2, [madd7_param_4]; ld.param.u64 %rd4, [madd7_param_5]; ld.param.f32 %f3, [madd7_param_6]; ld.param.u64 %rd5, [madd7_param_7]; ld.param.f32 %f4, [madd7_param_8]; ld.param.u64 %rd6, [madd7_param_9]; ld.param.f32 %f5, [madd7_param_10]; ld.param.u64 %rd7, [madd7_param_11]; ld.param.f32 %f6, [madd7_param_12]; ld.param.u64 
%rd8, [madd7_param_13]; ld.param.f32 %f7, [madd7_param_14]; ld.param.u32 %r2, [madd7_param_15]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd9, %rd2; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f8, [%rd11]; cvta.to.global.u64 %rd12, %rd3; add.s64 %rd13, %rd12, %rd10; ld.global.nc.f32 %f9, [%rd13]; mul.f32 %f10, %f9, %f2; fma.rn.f32 %f11, %f8, %f1, %f10; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd10; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f3, %f11; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd10; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f4, %f13; cvta.to.global.u64 %rd18, %rd6; add.s64 %rd19, %rd18, %rd10; ld.global.nc.f32 %f16, [%rd19]; fma.rn.f32 %f17, %f16, %f5, %f15; cvta.to.global.u64 %rd20, %rd7; add.s64 %rd21, %rd20, %rd10; ld.global.nc.f32 %f18, [%rd21]; fma.rn.f32 %f19, %f18, %f6, %f17; cvta.to.global.u64 %rd22, %rd8; add.s64 %rd23, %rd22, %rd10; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f7, %f19; cvta.to.global.u64 %rd24, %rd1; add.s64 %rd25, %rd24, %rd10; st.global.f32 [%rd25], %f21; BB0_2: ret; } ` madd7_ptx_70 = ` .version 6.4 .target sm_70 .address_size 64 // .globl madd7 .visible .entry madd7( .param .u64 madd7_param_0, .param .u64 madd7_param_1, .param .f32 madd7_param_2, .param .u64 madd7_param_3, .param .f32 madd7_param_4, .param .u64 madd7_param_5, .param .f32 madd7_param_6, .param .u64 madd7_param_7, .param .f32 madd7_param_8, .param .u64 madd7_param_9, .param .f32 madd7_param_10, .param .u64 madd7_param_11, .param .f32 madd7_param_12, .param .u64 madd7_param_13, .param .f32 madd7_param_14, .param .u32 madd7_param_15 ) { .reg .pred %p<2>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<26>; ld.param.u64 %rd1, [madd7_param_0]; ld.param.u64 %rd2, 
[madd7_param_1]; ld.param.f32 %f1, [madd7_param_2]; ld.param.u64 %rd3, [madd7_param_3]; ld.param.f32 %f2, [madd7_param_4]; ld.param.u64 %rd4, [madd7_param_5]; ld.param.f32 %f3, [madd7_param_6]; ld.param.u64 %rd5, [madd7_param_7]; ld.param.f32 %f4, [madd7_param_8]; ld.param.u64 %rd6, [madd7_param_9]; ld.param.f32 %f5, [madd7_param_10]; ld.param.u64 %rd7, [madd7_param_11]; ld.param.f32 %f6, [madd7_param_12]; ld.param.u64 %rd8, [madd7_param_13]; ld.param.f32 %f7, [madd7_param_14]; ld.param.u32 %r2, [madd7_param_15]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd9, %rd2; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f8, [%rd11]; cvta.to.global.u64 %rd12, %rd3; add.s64 %rd13, %rd12, %rd10; ld.global.nc.f32 %f9, [%rd13]; mul.f32 %f10, %f9, %f2; fma.rn.f32 %f11, %f8, %f1, %f10; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd10; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f3, %f11; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd10; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f4, %f13; cvta.to.global.u64 %rd18, %rd6; add.s64 %rd19, %rd18, %rd10; ld.global.nc.f32 %f16, [%rd19]; fma.rn.f32 %f17, %f16, %f5, %f15; cvta.to.global.u64 %rd20, %rd7; add.s64 %rd21, %rd20, %rd10; ld.global.nc.f32 %f18, [%rd21]; fma.rn.f32 %f19, %f18, %f6, %f17; cvta.to.global.u64 %rd22, %rd8; add.s64 %rd23, %rd22, %rd10; ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f7, %f19; cvta.to.global.u64 %rd24, %rd1; add.s64 %rd25, %rd24, %rd10; st.global.f32 [%rd25], %f21; BB0_2: ret; } ` madd7_ptx_75 = ` .version 6.4 .target sm_75 .address_size 64 // .globl madd7 .visible .entry madd7( .param .u64 madd7_param_0, .param .u64 madd7_param_1, .param .f32 madd7_param_2, .param .u64 madd7_param_3, .param .f32 madd7_param_4, .param .u64 
madd7_param_5, .param .f32 madd7_param_6, .param .u64 madd7_param_7, .param .f32 madd7_param_8, .param .u64 madd7_param_9, .param .f32 madd7_param_10, .param .u64 madd7_param_11, .param .f32 madd7_param_12, .param .u64 madd7_param_13, .param .f32 madd7_param_14, .param .u32 madd7_param_15 ) { .reg .pred %p<2>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<26>; ld.param.u64 %rd1, [madd7_param_0]; ld.param.u64 %rd2, [madd7_param_1]; ld.param.f32 %f1, [madd7_param_2]; ld.param.u64 %rd3, [madd7_param_3]; ld.param.f32 %f2, [madd7_param_4]; ld.param.u64 %rd4, [madd7_param_5]; ld.param.f32 %f3, [madd7_param_6]; ld.param.u64 %rd5, [madd7_param_7]; ld.param.f32 %f4, [madd7_param_8]; ld.param.u64 %rd6, [madd7_param_9]; ld.param.f32 %f5, [madd7_param_10]; ld.param.u64 %rd7, [madd7_param_11]; ld.param.f32 %f6, [madd7_param_12]; ld.param.u64 %rd8, [madd7_param_13]; ld.param.f32 %f7, [madd7_param_14]; ld.param.u32 %r2, [madd7_param_15]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd9, %rd2; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f8, [%rd11]; cvta.to.global.u64 %rd12, %rd3; add.s64 %rd13, %rd12, %rd10; ld.global.nc.f32 %f9, [%rd13]; mul.f32 %f10, %f9, %f2; fma.rn.f32 %f11, %f8, %f1, %f10; cvta.to.global.u64 %rd14, %rd4; add.s64 %rd15, %rd14, %rd10; ld.global.nc.f32 %f12, [%rd15]; fma.rn.f32 %f13, %f12, %f3, %f11; cvta.to.global.u64 %rd16, %rd5; add.s64 %rd17, %rd16, %rd10; ld.global.nc.f32 %f14, [%rd17]; fma.rn.f32 %f15, %f14, %f4, %f13; cvta.to.global.u64 %rd18, %rd6; add.s64 %rd19, %rd18, %rd10; ld.global.nc.f32 %f16, [%rd19]; fma.rn.f32 %f17, %f16, %f5, %f15; cvta.to.global.u64 %rd20, %rd7; add.s64 %rd21, %rd20, %rd10; ld.global.nc.f32 %f18, [%rd21]; fma.rn.f32 %f19, %f18, %f6, %f17; cvta.to.global.u64 %rd22, %rd8; add.s64 %rd23, %rd22, %rd10; 
ld.global.nc.f32 %f20, [%rd23]; fma.rn.f32 %f21, %f20, %f7, %f19; cvta.to.global.u64 %rd24, %rd1; add.s64 %rd25, %rd24, %rd10; st.global.f32 [%rd25], %f21; BB0_2: ret; } ` )
mumax3-3.10/cuda/magnetoelastic.go000066400000000000000000000031131371432437400171170ustar00rootroot00000000000000package cuda

import (
	"github.com/mumax/3/data"
	"github.com/mumax/3/util"
)

// AddMagnetoelasticField adds the magneto-elastic coupling field to the
// effective field Beff.
//
// m and the six strain components must have the same size as Beff; this is
// checked with util.Argument. Every MSlice is handed to the kernel as its
// component-0 device pointer together with its component-0 multiplier.
// See magnetoelasticfield.cu.
func AddMagnetoelasticField(Beff, m *data.Slice, exx, eyy, ezz, exy, exz, eyz, B1, B2, Msat MSlice) {
	size := Beff.Size()
	util.Argument(size == m.Size())
	for _, strain := range []MSlice{exx, eyy, ezz, exy, exz, eyz} {
		util.Argument(size == strain.Size())
	}

	n := Beff.Len()
	cfg := make1DConf(n)

	k_addmagnetoelasticfield_async(
		Beff.DevPtr(X), Beff.DevPtr(Y), Beff.DevPtr(Z),
		m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z),
		exx.DevPtr(0), exx.Mul(0),
		eyy.DevPtr(0), eyy.Mul(0),
		ezz.DevPtr(0), ezz.Mul(0),
		exy.DevPtr(0), exy.Mul(0),
		exz.DevPtr(0), exz.Mul(0),
		eyz.DevPtr(0), eyz.Mul(0),
		B1.DevPtr(0), B1.Mul(0),
		B2.DevPtr(0), B2.Mul(0),
		Msat.DevPtr(0), Msat.Mul(0),
		n, cfg)
}

// GetMagnetoelasticForceDensity calculates the magneto-elastic force density
// and stores it in out, which must have the same size as m.
//
// The kernel receives the reciprocal cell sizes, the mesh dimensions and the
// mesh's PBC code.
// See magnetoelasticforce.cu.
func GetMagnetoelasticForceDensity(out, m *data.Slice, B1, B2 MSlice, mesh *data.Mesh) {
	util.Argument(out.Size() == m.Size())

	cell := mesh.CellSize()
	n := mesh.Size()
	cfg := make3DConf(n)

	k_getmagnetoelasticforce_async(
		out.DevPtr(X), out.DevPtr(Y), out.DevPtr(Z),
		m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z),
		B1.DevPtr(0), B1.Mul(0),
		B2.DevPtr(0), B2.Mul(0),
		float32(1.0/cell[X]), float32(1.0/cell[Y]), float32(1.0/cell[Z]),
		n[X], n[Y], n[Z],
		mesh.PBC_code(), cfg)
}
mumax3-3.10/cuda/magnetoelasticfield.cu000066400000000000000000000033211371432437400201260ustar00rootroot00000000000000#include #include #include "amul.h"
#include "float3.h"

// Add magneto-elastic coupling field to B.
// H = - δUmel / δM,
// where Umel is the magneto-elastic energy density given by eq. (12.18) of
// Gurevich & Melkov, "Magnetization Oscillations and Waves", CRC Press, 1996.
//
// Each strain component, B1, B2 and Ms is passed as an (array, multiplier)
// pair and read through amul / inv_Msat.
// NOTE(review): amul presumably yields array[I]*mul, or mul alone when the
// array pointer is NULL, and inv_Msat presumably yields 1/Ms; confirm in amul.h.
extern "C" __global__ void
addmagnetoelasticfield(float* __restrict__  Bx, float* __restrict__  By, float* __restrict__  Bz,
                       float* __restrict__  mx, float* __restrict__  my, float* __restrict__  mz,
                       float* __restrict__ exx_, float exx_mul,
                       float* __restrict__ eyy_, float eyy_mul,
                       float* __restrict__ ezz_, float ezz_mul,
                       float* __restrict__ exy_, float exy_mul,
                       float* __restrict__ exz_, float exz_mul,
                       float* __restrict__ eyz_, float eyz_mul,
                       float* __restrict__ B1_, float B1_mul,
                       float* __restrict__ B2_, float B2_mul,
                       float* __restrict__ Ms_, float Ms_mul,
                       int N) {

    int I = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x;

    // Threads past the end of the arrays have nothing to do.
    if (I >= N) {
        return;
    }

    // Strain components. The off-diagonal ones are used symmetrically
    // (yx = xy, zx = xz, zy = yz), so only three of them are fetched.
    float Exx = amul(exx_, exx_mul, I);
    float Eyy = amul(eyy_, eyy_mul, I);
    float Ezz = amul(ezz_, ezz_mul, I);
    float Exy = amul(exy_, exy_mul, I);
    float Exz = amul(exz_, exz_mul, I);
    float Eyz = amul(eyz_, eyz_mul, I);

    // Coupling constants, scaled by the inverse saturation magnetization.
    float invMs = inv_Msat(Ms_, Ms_mul, I);
    float B1 = amul(B1_, B1_mul, I) * invMs;
    float B2 = amul(B2_, B2_mul, I) * invMs;

    float3 m = {mx[I], my[I], mz[I]};

    Bx[I] += -(2.0f*B1*m.x*Exx + B2*(m.y*Exy + m.z*Exz));
    By[I] += -(2.0f*B1*m.y*Eyy + B2*(m.x*Exy + m.z*Eyz));
    Bz[I] += -(2.0f*B1*m.z*Ezz + B2*(m.x*Exz + m.y*Eyz));
}
mumax3-3.10/cuda/magnetoelasticfield_wrapper.go000066400000000000000000002602231371432437400216720ustar00rootroot00000000000000package cuda

/*
 THIS FILE IS AUTO-GENERATED BY CUDA2GO.
 EDITING IS FUTILE.
*/

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// addmagnetoelasticfield_code is the CUDA function handle of the
// addmagnetoelasticfield kernel. It keeps its zero value until the wrapper
// loads the kernel on first use.
var addmagnetoelasticfield_code cu.Function

// addmagnetoelasticfield_args_t is the staging area for one
// addmagnetoelasticfield kernel launch. The arg_* fields are declared in the
// same order as the kernel parameters, argptr holds one pointer per arg_*
// field, and the embedded mutex serializes access to the whole structure.
type addmagnetoelasticfield_args_t struct {
	arg_Bx      unsafe.Pointer
	arg_By      unsafe.Pointer
	arg_Bz      unsafe.Pointer
	arg_mx      unsafe.Pointer
	arg_my      unsafe.Pointer
	arg_mz      unsafe.Pointer
	arg_exx_    unsafe.Pointer
	arg_exx_mul float32
	arg_eyy_    unsafe.Pointer
	arg_eyy_mul float32
	arg_ezz_    unsafe.Pointer
	arg_ezz_mul float32
	arg_exy_    unsafe.Pointer
	arg_exy_mul float32
	arg_exz_    unsafe.Pointer
	arg_exz_mul float32
	arg_eyz_    unsafe.Pointer
	arg_eyz_mul float32
	arg_B1_     unsafe.Pointer
	arg_B1_mul  float32
	arg_B2_     unsafe.Pointer
	arg_B2_mul  float32
	arg_Ms_     unsafe.Pointer
	arg_Ms_mul  float32
	// NOTE(review): declared as Go int while the PTX declares the matching
	// parameter as .u32; presumably only the low 4 bytes are read. Confirm.
	arg_N  int
	argptr [25]unsafe.Pointer
	sync.Mutex
}

// addmagnetoelasticfield_args is the single, package-wide argument block
// shared by all addmagnetoelasticfield launches.
var addmagnetoelasticfield_args addmagnetoelasticfield_args_t

func init() {
	// The CUDA driver launch call takes an array of pointers to the
	// arguments. The arg_* fields never move, so the table is built once.
	a := &addmagnetoelasticfield_args
	a.argptr = [25]unsafe.Pointer{
		unsafe.Pointer(&a.arg_Bx),
		unsafe.Pointer(&a.arg_By),
		unsafe.Pointer(&a.arg_Bz),
		unsafe.Pointer(&a.arg_mx),
		unsafe.Pointer(&a.arg_my),
		unsafe.Pointer(&a.arg_mz),
		unsafe.Pointer(&a.arg_exx_),
		unsafe.Pointer(&a.arg_exx_mul),
		unsafe.Pointer(&a.arg_eyy_),
		unsafe.Pointer(&a.arg_eyy_mul),
		unsafe.Pointer(&a.arg_ezz_),
		unsafe.Pointer(&a.arg_ezz_mul),
		unsafe.Pointer(&a.arg_exy_),
		unsafe.Pointer(&a.arg_exy_mul),
		unsafe.Pointer(&a.arg_exz_),
		unsafe.Pointer(&a.arg_exz_mul),
		unsafe.Pointer(&a.arg_eyz_),
		unsafe.Pointer(&a.arg_eyz_mul),
		unsafe.Pointer(&a.arg_B1_),
		unsafe.Pointer(&a.arg_B1_mul),
		unsafe.Pointer(&a.arg_B2_),
		unsafe.Pointer(&a.arg_B2_mul),
		unsafe.Pointer(&a.arg_Ms_),
		unsafe.Pointer(&a.arg_Ms_mul),
		unsafe.Pointer(&a.arg_N),
	}
}

// k_addmagnetoelasticfield_async is the asynchronous wrapper for the
// addmagnetoelasticfield CUDA kernel.
//
// The arguments are copied into the shared argument block while its mutex is
// held, the kernel is loaded on first use, and the launch is enqueued on
// stream0 with the grid and block dimensions taken from cfg.
// When Synchronous is set (debug), the device is synced before and after the
// launch and the call is timed under the name "addmagnetoelasticfield".
func k_addmagnetoelasticfield_async(Bx, By, Bz, mx, my, mz unsafe.Pointer,
	exx_ unsafe.Pointer, exx_mul float32,
	eyy_ unsafe.Pointer, eyy_mul float32,
	ezz_ unsafe.Pointer, ezz_mul float32,
	exy_ unsafe.Pointer, exy_mul float32,
	exz_ unsafe.Pointer, exz_mul float32,
	eyz_ unsafe.Pointer, eyz_mul float32,
	B1_ unsafe.Pointer, B1_mul float32,
	B2_ unsafe.Pointer, B2_mul float32,
	Ms_ unsafe.Pointer, Ms_mul float32,
	N int, cfg *config) {

	if Synchronous { // debug
		Sync()
		timer.Start("addmagnetoelasticfield")
	}

	a := &addmagnetoelasticfield_args
	a.Lock()
	defer a.Unlock()

	// Lazily load the kernel code the first time it is needed.
	if addmagnetoelasticfield_code == 0 {
		addmagnetoelasticfield_code = fatbinLoad(addmagnetoelasticfield_map, "addmagnetoelasticfield")
	}

	// Stage the arguments; a.argptr already points at each of these fields.
	a.arg_Bx, a.arg_By, a.arg_Bz = Bx, By, Bz
	a.arg_mx, a.arg_my, a.arg_mz = mx, my, mz
	a.arg_exx_, a.arg_exx_mul = exx_, exx_mul
	a.arg_eyy_, a.arg_eyy_mul = eyy_, eyy_mul
	a.arg_ezz_, a.arg_ezz_mul = ezz_, ezz_mul
	a.arg_exy_, a.arg_exy_mul = exy_, exy_mul
	a.arg_exz_, a.arg_exz_mul = exz_, exz_mul
	a.arg_eyz_, a.arg_eyz_mul = eyz_, eyz_mul
	a.arg_B1_, a.arg_B1_mul = B1_, B1_mul
	a.arg_B2_, a.arg_B2_mul = B2_, B2_mul
	a.arg_Ms_, a.arg_Ms_mul = Ms_, Ms_mul
	a.arg_N = N

	cu.LaunchKernel(addmagnetoelasticfield_code,
		cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z,
		cfg.Block.X, cfg.Block.Y, cfg.Block.Z,
		0, stream0, a.argptr[:])

	if Synchronous { // debug
		Sync()
		timer.Stop("addmagnetoelasticfield")
	}
}

// addmagnetoelasticfield_map maps a compute capability onto the PTX code of
// the addmagnetoelasticfield kernel built for it. Key 0 maps to the empty
// string.
var addmagnetoelasticfield_map = map[int]string{
	0:  "",
	30: addmagnetoelasticfield_ptx_30,
	32: addmagnetoelasticfield_ptx_32,
	35: addmagnetoelasticfield_ptx_35,
	37: addmagnetoelasticfield_ptx_37,
	50: addmagnetoelasticfield_ptx_50,
	52: addmagnetoelasticfield_ptx_52,
	53: addmagnetoelasticfield_ptx_53,
	60: addmagnetoelasticfield_ptx_60,
	61: addmagnetoelasticfield_ptx_61,
	62: addmagnetoelasticfield_ptx_62,
	70: addmagnetoelasticfield_ptx_70,
	72: addmagnetoelasticfield_ptx_72,
	75: addmagnetoelasticfield_ptx_75,
}

// addmagnetoelasticfield PTX code for various compute capabilities.
const ( addmagnetoelasticfield_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, .param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<77>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, [addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, [addmagnetoelasticfield_param_6]; ld.param.f32 %f67, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f68, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; ld.param.f32 %f69, [addmagnetoelasticfield_param_11]; 
ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f70, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f71, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f72, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f75, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f76, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f73, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.f32 %f30, [%rd18]; mul.f32 %f67, %f30, %f67; BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.f32 %f31, [%rd21]; mul.f32 %f68, %f31, %f68; BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.f32 %f32, [%rd24]; mul.f32 %f69, %f32, %f69; BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.f32 %f33, [%rd27]; mul.f32 %f70, %f33, %f70; BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.f32 %f34, [%rd30]; mul.f32 %f71, %f34, %f71; BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 
4; add.s64 %rd33, %rd31, %rd32; ld.global.f32 %f35, [%rd33]; mul.f32 %f72, %f35, %f72; BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.f32 %f36, [%rd36]; mul.f32 %f73, %f36, %f73; BB0_15: setp.eq.f32 %p9, %f73, 0f00000000; mov.f32 %f74, 0f00000000; @%p9 bra BB0_17; rcp.rn.f32 %f74, %f73; BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.f32 %f38, [%rd39]; mul.f32 %f75, %f38, %f75; BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.f32 %f39, [%rd42]; mul.f32 %f76, %f39, %f76; BB0_21: cvta.to.global.u64 %rd43, %rd4; mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; mul.f32 %f40, %f74, %f75; fma.rn.f32 %f41, %f74, %f75, %f40; ld.global.f32 %f42, [%rd45]; mul.f32 %f43, %f41, %f42; ld.global.f32 %f44, [%rd47]; ld.global.f32 %f45, [%rd49]; mul.f32 %f46, %f71, %f45; fma.rn.f32 %f47, %f70, %f44, %f46; mul.f32 %f48, %f74, %f76; mul.f32 %f49, %f48, %f47; fma.rn.f32 %f50, %f67, %f43, %f49; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, %rd44; ld.global.f32 %f51, [%rd51]; sub.f32 %f52, %f51, %f50; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f41, %f44; mul.f32 %f54, %f72, %f45; fma.rn.f32 %f55, %f70, %f42, %f54; mul.f32 %f56, %f48, %f55; fma.rn.f32 %f57, %f68, %f53, %f56; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f58, [%rd53]; sub.f32 %f59, %f58, %f57; st.global.f32 [%rd53], %f59; mul.f32 %f60, %f41, %f45; mul.f32 %f61, %f72, %f44; fma.rn.f32 %f62, %f71, %f42, %f61; mul.f32 %f63, %f48, %f62; fma.rn.f32 %f64, %f69, %f60, %f63; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f65, 
[%rd55]; sub.f32 %f66, %f65, %f64; st.global.f32 [%rd55], %f66; BB0_22: ret; } ` addmagnetoelasticfield_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, .param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<77>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, [addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, [addmagnetoelasticfield_param_6]; ld.param.f32 %f67, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f68, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, 
[addmagnetoelasticfield_param_10]; ld.param.f32 %f69, [addmagnetoelasticfield_param_11]; ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f70, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f71, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f72, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f75, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f76, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f73, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f67, %f30, %f67; BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f68, %f31, %f68; BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f32, [%rd24]; mul.f32 %f69, %f32, %f69; BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f70, %f33, %f70; BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f71, %f34, %f71; 
BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f72, %f35, %f72; BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f73, %f36, %f73; BB0_15: setp.eq.f32 %p9, %f73, 0f00000000; mov.f32 %f74, 0f00000000; @%p9 bra BB0_17; rcp.rn.f32 %f74, %f73; BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f75, %f38, %f75; BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f76, %f39, %f76; BB0_21: cvta.to.global.u64 %rd43, %rd4; mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; mul.f32 %f40, %f74, %f75; fma.rn.f32 %f41, %f74, %f75, %f40; ld.global.nc.f32 %f42, [%rd45]; mul.f32 %f43, %f41, %f42; ld.global.nc.f32 %f44, [%rd47]; ld.global.nc.f32 %f45, [%rd49]; mul.f32 %f46, %f71, %f45; fma.rn.f32 %f47, %f70, %f44, %f46; mul.f32 %f48, %f74, %f76; mul.f32 %f49, %f48, %f47; fma.rn.f32 %f50, %f67, %f43, %f49; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, %rd44; ld.global.f32 %f51, [%rd51]; sub.f32 %f52, %f51, %f50; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f41, %f44; mul.f32 %f54, %f72, %f45; fma.rn.f32 %f55, %f70, %f42, %f54; mul.f32 %f56, %f48, %f55; fma.rn.f32 %f57, %f68, %f53, %f56; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f58, [%rd53]; sub.f32 %f59, %f58, %f57; st.global.f32 [%rd53], %f59; mul.f32 %f60, %f41, %f45; mul.f32 %f61, %f72, %f44; fma.rn.f32 %f62, %f71, %f42, %f61; mul.f32 
%f63, %f48, %f62; fma.rn.f32 %f64, %f69, %f60, %f63; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f65, [%rd55]; sub.f32 %f66, %f65, %f64; st.global.f32 [%rd55], %f66; BB0_22: ret; } ` addmagnetoelasticfield_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, .param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<77>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, [addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, [addmagnetoelasticfield_param_6]; ld.param.f32 %f67, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, 
[addmagnetoelasticfield_param_8]; ld.param.f32 %f68, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; ld.param.f32 %f69, [addmagnetoelasticfield_param_11]; ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f70, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f71, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f72, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f75, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f76, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f73, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f67, %f30, %f67; BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f68, %f31, %f68; BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f32, [%rd24]; mul.f32 %f69, %f32, %f69; BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f70, %f33, %f70; BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 
%rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f71, %f34, %f71; BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f72, %f35, %f72; BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f73, %f36, %f73; BB0_15: setp.eq.f32 %p9, %f73, 0f00000000; mov.f32 %f74, 0f00000000; @%p9 bra BB0_17; rcp.rn.f32 %f74, %f73; BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f75, %f38, %f75; BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f76, %f39, %f76; BB0_21: cvta.to.global.u64 %rd43, %rd4; mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; mul.f32 %f40, %f74, %f75; fma.rn.f32 %f41, %f74, %f75, %f40; ld.global.nc.f32 %f42, [%rd45]; mul.f32 %f43, %f41, %f42; ld.global.nc.f32 %f44, [%rd47]; ld.global.nc.f32 %f45, [%rd49]; mul.f32 %f46, %f71, %f45; fma.rn.f32 %f47, %f70, %f44, %f46; mul.f32 %f48, %f74, %f76; mul.f32 %f49, %f48, %f47; fma.rn.f32 %f50, %f67, %f43, %f49; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, %rd44; ld.global.f32 %f51, [%rd51]; sub.f32 %f52, %f51, %f50; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f41, %f44; mul.f32 %f54, %f72, %f45; fma.rn.f32 %f55, %f70, %f42, %f54; mul.f32 %f56, %f48, %f55; fma.rn.f32 %f57, %f68, %f53, %f56; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f58, [%rd53]; sub.f32 %f59, %f58, %f57; st.global.f32 [%rd53], 
%f59; mul.f32 %f60, %f41, %f45; mul.f32 %f61, %f72, %f44; fma.rn.f32 %f62, %f71, %f42, %f61; mul.f32 %f63, %f48, %f62; fma.rn.f32 %f64, %f69, %f60, %f63; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f65, [%rd55]; sub.f32 %f66, %f65, %f64; st.global.f32 [%rd55], %f66; BB0_22: ret; } ` addmagnetoelasticfield_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, .param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<77>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, [addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, 
[addmagnetoelasticfield_param_6]; ld.param.f32 %f67, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f68, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; ld.param.f32 %f69, [addmagnetoelasticfield_param_11]; ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f70, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f71, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f72, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f75, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f76, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f73, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f67, %f30, %f67; BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f68, %f31, %f68; BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f32, [%rd24]; mul.f32 %f69, %f32, %f69; BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f70, %f33, 
%f70; BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f71, %f34, %f71; BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f72, %f35, %f72; BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f73, %f36, %f73; BB0_15: setp.eq.f32 %p9, %f73, 0f00000000; mov.f32 %f74, 0f00000000; @%p9 bra BB0_17; rcp.rn.f32 %f74, %f73; BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f75, %f38, %f75; BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f76, %f39, %f76; BB0_21: cvta.to.global.u64 %rd43, %rd4; mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; mul.f32 %f40, %f74, %f75; fma.rn.f32 %f41, %f74, %f75, %f40; ld.global.nc.f32 %f42, [%rd45]; mul.f32 %f43, %f41, %f42; ld.global.nc.f32 %f44, [%rd47]; ld.global.nc.f32 %f45, [%rd49]; mul.f32 %f46, %f71, %f45; fma.rn.f32 %f47, %f70, %f44, %f46; mul.f32 %f48, %f74, %f76; mul.f32 %f49, %f48, %f47; fma.rn.f32 %f50, %f67, %f43, %f49; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, %rd44; ld.global.f32 %f51, [%rd51]; sub.f32 %f52, %f51, %f50; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f41, %f44; mul.f32 %f54, %f72, %f45; fma.rn.f32 %f55, %f70, %f42, %f54; mul.f32 %f56, %f48, %f55; fma.rn.f32 %f57, %f68, %f53, %f56; cvta.to.global.u64 %rd52, %rd2; 
add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f58, [%rd53]; sub.f32 %f59, %f58, %f57; st.global.f32 [%rd53], %f59; mul.f32 %f60, %f41, %f45; mul.f32 %f61, %f72, %f44; fma.rn.f32 %f62, %f71, %f42, %f61; mul.f32 %f63, %f48, %f62; fma.rn.f32 %f64, %f69, %f60, %f63; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f65, [%rd55]; sub.f32 %f66, %f65, %f64; st.global.f32 [%rd55], %f66; BB0_22: ret; } ` addmagnetoelasticfield_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, .param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<77>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, [addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, 
[addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, [addmagnetoelasticfield_param_6]; ld.param.f32 %f67, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f68, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; ld.param.f32 %f69, [addmagnetoelasticfield_param_11]; ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f70, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f71, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f72, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f75, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f76, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f73, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f67, %f30, %f67; BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f68, %f31, %f68; BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f32, [%rd24]; mul.f32 %f69, %f32, %f69; BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd25, %rd10; 
mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f70, %f33, %f70; BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f71, %f34, %f71; BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f72, %f35, %f72; BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f73, %f36, %f73; BB0_15: setp.eq.f32 %p9, %f73, 0f00000000; mov.f32 %f74, 0f00000000; @%p9 bra BB0_17; rcp.rn.f32 %f74, %f73; BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f75, %f38, %f75; BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f76, %f39, %f76; BB0_21: cvta.to.global.u64 %rd43, %rd4; mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; mul.f32 %f40, %f74, %f75; fma.rn.f32 %f41, %f74, %f75, %f40; ld.global.nc.f32 %f42, [%rd45]; mul.f32 %f43, %f41, %f42; ld.global.nc.f32 %f44, [%rd47]; ld.global.nc.f32 %f45, [%rd49]; mul.f32 %f46, %f71, %f45; fma.rn.f32 %f47, %f70, %f44, %f46; mul.f32 %f48, %f74, %f76; mul.f32 %f49, %f48, %f47; fma.rn.f32 %f50, %f67, %f43, %f49; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, %rd44; ld.global.f32 %f51, [%rd51]; sub.f32 %f52, %f51, %f50; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f41, %f44; mul.f32 %f54, %f72, %f45; fma.rn.f32 %f55, %f70, 
%f42, %f54; mul.f32 %f56, %f48, %f55; fma.rn.f32 %f57, %f68, %f53, %f56; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f58, [%rd53]; sub.f32 %f59, %f58, %f57; st.global.f32 [%rd53], %f59; mul.f32 %f60, %f41, %f45; mul.f32 %f61, %f72, %f44; fma.rn.f32 %f62, %f71, %f42, %f61; mul.f32 %f63, %f48, %f62; fma.rn.f32 %f64, %f69, %f60, %f63; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f65, [%rd55]; sub.f32 %f66, %f65, %f64; st.global.f32 [%rd55], %f66; BB0_22: ret; } ` addmagnetoelasticfield_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, .param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<77>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, 
[addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, [addmagnetoelasticfield_param_6]; ld.param.f32 %f67, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f68, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; ld.param.f32 %f69, [addmagnetoelasticfield_param_11]; ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f70, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f71, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f72, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f75, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f76, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f73, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f67, %f30, %f67; BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f68, %f31, %f68; BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f32, [%rd24]; 
mul.f32 %f69, %f32, %f69; BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f70, %f33, %f70; BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f71, %f34, %f71; BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f72, %f35, %f72; BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f73, %f36, %f73; BB0_15: setp.eq.f32 %p9, %f73, 0f00000000; mov.f32 %f74, 0f00000000; @%p9 bra BB0_17; rcp.rn.f32 %f74, %f73; BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f75, %f38, %f75; BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f76, %f39, %f76; BB0_21: cvta.to.global.u64 %rd43, %rd4; mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; mul.f32 %f40, %f74, %f75; fma.rn.f32 %f41, %f74, %f75, %f40; ld.global.nc.f32 %f42, [%rd45]; mul.f32 %f43, %f41, %f42; ld.global.nc.f32 %f44, [%rd47]; ld.global.nc.f32 %f45, [%rd49]; mul.f32 %f46, %f71, %f45; fma.rn.f32 %f47, %f70, %f44, %f46; mul.f32 %f48, %f74, %f76; mul.f32 %f49, %f48, %f47; fma.rn.f32 %f50, %f67, %f43, %f49; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, %rd44; ld.global.f32 %f51, [%rd51]; sub.f32 %f52, %f51, 
%f50; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f41, %f44; mul.f32 %f54, %f72, %f45; fma.rn.f32 %f55, %f70, %f42, %f54; mul.f32 %f56, %f48, %f55; fma.rn.f32 %f57, %f68, %f53, %f56; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f58, [%rd53]; sub.f32 %f59, %f58, %f57; st.global.f32 [%rd53], %f59; mul.f32 %f60, %f41, %f45; mul.f32 %f61, %f72, %f44; fma.rn.f32 %f62, %f71, %f42, %f61; mul.f32 %f63, %f48, %f62; fma.rn.f32 %f64, %f69, %f60, %f63; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f65, [%rd55]; sub.f32 %f66, %f65, %f64; st.global.f32 [%rd55], %f66; BB0_22: ret; } ` addmagnetoelasticfield_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, .param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<77>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, 
[addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, [addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, [addmagnetoelasticfield_param_6]; ld.param.f32 %f67, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f68, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; ld.param.f32 %f69, [addmagnetoelasticfield_param_11]; ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f70, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f71, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f72, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f75, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f76, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f73, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f67, %f30, %f67; BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f68, %f31, %f68; BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra BB0_7; cvta.to.global.u64 
%rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f32, [%rd24]; mul.f32 %f69, %f32, %f69; BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f70, %f33, %f70; BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f71, %f34, %f71; BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f72, %f35, %f72; BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f73, %f36, %f73; BB0_15: setp.eq.f32 %p9, %f73, 0f00000000; mov.f32 %f74, 0f00000000; @%p9 bra BB0_17; rcp.rn.f32 %f74, %f73; BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f75, %f38, %f75; BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f76, %f39, %f76; BB0_21: cvta.to.global.u64 %rd43, %rd4; mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; mul.f32 %f40, %f74, %f75; fma.rn.f32 %f41, %f74, %f75, %f40; ld.global.nc.f32 %f42, [%rd45]; mul.f32 %f43, %f41, %f42; ld.global.nc.f32 %f44, [%rd47]; ld.global.nc.f32 %f45, [%rd49]; mul.f32 %f46, %f71, %f45; fma.rn.f32 %f47, %f70, %f44, %f46; mul.f32 %f48, %f74, %f76; mul.f32 %f49, %f48, %f47; fma.rn.f32 %f50, %f67, %f43, %f49; 
cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, %rd44; ld.global.f32 %f51, [%rd51]; sub.f32 %f52, %f51, %f50; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f41, %f44; mul.f32 %f54, %f72, %f45; fma.rn.f32 %f55, %f70, %f42, %f54; mul.f32 %f56, %f48, %f55; fma.rn.f32 %f57, %f68, %f53, %f56; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f58, [%rd53]; sub.f32 %f59, %f58, %f57; st.global.f32 [%rd53], %f59; mul.f32 %f60, %f41, %f45; mul.f32 %f61, %f72, %f44; fma.rn.f32 %f62, %f71, %f42, %f61; mul.f32 %f63, %f48, %f62; fma.rn.f32 %f64, %f69, %f60, %f63; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f65, [%rd55]; sub.f32 %f66, %f65, %f64; st.global.f32 [%rd55], %f66; BB0_22: ret; } ` addmagnetoelasticfield_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, .param .u32 
addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<77>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, [addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, [addmagnetoelasticfield_param_6]; ld.param.f32 %f67, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f68, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; ld.param.f32 %f69, [addmagnetoelasticfield_param_11]; ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f70, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f71, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f72, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f75, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f76, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f73, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f67, %f30, %f67; BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; 
ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f68, %f31, %f68; BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f32, [%rd24]; mul.f32 %f69, %f32, %f69; BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f70, %f33, %f70; BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f71, %f34, %f71; BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f72, %f35, %f72; BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f73, %f36, %f73; BB0_15: setp.eq.f32 %p9, %f73, 0f00000000; mov.f32 %f74, 0f00000000; @%p9 bra BB0_17; rcp.rn.f32 %f74, %f73; BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f75, %f38, %f75; BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f76, %f39, %f76; BB0_21: cvta.to.global.u64 %rd43, %rd4; mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; mul.f32 %f40, %f74, %f75; fma.rn.f32 %f41, %f74, %f75, %f40; ld.global.nc.f32 %f42, [%rd45]; mul.f32 %f43, %f41, %f42; ld.global.nc.f32 %f44, [%rd47]; ld.global.nc.f32 %f45, [%rd49]; mul.f32 %f46, %f71, %f45; 
fma.rn.f32 %f47, %f70, %f44, %f46; mul.f32 %f48, %f74, %f76; mul.f32 %f49, %f48, %f47; fma.rn.f32 %f50, %f67, %f43, %f49; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, %rd44; ld.global.f32 %f51, [%rd51]; sub.f32 %f52, %f51, %f50; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f41, %f44; mul.f32 %f54, %f72, %f45; fma.rn.f32 %f55, %f70, %f42, %f54; mul.f32 %f56, %f48, %f55; fma.rn.f32 %f57, %f68, %f53, %f56; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f58, [%rd53]; sub.f32 %f59, %f58, %f57; st.global.f32 [%rd53], %f59; mul.f32 %f60, %f41, %f45; mul.f32 %f61, %f72, %f44; fma.rn.f32 %f62, %f71, %f42, %f61; mul.f32 %f63, %f48, %f62; fma.rn.f32 %f64, %f69, %f60, %f63; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f65, [%rd55]; sub.f32 %f66, %f65, %f64; st.global.f32 [%rd55], %f66; BB0_22: ret; } ` addmagnetoelasticfield_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 
addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, .param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<77>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, [addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, [addmagnetoelasticfield_param_6]; ld.param.f32 %f67, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f68, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; ld.param.f32 %f69, [addmagnetoelasticfield_param_11]; ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f70, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f71, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f72, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f75, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f76, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f73, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f67, %f30, %f67; 
BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f68, %f31, %f68; BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f32, [%rd24]; mul.f32 %f69, %f32, %f69; BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f70, %f33, %f70; BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f71, %f34, %f71; BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f72, %f35, %f72; BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f73, %f36, %f73; BB0_15: setp.eq.f32 %p9, %f73, 0f00000000; mov.f32 %f74, 0f00000000; @%p9 bra BB0_17; rcp.rn.f32 %f74, %f73; BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f75, %f38, %f75; BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f76, %f39, %f76; BB0_21: cvta.to.global.u64 %rd43, %rd4; mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; mul.f32 %f40, %f74, %f75; fma.rn.f32 %f41, %f74, %f75, %f40; ld.global.nc.f32 
%f42, [%rd45]; mul.f32 %f43, %f41, %f42; ld.global.nc.f32 %f44, [%rd47]; ld.global.nc.f32 %f45, [%rd49]; mul.f32 %f46, %f71, %f45; fma.rn.f32 %f47, %f70, %f44, %f46; mul.f32 %f48, %f74, %f76; mul.f32 %f49, %f48, %f47; fma.rn.f32 %f50, %f67, %f43, %f49; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, %rd44; ld.global.f32 %f51, [%rd51]; sub.f32 %f52, %f51, %f50; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f41, %f44; mul.f32 %f54, %f72, %f45; fma.rn.f32 %f55, %f70, %f42, %f54; mul.f32 %f56, %f48, %f55; fma.rn.f32 %f57, %f68, %f53, %f56; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f58, [%rd53]; sub.f32 %f59, %f58, %f57; st.global.f32 [%rd53], %f59; mul.f32 %f60, %f41, %f45; mul.f32 %f61, %f72, %f44; fma.rn.f32 %f62, %f71, %f42, %f61; mul.f32 %f63, %f48, %f62; fma.rn.f32 %f64, %f69, %f60, %f63; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f65, [%rd55]; sub.f32 %f66, %f65, %f64; st.global.f32 [%rd55], %f66; BB0_22: ret; } ` addmagnetoelasticfield_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 addmagnetoelasticfield_param_18, 
.param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, .param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<77>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, [addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, [addmagnetoelasticfield_param_6]; ld.param.f32 %f67, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f68, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; ld.param.f32 %f69, [addmagnetoelasticfield_param_11]; ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f70, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f71, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f72, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f75, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f76, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f73, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 
%rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f67, %f30, %f67; BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f68, %f31, %f68; BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f32, [%rd24]; mul.f32 %f69, %f32, %f69; BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f70, %f33, %f70; BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f71, %f34, %f71; BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f72, %f35, %f72; BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f73, %f36, %f73; BB0_15: setp.eq.f32 %p9, %f73, 0f00000000; mov.f32 %f74, 0f00000000; @%p9 bra BB0_17; rcp.rn.f32 %f74, %f73; BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f75, %f38, %f75; BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f76, %f39, %f76; BB0_21: cvta.to.global.u64 %rd43, %rd4; mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; 
add.s64 %rd49, %rd48, %rd44; mul.f32 %f40, %f74, %f75; fma.rn.f32 %f41, %f74, %f75, %f40; ld.global.nc.f32 %f42, [%rd45]; mul.f32 %f43, %f41, %f42; ld.global.nc.f32 %f44, [%rd47]; ld.global.nc.f32 %f45, [%rd49]; mul.f32 %f46, %f71, %f45; fma.rn.f32 %f47, %f70, %f44, %f46; mul.f32 %f48, %f74, %f76; mul.f32 %f49, %f48, %f47; fma.rn.f32 %f50, %f67, %f43, %f49; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, %rd44; ld.global.f32 %f51, [%rd51]; sub.f32 %f52, %f51, %f50; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f41, %f44; mul.f32 %f54, %f72, %f45; fma.rn.f32 %f55, %f70, %f42, %f54; mul.f32 %f56, %f48, %f55; fma.rn.f32 %f57, %f68, %f53, %f56; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f58, [%rd53]; sub.f32 %f59, %f58, %f57; st.global.f32 [%rd53], %f59; mul.f32 %f60, %f41, %f45; mul.f32 %f61, %f72, %f44; fma.rn.f32 %f62, %f71, %f42, %f61; mul.f32 %f63, %f48, %f62; fma.rn.f32 %f64, %f69, %f60, %f63; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f65, [%rd55]; sub.f32 %f66, %f65, %f64; st.global.f32 [%rd55], %f66; BB0_22: ret; } ` addmagnetoelasticfield_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 
addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, .param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<77>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, [addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, [addmagnetoelasticfield_param_6]; ld.param.f32 %f67, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f68, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; ld.param.f32 %f69, [addmagnetoelasticfield_param_11]; ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f70, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f71, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f72, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f75, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f76, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f73, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; 
setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f67, %f30, %f67; BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f68, %f31, %f68; BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f32, [%rd24]; mul.f32 %f69, %f32, %f69; BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f70, %f33, %f70; BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f71, %f34, %f71; BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f72, %f35, %f72; BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f73, %f36, %f73; BB0_15: setp.eq.f32 %p9, %f73, 0f00000000; mov.f32 %f74, 0f00000000; @%p9 bra BB0_17; rcp.rn.f32 %f74, %f73; BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f75, %f38, %f75; BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f76, %f39, %f76; BB0_21: cvta.to.global.u64 %rd43, %rd4; mul.wide.s32 %rd44, %r1, 
4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; mul.f32 %f40, %f74, %f75; fma.rn.f32 %f41, %f74, %f75, %f40; ld.global.nc.f32 %f42, [%rd45]; mul.f32 %f43, %f41, %f42; ld.global.nc.f32 %f44, [%rd47]; ld.global.nc.f32 %f45, [%rd49]; mul.f32 %f46, %f71, %f45; fma.rn.f32 %f47, %f70, %f44, %f46; mul.f32 %f48, %f74, %f76; mul.f32 %f49, %f48, %f47; fma.rn.f32 %f50, %f67, %f43, %f49; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, %rd44; ld.global.f32 %f51, [%rd51]; sub.f32 %f52, %f51, %f50; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f41, %f44; mul.f32 %f54, %f72, %f45; fma.rn.f32 %f55, %f70, %f42, %f54; mul.f32 %f56, %f48, %f55; fma.rn.f32 %f57, %f68, %f53, %f56; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f58, [%rd53]; sub.f32 %f59, %f58, %f57; st.global.f32 [%rd53], %f59; mul.f32 %f60, %f41, %f45; mul.f32 %f61, %f72, %f44; fma.rn.f32 %f62, %f71, %f42, %f61; mul.f32 %f63, %f48, %f62; fma.rn.f32 %f64, %f69, %f60, %f63; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f65, [%rd55]; sub.f32 %f66, %f65, %f64; st.global.f32 [%rd55], %f66; BB0_22: ret; } ` addmagnetoelasticfield_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, .param .f32 addmagnetoelasticfield_param_11, .param .u64 addmagnetoelasticfield_param_12, .param .f32 
addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, .param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<77>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, [addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, [addmagnetoelasticfield_param_6]; ld.param.f32 %f67, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f68, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; ld.param.f32 %f69, [addmagnetoelasticfield_param_11]; ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f70, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f71, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f72, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f75, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f76, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f73, [addmagnetoelasticfield_param_23]; ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; 
mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f67, %f30, %f67; BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f68, %f31, %f68; BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f32, [%rd24]; mul.f32 %f69, %f32, %f69; BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f70, %f33, %f70; BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f71, %f34, %f71; BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f72, %f35, %f72; BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f73, %f36, %f73; BB0_15: setp.eq.f32 %p9, %f73, 0f00000000; mov.f32 %f74, 0f00000000; @%p9 bra BB0_17; rcp.rn.f32 %f74, %f73; BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f75, %f38, %f75; BB0_19: setp.eq.s64 %p11, %rd14, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, 
%rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f76, %f39, %f76; BB0_21: cvta.to.global.u64 %rd43, %rd4; mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; mul.f32 %f40, %f74, %f75; fma.rn.f32 %f41, %f74, %f75, %f40; ld.global.nc.f32 %f42, [%rd45]; mul.f32 %f43, %f41, %f42; ld.global.nc.f32 %f44, [%rd47]; ld.global.nc.f32 %f45, [%rd49]; mul.f32 %f46, %f71, %f45; fma.rn.f32 %f47, %f70, %f44, %f46; mul.f32 %f48, %f74, %f76; mul.f32 %f49, %f48, %f47; fma.rn.f32 %f50, %f67, %f43, %f49; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, %rd44; ld.global.f32 %f51, [%rd51]; sub.f32 %f52, %f51, %f50; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f41, %f44; mul.f32 %f54, %f72, %f45; fma.rn.f32 %f55, %f70, %f42, %f54; mul.f32 %f56, %f48, %f55; fma.rn.f32 %f57, %f68, %f53, %f56; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f58, [%rd53]; sub.f32 %f59, %f58, %f57; st.global.f32 [%rd53], %f59; mul.f32 %f60, %f41, %f45; mul.f32 %f61, %f72, %f44; fma.rn.f32 %f62, %f71, %f42, %f61; mul.f32 %f63, %f48, %f62; fma.rn.f32 %f64, %f69, %f60, %f63; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f65, [%rd55]; sub.f32 %f66, %f65, %f64; st.global.f32 [%rd55], %f66; BB0_22: ret; } ` addmagnetoelasticfield_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl addmagnetoelasticfield .visible .entry addmagnetoelasticfield( .param .u64 addmagnetoelasticfield_param_0, .param .u64 addmagnetoelasticfield_param_1, .param .u64 addmagnetoelasticfield_param_2, .param .u64 addmagnetoelasticfield_param_3, .param .u64 addmagnetoelasticfield_param_4, .param .u64 addmagnetoelasticfield_param_5, .param .u64 addmagnetoelasticfield_param_6, .param .f32 addmagnetoelasticfield_param_7, .param .u64 addmagnetoelasticfield_param_8, .param .f32 addmagnetoelasticfield_param_9, .param .u64 addmagnetoelasticfield_param_10, 
.param .f32 addmagnetoelasticfield_param_11, .param .u64 addmagnetoelasticfield_param_12, .param .f32 addmagnetoelasticfield_param_13, .param .u64 addmagnetoelasticfield_param_14, .param .f32 addmagnetoelasticfield_param_15, .param .u64 addmagnetoelasticfield_param_16, .param .f32 addmagnetoelasticfield_param_17, .param .u64 addmagnetoelasticfield_param_18, .param .f32 addmagnetoelasticfield_param_19, .param .u64 addmagnetoelasticfield_param_20, .param .f32 addmagnetoelasticfield_param_21, .param .u64 addmagnetoelasticfield_param_22, .param .f32 addmagnetoelasticfield_param_23, .param .u32 addmagnetoelasticfield_param_24 ) { .reg .pred %p<12>; .reg .f32 %f<77>; .reg .b32 %r<9>; .reg .b64 %rd<56>; ld.param.u64 %rd1, [addmagnetoelasticfield_param_0]; ld.param.u64 %rd2, [addmagnetoelasticfield_param_1]; ld.param.u64 %rd3, [addmagnetoelasticfield_param_2]; ld.param.u64 %rd4, [addmagnetoelasticfield_param_3]; ld.param.u64 %rd5, [addmagnetoelasticfield_param_4]; ld.param.u64 %rd6, [addmagnetoelasticfield_param_5]; ld.param.u64 %rd7, [addmagnetoelasticfield_param_6]; ld.param.f32 %f67, [addmagnetoelasticfield_param_7]; ld.param.u64 %rd8, [addmagnetoelasticfield_param_8]; ld.param.f32 %f68, [addmagnetoelasticfield_param_9]; ld.param.u64 %rd9, [addmagnetoelasticfield_param_10]; ld.param.f32 %f69, [addmagnetoelasticfield_param_11]; ld.param.u64 %rd10, [addmagnetoelasticfield_param_12]; ld.param.f32 %f70, [addmagnetoelasticfield_param_13]; ld.param.u64 %rd11, [addmagnetoelasticfield_param_14]; ld.param.f32 %f71, [addmagnetoelasticfield_param_15]; ld.param.u64 %rd12, [addmagnetoelasticfield_param_16]; ld.param.f32 %f72, [addmagnetoelasticfield_param_17]; ld.param.u64 %rd13, [addmagnetoelasticfield_param_18]; ld.param.f32 %f75, [addmagnetoelasticfield_param_19]; ld.param.u64 %rd14, [addmagnetoelasticfield_param_20]; ld.param.f32 %f76, [addmagnetoelasticfield_param_21]; ld.param.u64 %rd15, [addmagnetoelasticfield_param_22]; ld.param.f32 %f73, [addmagnetoelasticfield_param_23]; 
ld.param.u32 %r2, [addmagnetoelasticfield_param_24]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_22; setp.eq.s64 %p2, %rd7, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd16, %rd7; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f30, [%rd18]; mul.f32 %f67, %f30, %f67; BB0_3: setp.eq.s64 %p3, %rd8, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd19, %rd8; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f31, [%rd21]; mul.f32 %f68, %f31, %f68; BB0_5: setp.eq.s64 %p4, %rd9, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd22, %rd9; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f32, [%rd24]; mul.f32 %f69, %f32, %f69; BB0_7: setp.eq.s64 %p5, %rd10, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd25, %rd10; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f33, [%rd27]; mul.f32 %f70, %f33, %f70; BB0_9: setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd28, %rd11; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f34, [%rd30]; mul.f32 %f71, %f34, %f71; BB0_11: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd31, %rd12; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; ld.global.nc.f32 %f35, [%rd33]; mul.f32 %f72, %f35, %f72; BB0_13: setp.eq.s64 %p8, %rd15, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd34, %rd15; mul.wide.s32 %rd35, %r1, 4; add.s64 %rd36, %rd34, %rd35; ld.global.nc.f32 %f36, [%rd36]; mul.f32 %f73, %f36, %f73; BB0_15: setp.eq.f32 %p9, %f73, 0f00000000; mov.f32 %f74, 0f00000000; @%p9 bra BB0_17; rcp.rn.f32 %f74, %f73; BB0_17: setp.eq.s64 %p10, %rd13, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd37, %rd13; mul.wide.s32 %rd38, %r1, 4; add.s64 %rd39, %rd37, %rd38; ld.global.nc.f32 %f38, [%rd39]; mul.f32 %f75, %f38, %f75; BB0_19: setp.eq.s64 %p11, %rd14, 0; 
@%p11 bra BB0_21; cvta.to.global.u64 %rd40, %rd14; mul.wide.s32 %rd41, %r1, 4; add.s64 %rd42, %rd40, %rd41; ld.global.nc.f32 %f39, [%rd42]; mul.f32 %f76, %f39, %f76; BB0_21: cvta.to.global.u64 %rd43, %rd4; mul.wide.s32 %rd44, %r1, 4; add.s64 %rd45, %rd43, %rd44; cvta.to.global.u64 %rd46, %rd5; add.s64 %rd47, %rd46, %rd44; cvta.to.global.u64 %rd48, %rd6; add.s64 %rd49, %rd48, %rd44; mul.f32 %f40, %f74, %f75; fma.rn.f32 %f41, %f74, %f75, %f40; ld.global.nc.f32 %f42, [%rd45]; mul.f32 %f43, %f41, %f42; ld.global.nc.f32 %f44, [%rd47]; ld.global.nc.f32 %f45, [%rd49]; mul.f32 %f46, %f71, %f45; fma.rn.f32 %f47, %f70, %f44, %f46; mul.f32 %f48, %f74, %f76; mul.f32 %f49, %f48, %f47; fma.rn.f32 %f50, %f67, %f43, %f49; cvta.to.global.u64 %rd50, %rd1; add.s64 %rd51, %rd50, %rd44; ld.global.f32 %f51, [%rd51]; sub.f32 %f52, %f51, %f50; st.global.f32 [%rd51], %f52; mul.f32 %f53, %f41, %f44; mul.f32 %f54, %f72, %f45; fma.rn.f32 %f55, %f70, %f42, %f54; mul.f32 %f56, %f48, %f55; fma.rn.f32 %f57, %f68, %f53, %f56; cvta.to.global.u64 %rd52, %rd2; add.s64 %rd53, %rd52, %rd44; ld.global.f32 %f58, [%rd53]; sub.f32 %f59, %f58, %f57; st.global.f32 [%rd53], %f59; mul.f32 %f60, %f41, %f45; mul.f32 %f61, %f72, %f44; fma.rn.f32 %f62, %f71, %f42, %f61; mul.f32 %f63, %f48, %f62; fma.rn.f32 %f64, %f69, %f60, %f63; cvta.to.global.u64 %rd54, %rd3; add.s64 %rd55, %rd54, %rd44; ld.global.f32 %f65, [%rd55]; sub.f32 %f66, %f65, %f64; st.global.f32 [%rd55], %f66; BB0_22: ret; } ` ) mumax3-3.10/cuda/magnetoelasticforce.cu000066400000000000000000000174051371432437400201510ustar00rootroot00000000000000#include #include #include "amul.h" #include "float3.h" #include "stencil.h" // Calculate magneto-elastic force density // fmelp = Σ ∂σpq / ∂xq (q = x, y, z) , σpq = ∂Umel / ∂epq, // where epq is the strain tensor and // Umel is the magneto-elastic energy density given by the eq. 
// (12.18) of Gurevich&Melkov "Magnetization Oscillations and Waves", CRC Press, 1996

// Finite-difference derivative of m along one axis, in units of 1/cell
// (the caller scales the result by rcsx/rcsy/rcsz).
//
// m_m2, m_m1, m0, m_p1, m_p2 are the magnetization at offsets -2..+2 along
// the axis. A neighbor for which is0() holds is treated as absent; the caller
// leaves out-of-grid neighbors at zero, so they fall in the same category.
// The scheme is picked from the pattern of available neighbors
// ("1" = present, "-" = absent, centre cell always present):
//
//   --1--                 zero
//   -111-, 1111-, -1111   central difference,  ε ~ h^2
//   -11--                 backward difference, ε ~ h^1
//   --11-                 forward difference,  ε ~ h^1
//   111--                 backward difference, ε ~ h^2
//   --111                 forward difference,  ε ~ h^2
//   11111                 central difference,  ε ~ h^4
//
// The tests are evaluated in exactly this order; later cases rely on the
// earlier ones having been ruled out.
// NOTE(review): assumes is0() from float3.h has no side effects, so using the
// short-circuiting || instead of the former bitwise | is equivalent — confirm.
static __device__ inline float3
mel_stencil_derivative(float3 m_m2, float3 m_m1, float3 m0, float3 m_p1, float3 m_p2)
{
    if (is0(m_p1) && is0(m_m1))
    {
        return make_float3(0.0f, 0.0f, 0.0f);                          // --1-- zero
    }
    if ((is0(m_m2) || is0(m_p2)) && !is0(m_p1) && !is0(m_m1))
    {
        return 0.5f * (m_p1 - m_m1);                                   // -111-, 1111-, -1111 central difference, ε ~ h^2
    }
    if (is0(m_p1) && is0(m_m2))
    {
        return m0 - m_m1;                                              // -11-- backward difference, ε ~ h^1
    }
    if (is0(m_m1) && is0(m_p2))
    {
        return -m0 + m_p1;                                             // --11- forward difference, ε ~ h^1
    }
    if (!is0(m_m2) && is0(m_p1))
    {
        return 0.5f * m_m2 - 2.0f * m_m1 + 1.5f * m0;                  // 111-- backward difference, ε ~ h^2
    }
    if (!is0(m_p2) && is0(m_m1))
    {
        return -0.5f * m_p2 + 2.0f * m_p1 - 1.5f * m0;                 // --111 forward difference, ε ~ h^2
    }
    return (2.0f/3.0f)*(m_p1 - m_m1) + (1.0f/12.0f)*(m_m2 - m_p2);     // 11111 central difference, ε ~ h^4
}

// Magneto-elastic force density kernel: one thread per cell (ix, iy, iz).
//
//   fx, fy, fz        output force components, overwritten at the cell index
//   mx, my, mz        magnetization components (read only)
//   B1_, B1_mul       first magneto-elastic coupling, combined through amul()
//   B2_, B2_mul       second magneto-elastic coupling, combined through amul()
//   rcsx, rcsy, rcsz  factors applied to the x/y/z derivatives
//                     (presumably reciprocal cell sizes — confirm with caller)
//   Nx, Ny, Nz        grid size; threads outside the grid return immediately
//   PBC               periodic-boundary flags, consumed by the PBCx/PBCy/PBCz
//                     and lclamp*/hclamp* macros from stencil.h
//
// Neighbors outside a non-periodic grid are left at zero and therefore
// handled like empty cells by the derivative stencil.
extern "C" __global__ void
getmagnetoelasticforce(float* __restrict__ fx, float* __restrict__ fy, float* __restrict__ fz,
                       float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz,
                       float* __restrict__ B1_, float B1_mul,
                       float* __restrict__ B2_, float B2_mul,
                       float rcsx, float rcsy, float rcsz,
                       int Nx, int Ny, int Nz,
                       uint8_t PBC)
{
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    int iz = blockIdx.z * blockDim.z + threadIdx.z;

    if (ix >= Nx || iy >= Ny || iz >= Nz)
    {
        return;
    }

    int I = idx(ix, iy, iz);                      // central cell index
    float3 m0 = make_float3(mx[I], my[I], mz[I]); // +0
    float3 dmdx = make_float3(0.0f, 0.0f, 0.0f);  // ∂m/∂x
    float3 dmdy = make_float3(0.0f, 0.0f, 0.0f);  // ∂m/∂y
    float3 dmdz = make_float3(0.0f, 0.0f, 0.0f);  // ∂m/∂z
    int i_;                                       // neighbor index

    // ∂m/∂x
    {
        // load each neighbor m if inside grid (or periodic), keep 0 otherwise
        float3 m_m2 = make_float3(0.0f, 0.0f, 0.0f);     // -2
        i_ = idx(lclampx(ix-2), iy, iz);
        if (ix-2 >= 0 || PBCx)
        {
            m_m2 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        float3 m_m1 = make_float3(0.0f, 0.0f, 0.0f);     // -1
        i_ = idx(lclampx(ix-1), iy, iz);
        if (ix-1 >= 0 || PBCx)
        {
            m_m1 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        float3 m_p1 = make_float3(0.0f, 0.0f, 0.0f);     // +1
        i_ = idx(hclampx(ix+1), iy, iz);
        if (ix+1 < Nx || PBCx)
        {
            m_p1 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        float3 m_p2 = make_float3(0.0f, 0.0f, 0.0f);     // +2
        i_ = idx(hclampx(ix+2), iy, iz);
        if (ix+2 < Nx || PBCx)
        {
            m_p2 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        dmdx = mel_stencil_derivative(m_m2, m_m1, m0, m_p1, m_p2);
    }

    // ∂m/∂y
    {
        float3 m_m2 = make_float3(0.0f, 0.0f, 0.0f);     // -2
        i_ = idx(ix, lclampy(iy-2), iz);
        if (iy-2 >= 0 || PBCy)
        {
            m_m2 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        float3 m_m1 = make_float3(0.0f, 0.0f, 0.0f);     // -1
        i_ = idx(ix, lclampy(iy-1), iz);
        if (iy-1 >= 0 || PBCy)
        {
            m_m1 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        float3 m_p1 = make_float3(0.0f, 0.0f, 0.0f);     // +1
        i_ = idx(ix, hclampy(iy+1), iz);
        if (iy+1 < Ny || PBCy)
        {
            m_p1 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        float3 m_p2 = make_float3(0.0f, 0.0f, 0.0f);     // +2
        i_ = idx(ix, hclampy(iy+2), iz);
        if (iy+2 < Ny || PBCy)
        {
            m_p2 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        dmdy = mel_stencil_derivative(m_m2, m_m1, m0, m_p1, m_p2);
    }

    // ∂m/∂z
    {
        float3 m_m2 = make_float3(0.0f, 0.0f, 0.0f);     // -2
        i_ = idx(ix, iy, lclampz(iz-2));
        if (iz-2 >= 0 || PBCz)
        {
            m_m2 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        float3 m_m1 = make_float3(0.0f, 0.0f, 0.0f);     // -1
        i_ = idx(ix, iy, lclampz(iz-1));
        if (iz-1 >= 0 || PBCz)
        {
            m_m1 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        float3 m_p1 = make_float3(0.0f, 0.0f, 0.0f);     // +1
        i_ = idx(ix, iy, hclampz(iz+1));
        if (iz+1 < Nz || PBCz)
        {
            m_p1 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        float3 m_p2 = make_float3(0.0f, 0.0f, 0.0f);     // +2
        i_ = idx(ix, iy, hclampz(iz+2));
        if (iz+2 < Nz || PBCz)
        {
            m_p2 = make_float3(mx[i_], my[i_], mz[i_]);
        }

        dmdz = mel_stencil_derivative(m_m2, m_m1, m0, m_p1, m_p2);
    }

    // convert per-cell differences into derivatives along each axis
    dmdx *= rcsx;
    dmdy *= rcsy;
    dmdz *= rcsz;

    float B1 = amul(B1_, B1_mul, I);
    float B2 = amul(B2_, B2_mul, I);

    fx[I] = 2.0f*B1*m0.x*dmdx.x + B2*(m0.x*(dmdy.y + dmdz.z) + m0.y*dmdy.x + m0.z*dmdz.x);
    fy[I] = 2.0f*B1*m0.y*dmdy.y + B2*(m0.x*dmdx.y + m0.y*(dmdx.x + dmdz.z) + m0.z*dmdz.y);
    fz[I] = 2.0f*B1*m0.z*dmdz.z + B2*(m0.x*dmdx.z + m0.y*dmdy.z + m0.z*(dmdx.x + dmdy.y));
}
mumax3-3.10/cuda/magnetoelasticforce_wrapper.go000066400000000000000000011133561371432437400217120ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE.
*/

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// CUDA handle for getmagnetoelasticforce kernel.
// Starts out zero and is filled in by the first call to
// k_getmagnetoelasticforce_async via fatbinLoad.
var getmagnetoelasticforce_code cu.Function

// Stores the arguments for getmagnetoelasticforce kernel invocation.
// There is one arg_* field per kernel parameter; argptr holds a pointer to
// each of those fields and is what gets handed to the kernel launch.
// The embedded mutex serializes use of this single shared argument buffer.
// NOTE(review): arg_Nx/arg_Ny/arg_Nz are Go ints while the kernel declares
// 32-bit parameters (.u32 in the PTX); where Go's int is 64 bits this
// presumably relies on little-endian layout so that the low 4 bytes are read —
// confirm before porting.
type getmagnetoelasticforce_args_t struct {
	arg_fx     unsafe.Pointer
	arg_fy     unsafe.Pointer
	arg_fz     unsafe.Pointer
	arg_mx     unsafe.Pointer
	arg_my     unsafe.Pointer
	arg_mz     unsafe.Pointer
	arg_B1_    unsafe.Pointer
	arg_B1_mul float32
	arg_B2_    unsafe.Pointer
	arg_B2_mul float32
	arg_rcsx   float32
	arg_rcsy   float32
	arg_rcsz   float32
	arg_Nx     int
	arg_Ny     int
	arg_Nz     int
	arg_PBC    byte
	argptr     [17]unsafe.Pointer
	sync.Mutex
}

// Stores the arguments for getmagnetoelasticforce kernel invocation
var getmagnetoelasticforce_args getmagnetoelasticforce_args_t

func init() {
	// CUDA driver kernel call wants pointers to arguments, set them up once.
	// The slot order 0..16 follows the field order of the struct above, which
	// in turn follows the kernel's parameter order (fx ... PBC).
	getmagnetoelasticforce_args.argptr[0] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_fx)
	getmagnetoelasticforce_args.argptr[1] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_fy)
	getmagnetoelasticforce_args.argptr[2] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_fz)
	getmagnetoelasticforce_args.argptr[3] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_mx)
	getmagnetoelasticforce_args.argptr[4] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_my)
	getmagnetoelasticforce_args.argptr[5] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_mz)
	getmagnetoelasticforce_args.argptr[6] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_B1_)
	getmagnetoelasticforce_args.argptr[7] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_B1_mul)
	getmagnetoelasticforce_args.argptr[8] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_B2_)
	getmagnetoelasticforce_args.argptr[9] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_B2_mul)
	getmagnetoelasticforce_args.argptr[10] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_rcsx)
	getmagnetoelasticforce_args.argptr[11] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_rcsy)
	getmagnetoelasticforce_args.argptr[12] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_rcsz)
	getmagnetoelasticforce_args.argptr[13] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_Nx)
	getmagnetoelasticforce_args.argptr[14] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_Ny)
	getmagnetoelasticforce_args.argptr[15] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_Nz)
	getmagnetoelasticforce_args.argptr[16] = unsafe.Pointer(&getmagnetoelasticforce_args.arg_PBC)
}

// Wrapper for getmagnetoelasticforce CUDA kernel, asynchronous.
//
// Copies the given values into the shared argument buffer and launches the
// kernel on stream0 with the grid and block dimensions taken from cfg.
// The argument buffer stays locked until this function returns.
// When the package-level Synchronous flag is set, Sync() is called before and
// after the launch and the call is bracketed by timer.Start/timer.Stop under
// the name "getmagnetoelasticforce".
func k_getmagnetoelasticforce_async(fx unsafe.Pointer, fy unsafe.Pointer, fz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, B1_ unsafe.Pointer, B1_mul float32, B2_ unsafe.Pointer, B2_mul float32, rcsx float32, rcsy float32, rcsz float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("getmagnetoelasticforce")
	}

	getmagnetoelasticforce_args.Lock()
	defer getmagnetoelasticforce_args.Unlock()

	// Load the kernel on first use; a zero handle means "not loaded yet".
	if getmagnetoelasticforce_code == 0 {
		getmagnetoelasticforce_code = fatbinLoad(getmagnetoelasticforce_map, "getmagnetoelasticforce")
	}

	// argptr already points at these fields (see init), so assigning the
	// fields is all that is needed to update the launch arguments.
	getmagnetoelasticforce_args.arg_fx = fx
	getmagnetoelasticforce_args.arg_fy = fy
	getmagnetoelasticforce_args.arg_fz = fz
	getmagnetoelasticforce_args.arg_mx = mx
	getmagnetoelasticforce_args.arg_my = my
	getmagnetoelasticforce_args.arg_mz = mz
	getmagnetoelasticforce_args.arg_B1_ = B1_
	getmagnetoelasticforce_args.arg_B1_mul = B1_mul
	getmagnetoelasticforce_args.arg_B2_ = B2_
	getmagnetoelasticforce_args.arg_B2_mul = B2_mul
	getmagnetoelasticforce_args.arg_rcsx = rcsx
	getmagnetoelasticforce_args.arg_rcsy = rcsy
	getmagnetoelasticforce_args.arg_rcsz = rcsz
	getmagnetoelasticforce_args.arg_Nx = Nx
	getmagnetoelasticforce_args.arg_Ny = Ny
	getmagnetoelasticforce_args.arg_Nz = Nz
	getmagnetoelasticforce_args.arg_PBC = PBC

	args := getmagnetoelasticforce_args.argptr[:]
	// NOTE(review): the literal 0 between the block dimensions and stream0 is
	// presumably the dynamic shared-memory size — confirm against cu.LaunchKernel.
	cu.LaunchKernel(getmagnetoelasticforce_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("getmagnetoelasticforce")
	}
}

// maps compute capability on PTX code for getmagnetoelasticforce kernel.
// Key 0 maps to the empty string; every other key is a compute capability
// (30 ... 75) mapped to the PTX constant built for that target.
var getmagnetoelasticforce_map = map[int]string{0: "", 30: getmagnetoelasticforce_ptx_30, 32: getmagnetoelasticforce_ptx_32, 35: getmagnetoelasticforce_ptx_35, 37: getmagnetoelasticforce_ptx_37, 50: getmagnetoelasticforce_ptx_50, 52: getmagnetoelasticforce_ptx_52, 53: getmagnetoelasticforce_ptx_53, 60: getmagnetoelasticforce_ptx_60, 61: getmagnetoelasticforce_ptx_61, 62: getmagnetoelasticforce_ptx_62, 70: getmagnetoelasticforce_ptx_70, 72: getmagnetoelasticforce_ptx_72, 75: getmagnetoelasticforce_ptx_75}

// getmagnetoelasticforce PTX code for various compute capabilities.
const (
	getmagnetoelasticforce_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<113>; .reg .b16 %rs<19>; .reg .f32 %f<414>; .reg .b32 %r<226>; .reg .b64 %rd<89>; ld.param.u64 %rd4, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_2]; ld.param.u64 %rd7, [getmagnetoelasticforce_param_3];
ld.param.u64 %rd8, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_6]; ld.param.f32 %f412, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_8]; ld.param.f32 %f413, [getmagnetoelasticforce_param_9]; ld.param.f32 %f157, [getmagnetoelasticforce_param_10]; ld.param.f32 %f158, [getmagnetoelasticforce_param_11]; ld.param.f32 %f159, [getmagnetoelasticforce_param_12]; ld.param.u32 %r56, [getmagnetoelasticforce_param_13]; ld.param.u32 %r57, [getmagnetoelasticforce_param_14]; ld.param.u32 %r58, [getmagnetoelasticforce_param_15]; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; cvta.to.global.u64 %rd1, %rd9; cvta.to.global.u64 %rd2, %rd8; cvta.to.global.u64 %rd3, %rd7; mov.u32 %r59, %ntid.x; mov.u32 %r60, %ctaid.x; mov.u32 %r61, %tid.x; mad.lo.s32 %r1, %r59, %r60, %r61; mov.u32 %r62, %ntid.y; mov.u32 %r63, %ctaid.y; mov.u32 %r64, %tid.y; mad.lo.s32 %r2, %r62, %r63, %r64; mov.u32 %r65, %ntid.z; mov.u32 %r66, %ctaid.z; mov.u32 %r67, %tid.z; mad.lo.s32 %r3, %r65, %r66, %r67; setp.ge.s32 %p4, %r2, %r57; setp.ge.s32 %p5, %r1, %r56; or.pred %p6, %p4, %p5; setp.ge.s32 %p7, %r3, %r58; or.pred %p8, %p6, %p7; @%p8 bra BB0_108; mul.lo.s32 %r4, %r3, %r57; add.s32 %r68, %r4, %r2; mul.lo.s32 %r5, %r68, %r56; add.s32 %r69, %r5, %r1; mul.wide.s32 %rd12, %r69, 4; add.s64 %rd13, %rd3, %rd12; ld.global.f32 %f1, [%rd13]; add.s64 %rd14, %rd2, %rd12; ld.global.f32 %f2, [%rd14]; add.s64 %rd15, %rd1, %rd12; ld.global.f32 %f3, [%rd15]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r6, %r1, -2; @%p9 bra BB0_3; rem.s32 %r70, %r6, %r56; add.s32 %r71, %r70, %r56; rem.s32 %r214, %r71, %r56; bra.uni BB0_4; BB0_3: mov.u32 %r72, 0; max.s32 %r214, %r6, %r72; BB0_4: setp.lt.s32 %p11, %r6, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p12 bra BB0_6; add.s32 %r73, %r214, %r5; mul.wide.s32 %rd16, %r73, 4; add.s64 %rd17, %rd3, 
%rd16; ld.global.f32 %f7, [%rd17]; add.s64 %rd18, %rd2, %rd16; ld.global.f32 %f8, [%rd18]; add.s64 %rd19, %rd1, %rd16; ld.global.f32 %f9, [%rd19]; BB0_6: add.s32 %r10, %r1, -1; @%p9 bra BB0_8; rem.s32 %r74, %r10, %r56; add.s32 %r75, %r74, %r56; rem.s32 %r215, %r75, %r56; bra.uni BB0_9; BB0_8: mov.u32 %r76, 0; max.s32 %r215, %r10, %r76; BB0_9: setp.lt.s32 %p14, %r10, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_11; add.s32 %r77, %r215, %r5; mul.wide.s32 %rd20, %r77, 4; add.s64 %rd21, %rd3, %rd20; ld.global.f32 %f13, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.f32 %f14, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.f32 %f15, [%rd23]; BB0_11: add.s32 %r14, %r1, 1; @%p9 bra BB0_13; rem.s32 %r78, %r14, %r56; add.s32 %r79, %r78, %r56; rem.s32 %r216, %r79, %r56; bra.uni BB0_14; BB0_13: add.s32 %r80, %r56, -1; min.s32 %r216, %r14, %r80; BB0_14: setp.ge.s32 %p18, %r14, %r56; mov.f32 %f19, 0f00000000; and.pred %p20, %p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_16; add.s32 %r81, %r216, %r5; mul.wide.s32 %rd24, %r81, 4; add.s64 %rd25, %rd3, %rd24; ld.global.f32 %f19, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.f32 %f20, [%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.f32 %f21, [%rd27]; BB0_16: add.s32 %r18, %r1, 2; @%p9 bra BB0_18; rem.s32 %r82, %r18, %r56; add.s32 %r83, %r82, %r56; rem.s32 %r217, %r83, %r56; bra.uni BB0_19; BB0_18: add.s32 %r84, %r56, -1; min.s32 %r217, %r18, %r84; BB0_19: add.s32 %r22, %r217, %r5; setp.ge.s32 %p22, %r18, %r56; mov.f32 %f25, 0f00000000; and.pred %p24, %p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_21; mul.wide.s32 %rd28, %r22, 4; add.s64 %rd29, %rd3, %rd28; ld.global.f32 %f25, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.f32 %f26, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.f32 %f27, [%rd31]; BB0_21: mul.f32 %f172, %f20, %f20; fma.rn.f32 %f173, %f19, %f19, %f172; fma.rn.f32 %f28, %f21, %f21, %f173; setp.neu.f32 %p25, %f28, 
0f00000000; @%p25 bra BB0_23; mul.f32 %f177, %f14, %f14; fma.rn.f32 %f178, %f13, %f13, %f177; fma.rn.f32 %f179, %f15, %f15, %f178; setp.eq.f32 %p26, %f179, 0f00000000; mov.f32 %f379, 0f00000000; mov.f32 %f380, %f379; mov.f32 %f381, %f379; @%p26 bra BB0_35; BB0_23: mul.f32 %f180, %f8, %f8; fma.rn.f32 %f181, %f7, %f7, %f180; fma.rn.f32 %f29, %f9, %f9, %f181; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f182, %f26, %f26; fma.rn.f32 %f183, %f25, %f25, %f182; fma.rn.f32 %f30, %f27, %f27, %f183; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_25; mul.f32 %f184, %f14, %f14; fma.rn.f32 %f185, %f13, %f13, %f184; fma.rn.f32 %f186, %f15, %f15, %f185; setp.neu.f32 %p32, %f186, 0f00000000; @%p32 bra BB0_34; bra.uni BB0_25; BB0_34: sub.f32 %f215, %f19, %f13; mul.f32 %f379, %f215, 0f3F000000; sub.f32 %f216, %f20, %f14; mul.f32 %f380, %f216, 0f3F000000; sub.f32 %f217, %f21, %f15; mul.f32 %f381, %f217, 0f3F000000; bra.uni BB0_35; BB0_25: or.pred %p34, %p25, %p27; @%p34 bra BB0_27; bra.uni BB0_26; BB0_27: mul.f32 %f187, %f14, %f14; fma.rn.f32 %f188, %f13, %f13, %f187; fma.rn.f32 %f34, %f15, %f15, %f188; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, %p35, %p28; @%p37 bra BB0_29; bra.uni BB0_28; BB0_29: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_31; bra.uni BB0_30; BB0_31: setp.eq.f32 %p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_33; bra.uni BB0_32; BB0_33: sub.f32 %f206, %f19, %f13; sub.f32 %f207, %f20, %f14; sub.f32 %f208, %f21, %f15; sub.f32 %f209, %f7, %f25; mul.f32 %f210, %f209, 0f3DAAAAAB; sub.f32 %f211, %f8, %f26; mul.f32 %f212, %f211, 0f3DAAAAAB; sub.f32 %f213, %f9, %f27; mul.f32 %f214, %f213, 0f3DAAAAAB; fma.rn.f32 %f379, %f206, 0f3F2AAAAB, %f210; fma.rn.f32 %f380, %f207, 0f3F2AAAAB, %f212; fma.rn.f32 %f381, %f208, 0f3F2AAAAB, %f214; bra.uni BB0_35; BB0_26: sub.f32 %f379, %f1, %f13; sub.f32 %f380, %f2, %f14; sub.f32 %f381, %f3, 
%f15; bra.uni BB0_35; BB0_28: sub.f32 %f379, %f19, %f1; sub.f32 %f380, %f20, %f2; sub.f32 %f381, %f21, %f3; bra.uni BB0_35; BB0_30: mul.f32 %f189, %f13, 0fC0000000; fma.rn.f32 %f190, %f7, 0f3F000000, %f189; add.f32 %f191, %f14, %f14; mul.f32 %f192, %f8, 0f3F000000; sub.f32 %f193, %f192, %f191; add.f32 %f194, %f15, %f15; mul.f32 %f195, %f9, 0f3F000000; sub.f32 %f196, %f195, %f194; fma.rn.f32 %f379, %f1, 0f3FC00000, %f190; fma.rn.f32 %f380, %f2, 0f3FC00000, %f193; fma.rn.f32 %f381, %f3, 0f3FC00000, %f196; bra.uni BB0_35; BB0_32: mul.f32 %f197, %f25, 0fBF000000; fma.rn.f32 %f198, %f19, 0f40000000, %f197; mul.f32 %f199, %f26, 0fBF000000; fma.rn.f32 %f200, %f20, 0f40000000, %f199; mul.f32 %f201, %f27, 0fBF000000; fma.rn.f32 %f202, %f21, 0f40000000, %f201; mul.f32 %f203, %f1, 0f3FC00000; sub.f32 %f379, %f198, %f203; mul.f32 %f204, %f2, 0f3FC00000; sub.f32 %f380, %f200, %f204; mul.f32 %f205, %f3, 0f3FC00000; sub.f32 %f381, %f202, %f205; BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra BB0_37; rem.s32 %r85, %r23, %r57; add.s32 %r86, %r85, %r57; rem.s32 %r218, %r86, %r57; bra.uni BB0_38; BB0_37: mov.u32 %r87, 0; max.s32 %r218, %r23, %r87; BB0_38: setp.lt.s32 %p45, %r23, 0; mov.f32 %f56, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_40; add.s32 %r88, %r218, %r4; mad.lo.s32 %r89, %r88, %r56, %r1; mul.wide.s32 %rd32, %r89, 4; add.s64 %rd33, %rd3, %rd32; ld.global.f32 %f56, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.f32 %f57, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.f32 %f58, [%rd35]; BB0_40: add.s32 %r27, %r2, -1; @%p43 bra BB0_42; rem.s32 %r90, %r27, %r57; add.s32 %r91, %r90, %r57; rem.s32 %r219, %r91, %r57; bra.uni BB0_43; BB0_42: mov.u32 %r92, 0; max.s32 %r219, %r27, %r92; BB0_43: setp.lt.s32 %p48, %r27, 0; mov.f32 %f62, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_45; add.s32 %r93, %r219, %r4; mad.lo.s32 %r94, %r93, %r56, %r1; 
mul.wide.s32 %rd36, %r94, 4; add.s64 %rd37, %rd3, %rd36; ld.global.f32 %f62, [%rd37]; add.s64 %rd38, %rd2, %rd36; ld.global.f32 %f63, [%rd38]; add.s64 %rd39, %rd1, %rd36; ld.global.f32 %f64, [%rd39]; BB0_45: add.s32 %r31, %r2, 1; @%p43 bra BB0_47; rem.s32 %r95, %r31, %r57; add.s32 %r96, %r95, %r57; rem.s32 %r220, %r96, %r57; bra.uni BB0_48; BB0_47: add.s32 %r97, %r57, -1; min.s32 %r220, %r31, %r97; BB0_48: setp.ge.s32 %p52, %r31, %r57; mov.f32 %f68, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; @%p54 bra BB0_50; add.s32 %r98, %r220, %r4; mad.lo.s32 %r99, %r98, %r56, %r1; mul.wide.s32 %rd40, %r99, 4; add.s64 %rd41, %rd3, %rd40; ld.global.f32 %f68, [%rd41]; add.s64 %rd42, %rd2, %rd40; ld.global.f32 %f69, [%rd42]; add.s64 %rd43, %rd1, %rd40; ld.global.f32 %f70, [%rd43]; BB0_50: add.s32 %r35, %r2, 2; @%p43 bra BB0_52; rem.s32 %r100, %r35, %r57; add.s32 %r101, %r100, %r57; rem.s32 %r221, %r101, %r57; bra.uni BB0_53; BB0_52: add.s32 %r102, %r57, -1; min.s32 %r221, %r35, %r102; BB0_53: add.s32 %r103, %r221, %r4; mad.lo.s32 %r39, %r103, %r56, %r1; setp.ge.s32 %p56, %r35, %r57; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p58 bra BB0_55; mul.wide.s32 %rd44, %r39, 4; add.s64 %rd45, %rd3, %rd44; ld.global.f32 %f74, [%rd45]; add.s64 %rd46, %rd2, %rd44; ld.global.f32 %f75, [%rd46]; add.s64 %rd47, %rd1, %rd44; ld.global.f32 %f76, [%rd47]; BB0_55: mul.f32 %f230, %f69, %f69; fma.rn.f32 %f231, %f68, %f68, %f230; fma.rn.f32 %f77, %f70, %f70, %f231; setp.neu.f32 %p59, %f77, 0f00000000; @%p59 bra BB0_57; mul.f32 %f235, %f63, %f63; fma.rn.f32 %f236, %f62, %f62, %f235; fma.rn.f32 %f237, %f64, %f64, %f236; setp.eq.f32 %p60, %f237, 0f00000000; mov.f32 %f394, 0f00000000; mov.f32 %f395, %f394; mov.f32 %f396, %f394; @%p60 bra BB0_69; BB0_57: mul.f32 %f238, %f57, %f57; fma.rn.f32 %f239, %f56, %f56, %f238; fma.rn.f32 %f78, %f58, %f58, %f239; setp.neu.f32 %p61, %f78, 0f00000000; mul.f32 %f240, %f75, %f75; 
fma.rn.f32 %f241, %f74, %f74, %f240; fma.rn.f32 %f79, %f76, %f76, %f241; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, %f77, 0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_59; mul.f32 %f242, %f63, %f63; fma.rn.f32 %f243, %f62, %f62, %f242; fma.rn.f32 %f244, %f64, %f64, %f243; setp.neu.f32 %p66, %f244, 0f00000000; @%p66 bra BB0_68; bra.uni BB0_59; BB0_68: sub.f32 %f273, %f68, %f62; mul.f32 %f394, %f273, 0f3F000000; sub.f32 %f274, %f69, %f63; mul.f32 %f395, %f274, 0f3F000000; sub.f32 %f275, %f70, %f64; mul.f32 %f396, %f275, 0f3F000000; bra.uni BB0_69; BB0_59: or.pred %p68, %p59, %p61; @%p68 bra BB0_61; bra.uni BB0_60; BB0_61: mul.f32 %f245, %f63, %f63; fma.rn.f32 %f246, %f62, %f62, %f245; fma.rn.f32 %f83, %f64, %f64, %f246; setp.neu.f32 %p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_63; bra.uni BB0_62; BB0_63: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_65; bra.uni BB0_64; BB0_65: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_67; bra.uni BB0_66; BB0_67: sub.f32 %f264, %f68, %f62; sub.f32 %f265, %f69, %f63; sub.f32 %f266, %f70, %f64; sub.f32 %f267, %f56, %f74; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f57, %f75; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f58, %f76; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f394, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f395, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f396, %f266, 0f3F2AAAAB, %f272; bra.uni BB0_69; BB0_60: sub.f32 %f394, %f1, %f62; sub.f32 %f395, %f2, %f63; sub.f32 %f396, %f3, %f64; bra.uni BB0_69; BB0_62: sub.f32 %f394, %f68, %f1; sub.f32 %f395, %f69, %f2; sub.f32 %f396, %f70, %f3; bra.uni BB0_69; BB0_64: mul.f32 %f247, %f62, 0fC0000000; fma.rn.f32 %f248, %f56, 0f3F000000, %f247; add.f32 %f249, %f63, %f63; mul.f32 %f250, %f57, 0f3F000000; sub.f32 %f251, %f250, %f249; add.f32 %f252, %f64, %f64; mul.f32 %f253, %f58, 0f3F000000; sub.f32 %f254, %f253, %f252; fma.rn.f32 %f394, %f1, 0f3FC00000, 
%f248; fma.rn.f32 %f395, %f2, 0f3FC00000, %f251; fma.rn.f32 %f396, %f3, 0f3FC00000, %f254; bra.uni BB0_69; BB0_66: mul.f32 %f255, %f74, 0fBF000000; fma.rn.f32 %f256, %f68, 0f40000000, %f255; mul.f32 %f257, %f75, 0fBF000000; fma.rn.f32 %f258, %f69, 0f40000000, %f257; mul.f32 %f259, %f76, 0fBF000000; fma.rn.f32 %f260, %f70, 0f40000000, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f394, %f256, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f395, %f258, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f396, %f260, %f263; BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p77, %rs3, 0; add.s32 %r40, %r3, -2; @%p77 bra BB0_71; rem.s32 %r108, %r40, %r58; add.s32 %r109, %r108, %r58; rem.s32 %r222, %r109, %r58; bra.uni BB0_72; BB0_71: mov.u32 %r110, 0; max.s32 %r222, %r40, %r110; BB0_72: setp.lt.s32 %p79, %r40, 0; mov.f32 %f105, 0f00000000; and.pred %p80, %p79, %p77; mov.f32 %f106, %f105; mov.f32 %f107, %f105; @%p80 bra BB0_74; mad.lo.s32 %r119, %r222, %r57, %r2; mad.lo.s32 %r120, %r119, %r56, %r1; mul.wide.s32 %rd49, %r120, 4; add.s64 %rd50, %rd3, %rd49; ld.global.f32 %f105, [%rd50]; add.s64 %rd52, %rd2, %rd49; ld.global.f32 %f106, [%rd52]; add.s64 %rd54, %rd1, %rd49; ld.global.f32 %f107, [%rd54]; BB0_74: add.s32 %r44, %r3, -1; @%p77 bra BB0_76; rem.s32 %r125, %r44, %r58; add.s32 %r126, %r125, %r58; rem.s32 %r223, %r126, %r58; bra.uni BB0_77; BB0_76: mov.u32 %r127, 0; max.s32 %r223, %r44, %r127; BB0_77: setp.lt.s32 %p82, %r44, 0; mov.f32 %f111, 0f00000000; and.pred %p84, %p82, %p77; mov.f32 %f112, %f111; mov.f32 %f113, %f111; @%p84 bra BB0_79; mad.lo.s32 %r136, %r223, %r57, %r2; mad.lo.s32 %r137, %r136, %r56, %r1; mul.wide.s32 %rd56, %r137, 4; add.s64 %rd57, %rd3, %rd56; ld.global.f32 %f111, [%rd57]; add.s64 %rd59, %rd2, %rd56; ld.global.f32 %f112, [%rd59]; add.s64 %rd61, %rd1, %rd56; ld.global.f32 %f113, [%rd61]; BB0_79: add.s32 %r48, %r3, 1; @%p77 bra BB0_81; rem.s32 %r142, %r48, %r58; add.s32 %r143, %r142, %r58; rem.s32 %r224, %r143, %r58; bra.uni BB0_82; BB0_81: add.s32 
%r144, %r58, -1; min.s32 %r224, %r48, %r144; BB0_82: setp.ge.s32 %p86, %r48, %r58; mov.f32 %f117, 0f00000000; and.pred %p88, %p86, %p77; mov.f32 %f118, %f117; mov.f32 %f119, %f117; @%p88 bra BB0_84; mad.lo.s32 %r153, %r224, %r57, %r2; mad.lo.s32 %r154, %r153, %r56, %r1; mul.wide.s32 %rd63, %r154, 4; add.s64 %rd64, %rd3, %rd63; ld.global.f32 %f117, [%rd64]; add.s64 %rd66, %rd2, %rd63; ld.global.f32 %f118, [%rd66]; add.s64 %rd68, %rd1, %rd63; ld.global.f32 %f119, [%rd68]; BB0_84: add.s32 %r52, %r3, 2; @%p77 bra BB0_86; rem.s32 %r159, %r52, %r58; add.s32 %r160, %r159, %r58; rem.s32 %r225, %r160, %r58; bra.uni BB0_87; BB0_86: add.s32 %r161, %r58, -1; min.s32 %r225, %r52, %r161; BB0_87: setp.ge.s32 %p90, %r52, %r58; mov.f32 %f123, 0f00000000; and.pred %p92, %p90, %p77; mov.f32 %f124, %f123; mov.f32 %f125, %f123; @%p92 bra BB0_89; mad.lo.s32 %r170, %r225, %r57, %r2; mad.lo.s32 %r171, %r170, %r56, %r1; mul.wide.s32 %rd70, %r171, 4; add.s64 %rd71, %rd3, %rd70; ld.global.f32 %f123, [%rd71]; add.s64 %rd73, %rd2, %rd70; ld.global.f32 %f124, [%rd73]; add.s64 %rd75, %rd1, %rd70; ld.global.f32 %f125, [%rd75]; BB0_89: mul.f32 %f288, %f118, %f118; fma.rn.f32 %f289, %f117, %f117, %f288; fma.rn.f32 %f126, %f119, %f119, %f289; setp.neu.f32 %p93, %f126, 0f00000000; @%p93 bra BB0_91; mul.f32 %f293, %f112, %f112; fma.rn.f32 %f294, %f111, %f111, %f293; fma.rn.f32 %f295, %f113, %f113, %f294; setp.eq.f32 %p94, %f295, 0f00000000; mov.f32 %f409, 0f00000000; mov.f32 %f410, %f409; mov.f32 %f411, %f409; @%p94 bra BB0_103; BB0_91: mul.f32 %f296, %f106, %f106; fma.rn.f32 %f297, %f105, %f105, %f296; fma.rn.f32 %f127, %f107, %f107, %f297; setp.neu.f32 %p95, %f127, 0f00000000; mul.f32 %f298, %f124, %f124; fma.rn.f32 %f299, %f123, %f123, %f298; fma.rn.f32 %f128, %f125, %f125, %f299; setp.neu.f32 %p96, %f128, 0f00000000; and.pred %p97, %p95, %p96; setp.eq.f32 %p98, %f126, 0f00000000; or.pred %p99, %p97, %p98; @%p99 bra BB0_93; mul.f32 %f300, %f112, %f112; fma.rn.f32 %f301, %f111, %f111, %f300; 
fma.rn.f32 %f302, %f113, %f113, %f301; setp.neu.f32 %p100, %f302, 0f00000000; @%p100 bra BB0_102; bra.uni BB0_93; BB0_102: sub.f32 %f331, %f117, %f111; mul.f32 %f409, %f331, 0f3F000000; sub.f32 %f332, %f118, %f112; mul.f32 %f410, %f332, 0f3F000000; sub.f32 %f333, %f119, %f113; mul.f32 %f411, %f333, 0f3F000000; bra.uni BB0_103; BB0_93: or.pred %p102, %p93, %p95; @%p102 bra BB0_95; bra.uni BB0_94; BB0_95: mul.f32 %f303, %f112, %f112; fma.rn.f32 %f304, %f111, %f111, %f303; fma.rn.f32 %f132, %f113, %f113, %f304; setp.neu.f32 %p103, %f132, 0f00000000; or.pred %p105, %p103, %p96; @%p105 bra BB0_97; bra.uni BB0_96; BB0_97: setp.eq.f32 %p106, %f127, 0f00000000; or.pred %p107, %p106, %p93; @%p107 bra BB0_99; bra.uni BB0_98; BB0_99: setp.eq.f32 %p109, %f128, 0f00000000; or.pred %p110, %p109, %p103; @%p110 bra BB0_101; bra.uni BB0_100; BB0_101: sub.f32 %f322, %f117, %f111; sub.f32 %f323, %f118, %f112; sub.f32 %f324, %f119, %f113; sub.f32 %f325, %f105, %f123; mul.f32 %f326, %f325, 0f3DAAAAAB; sub.f32 %f327, %f106, %f124; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f107, %f125; mul.f32 %f330, %f329, 0f3DAAAAAB; fma.rn.f32 %f409, %f322, 0f3F2AAAAB, %f326; fma.rn.f32 %f410, %f323, 0f3F2AAAAB, %f328; fma.rn.f32 %f411, %f324, 0f3F2AAAAB, %f330; bra.uni BB0_103; BB0_94: sub.f32 %f409, %f1, %f111; sub.f32 %f410, %f2, %f112; sub.f32 %f411, %f3, %f113; bra.uni BB0_103; BB0_96: sub.f32 %f409, %f117, %f1; sub.f32 %f410, %f118, %f2; sub.f32 %f411, %f119, %f3; bra.uni BB0_103; BB0_98: mul.f32 %f305, %f111, 0fC0000000; fma.rn.f32 %f306, %f105, 0f3F000000, %f305; add.f32 %f307, %f112, %f112; mul.f32 %f308, %f106, 0f3F000000; sub.f32 %f309, %f308, %f307; add.f32 %f310, %f113, %f113; mul.f32 %f311, %f107, 0f3F000000; sub.f32 %f312, %f311, %f310; fma.rn.f32 %f409, %f1, 0f3FC00000, %f306; fma.rn.f32 %f410, %f2, 0f3FC00000, %f309; fma.rn.f32 %f411, %f3, 0f3FC00000, %f312; bra.uni BB0_103; BB0_100: mul.f32 %f313, %f123, 0fBF000000; fma.rn.f32 %f314, %f117, 0f40000000, %f313; mul.f32 %f315, 
%f124, 0fBF000000; fma.rn.f32 %f316, %f118, 0f40000000, %f315; mul.f32 %f317, %f125, 0fBF000000; fma.rn.f32 %f318, %f119, 0f40000000, %f317; mul.f32 %f319, %f1, 0f3FC00000; sub.f32 %f409, %f314, %f319; mul.f32 %f320, %f2, 0f3FC00000; sub.f32 %f410, %f316, %f320; mul.f32 %f321, %f3, 0f3FC00000; sub.f32 %f411, %f318, %f321; BB0_103: setp.eq.s64 %p111, %rd10, 0; @%p111 bra BB0_105; mad.lo.s32 %r180, %r3, %r57, %r2; mad.lo.s32 %r185, %r180, %r56, %r1; cvta.to.global.u64 %rd76, %rd10; mul.wide.s32 %rd77, %r185, 4; add.s64 %rd78, %rd76, %rd77; ld.global.f32 %f334, [%rd78]; mul.f32 %f412, %f334, %f412; BB0_105: setp.eq.s64 %p112, %rd11, 0; @%p112 bra BB0_107; mad.lo.s32 %r194, %r3, %r57, %r2; mad.lo.s32 %r199, %r194, %r56, %r1; cvta.to.global.u64 %rd79, %rd11; mul.wide.s32 %rd80, %r199, 4; add.s64 %rd81, %rd79, %rd80; ld.global.f32 %f335, [%rd81]; mul.f32 %f413, %f335, %f413; BB0_107: mul.f32 %f336, %f379, %f157; mul.f32 %f337, %f395, %f158; mul.f32 %f338, %f411, %f159; add.f32 %f339, %f412, %f412; mul.f32 %f340, %f1, %f339; add.f32 %f341, %f337, %f338; mul.f32 %f342, %f1, %f341; mul.f32 %f343, %f394, %f158; fma.rn.f32 %f344, %f2, %f343, %f342; mul.f32 %f345, %f409, %f159; fma.rn.f32 %f346, %f3, %f345, %f344; mul.f32 %f347, %f346, %f413; fma.rn.f32 %f348, %f336, %f340, %f347; mad.lo.s32 %r208, %r3, %r57, %r2; mad.lo.s32 %r213, %r208, %r56, %r1; cvta.to.global.u64 %rd82, %rd4; mul.wide.s32 %rd83, %r213, 4; add.s64 %rd84, %rd82, %rd83; st.global.f32 [%rd84], %f348; mul.f32 %f349, %f2, %f339; add.f32 %f350, %f336, %f338; mul.f32 %f351, %f2, %f350; mul.f32 %f352, %f380, %f157; fma.rn.f32 %f353, %f1, %f352, %f351; mul.f32 %f354, %f410, %f159; fma.rn.f32 %f355, %f3, %f354, %f353; mul.f32 %f356, %f355, %f413; fma.rn.f32 %f357, %f337, %f349, %f356; cvta.to.global.u64 %rd85, %rd5; add.s64 %rd86, %rd85, %rd83; st.global.f32 [%rd86], %f357; mul.f32 %f358, %f3, %f339; mul.f32 %f359, %f396, %f158; mul.f32 %f360, %f2, %f359; mul.f32 %f361, %f381, %f157; fma.rn.f32 %f362, %f1, %f361, 
%f360; add.f32 %f363, %f336, %f337; fma.rn.f32 %f364, %f3, %f363, %f362; mul.f32 %f365, %f364, %f413; fma.rn.f32 %f366, %f338, %f358, %f365; cvta.to.global.u64 %rd87, %rd6; add.s64 %rd88, %rd87, %rd83; st.global.f32 [%rd88], %f366; BB0_108: ret; } ` getmagnetoelasticforce_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<113>; .reg .b16 %rs<19>; .reg .f32 %f<414>; .reg .b32 %r<136>; .reg .b64 %rd<77>; ld.param.u64 %rd4, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_2]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_3]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd7, [getmagnetoelasticforce_param_6]; ld.param.f32 %f412, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd8, [getmagnetoelasticforce_param_8]; ld.param.f32 %f413, [getmagnetoelasticforce_param_9]; ld.param.f32 %f157, [getmagnetoelasticforce_param_10]; ld.param.f32 %f158, [getmagnetoelasticforce_param_11]; ld.param.f32 %f159, [getmagnetoelasticforce_param_12]; ld.param.u32 %r55, 
[getmagnetoelasticforce_param_13]; ld.param.u32 %r56, [getmagnetoelasticforce_param_14]; ld.param.u32 %r57, [getmagnetoelasticforce_param_15]; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; cvta.to.global.u64 %rd1, %rd11; cvta.to.global.u64 %rd2, %rd10; cvta.to.global.u64 %rd3, %rd9; mov.u32 %r58, %ntid.x; mov.u32 %r59, %ctaid.x; mov.u32 %r60, %tid.x; mad.lo.s32 %r1, %r58, %r59, %r60; mov.u32 %r61, %ntid.y; mov.u32 %r62, %ctaid.y; mov.u32 %r63, %tid.y; mad.lo.s32 %r2, %r61, %r62, %r63; mov.u32 %r64, %ntid.z; mov.u32 %r65, %ctaid.z; mov.u32 %r66, %tid.z; mad.lo.s32 %r3, %r64, %r65, %r66; setp.ge.s32 %p4, %r2, %r56; setp.ge.s32 %p5, %r1, %r55; or.pred %p6, %p4, %p5; setp.ge.s32 %p7, %r3, %r57; or.pred %p8, %p6, %p7; @%p8 bra BB0_108; mul.lo.s32 %r4, %r3, %r56; add.s32 %r67, %r4, %r2; mul.lo.s32 %r5, %r67, %r55; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd12, %r6, 4; add.s64 %rd13, %rd3, %rd12; ld.global.nc.f32 %f1, [%rd13]; add.s64 %rd14, %rd2, %rd12; ld.global.nc.f32 %f2, [%rd14]; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f3, [%rd15]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r7, %r1, -2; @%p9 bra BB0_3; rem.s32 %r68, %r7, %r55; add.s32 %r69, %r68, %r55; rem.s32 %r124, %r69, %r55; bra.uni BB0_4; BB0_3: mov.u32 %r70, 0; max.s32 %r124, %r7, %r70; BB0_4: setp.lt.s32 %p11, %r7, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p12 bra BB0_6; add.s32 %r71, %r124, %r5; mul.wide.s32 %rd16, %r71, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f7, [%rd17]; add.s64 %rd18, %rd2, %rd16; ld.global.nc.f32 %f8, [%rd18]; add.s64 %rd19, %rd1, %rd16; ld.global.nc.f32 %f9, [%rd19]; BB0_6: add.s32 %r11, %r1, -1; @%p9 bra BB0_8; rem.s32 %r72, %r11, %r55; add.s32 %r73, %r72, %r55; rem.s32 %r125, %r73, %r55; bra.uni BB0_9; BB0_8: mov.u32 %r74, 0; max.s32 %r125, %r11, %r74; BB0_9: setp.lt.s32 %p14, %r11, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_11; add.s32 
%r75, %r125, %r5; mul.wide.s32 %rd20, %r75, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f13, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f14, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f15, [%rd23]; BB0_11: add.s32 %r15, %r1, 1; @%p9 bra BB0_13; rem.s32 %r76, %r15, %r55; add.s32 %r77, %r76, %r55; rem.s32 %r126, %r77, %r55; bra.uni BB0_14; BB0_13: add.s32 %r78, %r55, -1; min.s32 %r126, %r15, %r78; BB0_14: setp.ge.s32 %p18, %r15, %r55; mov.f32 %f19, 0f00000000; and.pred %p20, %p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_16; add.s32 %r79, %r126, %r5; mul.wide.s32 %rd24, %r79, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f19, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.nc.f32 %f20, [%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f21, [%rd27]; BB0_16: add.s32 %r19, %r1, 2; @%p9 bra BB0_18; rem.s32 %r80, %r19, %r55; add.s32 %r81, %r80, %r55; rem.s32 %r127, %r81, %r55; bra.uni BB0_19; BB0_18: add.s32 %r82, %r55, -1; min.s32 %r127, %r19, %r82; BB0_19: setp.ge.s32 %p22, %r19, %r55; mov.f32 %f25, 0f00000000; and.pred %p24, %p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_21; add.s32 %r83, %r127, %r5; mul.wide.s32 %rd28, %r83, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f25, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f26, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f27, [%rd31]; BB0_21: mul.f32 %f172, %f20, %f20; fma.rn.f32 %f173, %f19, %f19, %f172; fma.rn.f32 %f28, %f21, %f21, %f173; setp.neu.f32 %p25, %f28, 0f00000000; @%p25 bra BB0_23; mul.f32 %f177, %f14, %f14; fma.rn.f32 %f178, %f13, %f13, %f177; fma.rn.f32 %f179, %f15, %f15, %f178; setp.eq.f32 %p26, %f179, 0f00000000; mov.f32 %f379, 0f00000000; mov.f32 %f380, %f379; mov.f32 %f381, %f379; @%p26 bra BB0_35; BB0_23: mul.f32 %f180, %f8, %f8; fma.rn.f32 %f181, %f7, %f7, %f180; fma.rn.f32 %f29, %f9, %f9, %f181; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f182, %f26, %f26; fma.rn.f32 %f183, %f25, %f25, %f182; fma.rn.f32 
%f30, %f27, %f27, %f183; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_25; mul.f32 %f184, %f14, %f14; fma.rn.f32 %f185, %f13, %f13, %f184; fma.rn.f32 %f186, %f15, %f15, %f185; setp.neu.f32 %p32, %f186, 0f00000000; @%p32 bra BB0_34; bra.uni BB0_25; BB0_34: sub.f32 %f215, %f19, %f13; mul.f32 %f379, %f215, 0f3F000000; sub.f32 %f216, %f20, %f14; mul.f32 %f380, %f216, 0f3F000000; sub.f32 %f217, %f21, %f15; mul.f32 %f381, %f217, 0f3F000000; bra.uni BB0_35; BB0_25: or.pred %p34, %p25, %p27; @%p34 bra BB0_27; bra.uni BB0_26; BB0_27: mul.f32 %f187, %f14, %f14; fma.rn.f32 %f188, %f13, %f13, %f187; fma.rn.f32 %f34, %f15, %f15, %f188; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, %p35, %p28; @%p37 bra BB0_29; bra.uni BB0_28; BB0_29: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_31; bra.uni BB0_30; BB0_31: setp.eq.f32 %p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_33; bra.uni BB0_32; BB0_33: sub.f32 %f206, %f19, %f13; sub.f32 %f207, %f20, %f14; sub.f32 %f208, %f21, %f15; sub.f32 %f209, %f7, %f25; mul.f32 %f210, %f209, 0f3DAAAAAB; sub.f32 %f211, %f8, %f26; mul.f32 %f212, %f211, 0f3DAAAAAB; sub.f32 %f213, %f9, %f27; mul.f32 %f214, %f213, 0f3DAAAAAB; fma.rn.f32 %f379, %f206, 0f3F2AAAAB, %f210; fma.rn.f32 %f380, %f207, 0f3F2AAAAB, %f212; fma.rn.f32 %f381, %f208, 0f3F2AAAAB, %f214; bra.uni BB0_35; BB0_26: sub.f32 %f379, %f1, %f13; sub.f32 %f380, %f2, %f14; sub.f32 %f381, %f3, %f15; bra.uni BB0_35; BB0_28: sub.f32 %f379, %f19, %f1; sub.f32 %f380, %f20, %f2; sub.f32 %f381, %f21, %f3; bra.uni BB0_35; BB0_30: mul.f32 %f189, %f13, 0fC0000000; fma.rn.f32 %f190, %f7, 0f3F000000, %f189; add.f32 %f191, %f14, %f14; mul.f32 %f192, %f8, 0f3F000000; sub.f32 %f193, %f192, %f191; add.f32 %f194, %f15, %f15; mul.f32 %f195, %f9, 0f3F000000; sub.f32 %f196, %f195, %f194; fma.rn.f32 %f379, %f1, 0f3FC00000, %f190; fma.rn.f32 %f380, %f2, 0f3FC00000, %f193; 
fma.rn.f32 %f381, %f3, 0f3FC00000, %f196; bra.uni BB0_35; BB0_32: mul.f32 %f197, %f25, 0fBF000000; fma.rn.f32 %f198, %f19, 0f40000000, %f197; mul.f32 %f199, %f26, 0fBF000000; fma.rn.f32 %f200, %f20, 0f40000000, %f199; mul.f32 %f201, %f27, 0fBF000000; fma.rn.f32 %f202, %f21, 0f40000000, %f201; mul.f32 %f203, %f1, 0f3FC00000; sub.f32 %f379, %f198, %f203; mul.f32 %f204, %f2, 0f3FC00000; sub.f32 %f380, %f200, %f204; mul.f32 %f205, %f3, 0f3FC00000; sub.f32 %f381, %f202, %f205; BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra BB0_37; rem.s32 %r84, %r23, %r56; add.s32 %r85, %r84, %r56; rem.s32 %r128, %r85, %r56; bra.uni BB0_38; BB0_37: mov.u32 %r86, 0; max.s32 %r128, %r23, %r86; BB0_38: setp.lt.s32 %p45, %r23, 0; mov.f32 %f56, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_40; add.s32 %r87, %r128, %r4; mad.lo.s32 %r88, %r87, %r55, %r1; mul.wide.s32 %rd32, %r88, 4; add.s64 %rd33, %rd3, %rd32; ld.global.nc.f32 %f56, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.nc.f32 %f57, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f58, [%rd35]; BB0_40: add.s32 %r27, %r2, -1; @%p43 bra BB0_42; rem.s32 %r89, %r27, %r56; add.s32 %r90, %r89, %r56; rem.s32 %r129, %r90, %r56; bra.uni BB0_43; BB0_42: mov.u32 %r91, 0; max.s32 %r129, %r27, %r91; BB0_43: setp.lt.s32 %p48, %r27, 0; mov.f32 %f62, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_45; add.s32 %r92, %r129, %r4; mad.lo.s32 %r93, %r92, %r55, %r1; mul.wide.s32 %rd36, %r93, 4; add.s64 %rd37, %rd3, %rd36; ld.global.nc.f32 %f62, [%rd37]; add.s64 %rd38, %rd2, %rd36; ld.global.nc.f32 %f63, [%rd38]; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f64, [%rd39]; BB0_45: add.s32 %r31, %r2, 1; @%p43 bra BB0_47; rem.s32 %r94, %r31, %r56; add.s32 %r95, %r94, %r56; rem.s32 %r130, %r95, %r56; bra.uni BB0_48; BB0_47: add.s32 %r96, %r56, -1; min.s32 %r130, %r31, %r96; BB0_48: setp.ge.s32 %p52, %r31, %r56; mov.f32 
%f68, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; @%p54 bra BB0_50; add.s32 %r97, %r130, %r4; mad.lo.s32 %r98, %r97, %r55, %r1; mul.wide.s32 %rd40, %r98, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f68, [%rd41]; add.s64 %rd42, %rd2, %rd40; ld.global.nc.f32 %f69, [%rd42]; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f70, [%rd43]; BB0_50: add.s32 %r35, %r2, 2; @%p43 bra BB0_52; rem.s32 %r99, %r35, %r56; add.s32 %r100, %r99, %r56; rem.s32 %r131, %r100, %r56; bra.uni BB0_53; BB0_52: add.s32 %r101, %r56, -1; min.s32 %r131, %r35, %r101; BB0_53: setp.ge.s32 %p56, %r35, %r56; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p58 bra BB0_55; add.s32 %r102, %r131, %r4; mad.lo.s32 %r103, %r102, %r55, %r1; mul.wide.s32 %rd44, %r103, 4; add.s64 %rd45, %rd3, %rd44; ld.global.nc.f32 %f74, [%rd45]; add.s64 %rd46, %rd2, %rd44; ld.global.nc.f32 %f75, [%rd46]; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f76, [%rd47]; BB0_55: mul.f32 %f230, %f69, %f69; fma.rn.f32 %f231, %f68, %f68, %f230; fma.rn.f32 %f77, %f70, %f70, %f231; setp.neu.f32 %p59, %f77, 0f00000000; @%p59 bra BB0_57; mul.f32 %f235, %f63, %f63; fma.rn.f32 %f236, %f62, %f62, %f235; fma.rn.f32 %f237, %f64, %f64, %f236; setp.eq.f32 %p60, %f237, 0f00000000; mov.f32 %f394, 0f00000000; mov.f32 %f395, %f394; mov.f32 %f396, %f394; @%p60 bra BB0_69; BB0_57: mul.f32 %f238, %f57, %f57; fma.rn.f32 %f239, %f56, %f56, %f238; fma.rn.f32 %f78, %f58, %f58, %f239; setp.neu.f32 %p61, %f78, 0f00000000; mul.f32 %f240, %f75, %f75; fma.rn.f32 %f241, %f74, %f74, %f240; fma.rn.f32 %f79, %f76, %f76, %f241; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, %f77, 0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_59; mul.f32 %f242, %f63, %f63; fma.rn.f32 %f243, %f62, %f62, %f242; fma.rn.f32 %f244, %f64, %f64, %f243; setp.neu.f32 %p66, %f244, 0f00000000; @%p66 bra BB0_68; bra.uni BB0_59; BB0_68: sub.f32 %f273, %f68, %f62; mul.f32 
%f394, %f273, 0f3F000000; sub.f32 %f274, %f69, %f63; mul.f32 %f395, %f274, 0f3F000000; sub.f32 %f275, %f70, %f64; mul.f32 %f396, %f275, 0f3F000000; bra.uni BB0_69; BB0_59: or.pred %p68, %p59, %p61; @%p68 bra BB0_61; bra.uni BB0_60; BB0_61: mul.f32 %f245, %f63, %f63; fma.rn.f32 %f246, %f62, %f62, %f245; fma.rn.f32 %f83, %f64, %f64, %f246; setp.neu.f32 %p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_63; bra.uni BB0_62; BB0_63: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_65; bra.uni BB0_64; BB0_65: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_67; bra.uni BB0_66; BB0_67: sub.f32 %f264, %f68, %f62; sub.f32 %f265, %f69, %f63; sub.f32 %f266, %f70, %f64; sub.f32 %f267, %f56, %f74; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f57, %f75; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f58, %f76; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f394, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f395, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f396, %f266, 0f3F2AAAAB, %f272; bra.uni BB0_69; BB0_60: sub.f32 %f394, %f1, %f62; sub.f32 %f395, %f2, %f63; sub.f32 %f396, %f3, %f64; bra.uni BB0_69; BB0_62: sub.f32 %f394, %f68, %f1; sub.f32 %f395, %f69, %f2; sub.f32 %f396, %f70, %f3; bra.uni BB0_69; BB0_64: mul.f32 %f247, %f62, 0fC0000000; fma.rn.f32 %f248, %f56, 0f3F000000, %f247; add.f32 %f249, %f63, %f63; mul.f32 %f250, %f57, 0f3F000000; sub.f32 %f251, %f250, %f249; add.f32 %f252, %f64, %f64; mul.f32 %f253, %f58, 0f3F000000; sub.f32 %f254, %f253, %f252; fma.rn.f32 %f394, %f1, 0f3FC00000, %f248; fma.rn.f32 %f395, %f2, 0f3FC00000, %f251; fma.rn.f32 %f396, %f3, 0f3FC00000, %f254; bra.uni BB0_69; BB0_66: mul.f32 %f255, %f74, 0fBF000000; fma.rn.f32 %f256, %f68, 0f40000000, %f255; mul.f32 %f257, %f75, 0fBF000000; fma.rn.f32 %f258, %f69, 0f40000000, %f257; mul.f32 %f259, %f76, 0fBF000000; fma.rn.f32 %f260, %f70, 0f40000000, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f394, %f256, %f261; mul.f32 %f262, %f2, 
0f3FC00000; sub.f32 %f395, %f258, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f396, %f260, %f263; BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p77, %rs3, 0; add.s32 %r39, %r3, -2; @%p77 bra BB0_71; rem.s32 %r104, %r39, %r57; add.s32 %r105, %r104, %r57; rem.s32 %r132, %r105, %r57; bra.uni BB0_72; BB0_71: mov.u32 %r106, 0; max.s32 %r132, %r39, %r106; BB0_72: setp.lt.s32 %p79, %r39, 0; mov.f32 %f105, 0f00000000; and.pred %p80, %p79, %p77; mov.f32 %f106, %f105; mov.f32 %f107, %f105; @%p80 bra BB0_74; mad.lo.s32 %r107, %r132, %r56, %r2; mad.lo.s32 %r108, %r107, %r55, %r1; mul.wide.s32 %rd48, %r108, 4; add.s64 %rd49, %rd3, %rd48; ld.global.nc.f32 %f105, [%rd49]; add.s64 %rd50, %rd2, %rd48; ld.global.nc.f32 %f106, [%rd50]; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f107, [%rd51]; BB0_74: add.s32 %r43, %r3, -1; @%p77 bra BB0_76; rem.s32 %r109, %r43, %r57; add.s32 %r110, %r109, %r57; rem.s32 %r133, %r110, %r57; bra.uni BB0_77; BB0_76: mov.u32 %r111, 0; max.s32 %r133, %r43, %r111; BB0_77: setp.lt.s32 %p82, %r43, 0; mov.f32 %f111, 0f00000000; and.pred %p84, %p82, %p77; mov.f32 %f112, %f111; mov.f32 %f113, %f111; @%p84 bra BB0_79; mad.lo.s32 %r112, %r133, %r56, %r2; mad.lo.s32 %r113, %r112, %r55, %r1; mul.wide.s32 %rd52, %r113, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f111, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f112, [%rd54]; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f113, [%rd55]; BB0_79: add.s32 %r47, %r3, 1; @%p77 bra BB0_81; rem.s32 %r114, %r47, %r57; add.s32 %r115, %r114, %r57; rem.s32 %r134, %r115, %r57; bra.uni BB0_82; BB0_81: add.s32 %r116, %r57, -1; min.s32 %r134, %r47, %r116; BB0_82: setp.ge.s32 %p86, %r47, %r57; mov.f32 %f117, 0f00000000; and.pred %p88, %p86, %p77; mov.f32 %f118, %f117; mov.f32 %f119, %f117; @%p88 bra BB0_84; mad.lo.s32 %r117, %r134, %r56, %r2; mad.lo.s32 %r118, %r117, %r55, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f117, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 
%f118, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f119, [%rd59]; BB0_84: add.s32 %r51, %r3, 2; @%p77 bra BB0_86; rem.s32 %r119, %r51, %r57; add.s32 %r120, %r119, %r57; rem.s32 %r135, %r120, %r57; bra.uni BB0_87; BB0_86: add.s32 %r121, %r57, -1; min.s32 %r135, %r51, %r121; BB0_87: setp.ge.s32 %p90, %r51, %r57; mov.f32 %f123, 0f00000000; and.pred %p92, %p90, %p77; mov.f32 %f124, %f123; mov.f32 %f125, %f123; @%p92 bra BB0_89; mad.lo.s32 %r122, %r135, %r56, %r2; mad.lo.s32 %r123, %r122, %r55, %r1; mul.wide.s32 %rd60, %r123, 4; add.s64 %rd61, %rd3, %rd60; ld.global.nc.f32 %f123, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f124, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f125, [%rd63]; BB0_89: mul.f32 %f288, %f118, %f118; fma.rn.f32 %f289, %f117, %f117, %f288; fma.rn.f32 %f126, %f119, %f119, %f289; setp.neu.f32 %p93, %f126, 0f00000000; @%p93 bra BB0_91; mul.f32 %f293, %f112, %f112; fma.rn.f32 %f294, %f111, %f111, %f293; fma.rn.f32 %f295, %f113, %f113, %f294; setp.eq.f32 %p94, %f295, 0f00000000; mov.f32 %f409, 0f00000000; mov.f32 %f410, %f409; mov.f32 %f411, %f409; @%p94 bra BB0_103; BB0_91: mul.f32 %f296, %f106, %f106; fma.rn.f32 %f297, %f105, %f105, %f296; fma.rn.f32 %f127, %f107, %f107, %f297; setp.neu.f32 %p95, %f127, 0f00000000; mul.f32 %f298, %f124, %f124; fma.rn.f32 %f299, %f123, %f123, %f298; fma.rn.f32 %f128, %f125, %f125, %f299; setp.neu.f32 %p96, %f128, 0f00000000; and.pred %p97, %p95, %p96; setp.eq.f32 %p98, %f126, 0f00000000; or.pred %p99, %p97, %p98; @%p99 bra BB0_93; mul.f32 %f300, %f112, %f112; fma.rn.f32 %f301, %f111, %f111, %f300; fma.rn.f32 %f302, %f113, %f113, %f301; setp.neu.f32 %p100, %f302, 0f00000000; @%p100 bra BB0_102; bra.uni BB0_93; BB0_102: sub.f32 %f331, %f117, %f111; mul.f32 %f409, %f331, 0f3F000000; sub.f32 %f332, %f118, %f112; mul.f32 %f410, %f332, 0f3F000000; sub.f32 %f333, %f119, %f113; mul.f32 %f411, %f333, 0f3F000000; bra.uni BB0_103; BB0_93: or.pred %p102, %p93, %p95; @%p102 bra BB0_95; bra.uni 
BB0_94; BB0_95: mul.f32 %f303, %f112, %f112; fma.rn.f32 %f304, %f111, %f111, %f303; fma.rn.f32 %f132, %f113, %f113, %f304; setp.neu.f32 %p103, %f132, 0f00000000; or.pred %p105, %p103, %p96; @%p105 bra BB0_97; bra.uni BB0_96; BB0_97: setp.eq.f32 %p106, %f127, 0f00000000; or.pred %p107, %p106, %p93; @%p107 bra BB0_99; bra.uni BB0_98; BB0_99: setp.eq.f32 %p109, %f128, 0f00000000; or.pred %p110, %p109, %p103; @%p110 bra BB0_101; bra.uni BB0_100; BB0_101: sub.f32 %f322, %f117, %f111; sub.f32 %f323, %f118, %f112; sub.f32 %f324, %f119, %f113; sub.f32 %f325, %f105, %f123; mul.f32 %f326, %f325, 0f3DAAAAAB; sub.f32 %f327, %f106, %f124; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f107, %f125; mul.f32 %f330, %f329, 0f3DAAAAAB; fma.rn.f32 %f409, %f322, 0f3F2AAAAB, %f326; fma.rn.f32 %f410, %f323, 0f3F2AAAAB, %f328; fma.rn.f32 %f411, %f324, 0f3F2AAAAB, %f330; bra.uni BB0_103; BB0_94: sub.f32 %f409, %f1, %f111; sub.f32 %f410, %f2, %f112; sub.f32 %f411, %f3, %f113; bra.uni BB0_103; BB0_96: sub.f32 %f409, %f117, %f1; sub.f32 %f410, %f118, %f2; sub.f32 %f411, %f119, %f3; bra.uni BB0_103; BB0_98: mul.f32 %f305, %f111, 0fC0000000; fma.rn.f32 %f306, %f105, 0f3F000000, %f305; add.f32 %f307, %f112, %f112; mul.f32 %f308, %f106, 0f3F000000; sub.f32 %f309, %f308, %f307; add.f32 %f310, %f113, %f113; mul.f32 %f311, %f107, 0f3F000000; sub.f32 %f312, %f311, %f310; fma.rn.f32 %f409, %f1, 0f3FC00000, %f306; fma.rn.f32 %f410, %f2, 0f3FC00000, %f309; fma.rn.f32 %f411, %f3, 0f3FC00000, %f312; bra.uni BB0_103; BB0_100: mul.f32 %f313, %f123, 0fBF000000; fma.rn.f32 %f314, %f117, 0f40000000, %f313; mul.f32 %f315, %f124, 0fBF000000; fma.rn.f32 %f316, %f118, 0f40000000, %f315; mul.f32 %f317, %f125, 0fBF000000; fma.rn.f32 %f318, %f119, 0f40000000, %f317; mul.f32 %f319, %f1, 0f3FC00000; sub.f32 %f409, %f314, %f319; mul.f32 %f320, %f2, 0f3FC00000; sub.f32 %f410, %f316, %f320; mul.f32 %f321, %f3, 0f3FC00000; sub.f32 %f411, %f318, %f321; BB0_103: setp.eq.s64 %p111, %rd7, 0; @%p111 bra BB0_105; 
cvta.to.global.u64 %rd64, %rd7; add.s64 %rd66, %rd64, %rd12; ld.global.nc.f32 %f334, [%rd66]; mul.f32 %f412, %f334, %f412; BB0_105: setp.eq.s64 %p112, %rd8, 0; @%p112 bra BB0_107; cvta.to.global.u64 %rd67, %rd8; add.s64 %rd69, %rd67, %rd12; ld.global.nc.f32 %f335, [%rd69]; mul.f32 %f413, %f335, %f413; BB0_107: cvta.to.global.u64 %rd70, %rd6; cvta.to.global.u64 %rd71, %rd5; cvta.to.global.u64 %rd72, %rd4; mul.f32 %f336, %f379, %f157; mul.f32 %f337, %f395, %f158; mul.f32 %f338, %f411, %f159; add.f32 %f339, %f412, %f412; mul.f32 %f340, %f1, %f339; add.f32 %f341, %f337, %f338; mul.f32 %f342, %f1, %f341; mul.f32 %f343, %f394, %f158; fma.rn.f32 %f344, %f2, %f343, %f342; mul.f32 %f345, %f409, %f159; fma.rn.f32 %f346, %f3, %f345, %f344; mul.f32 %f347, %f346, %f413; fma.rn.f32 %f348, %f336, %f340, %f347; add.s64 %rd74, %rd72, %rd12; st.global.f32 [%rd74], %f348; mul.f32 %f349, %f2, %f339; add.f32 %f350, %f336, %f338; mul.f32 %f351, %f2, %f350; mul.f32 %f352, %f380, %f157; fma.rn.f32 %f353, %f1, %f352, %f351; mul.f32 %f354, %f410, %f159; fma.rn.f32 %f355, %f3, %f354, %f353; mul.f32 %f356, %f355, %f413; fma.rn.f32 %f357, %f337, %f349, %f356; add.s64 %rd75, %rd71, %rd12; st.global.f32 [%rd75], %f357; mul.f32 %f358, %f3, %f339; mul.f32 %f359, %f396, %f158; mul.f32 %f360, %f2, %f359; mul.f32 %f361, %f381, %f157; fma.rn.f32 %f362, %f1, %f361, %f360; add.f32 %f363, %f336, %f337; fma.rn.f32 %f364, %f3, %f363, %f362; mul.f32 %f365, %f364, %f413; fma.rn.f32 %f366, %f338, %f358, %f365; add.s64 %rd76, %rd70, %rd12; st.global.f32 [%rd76], %f366; BB0_108: ret; } ` getmagnetoelasticforce_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 
getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<113>; .reg .b16 %rs<19>; .reg .f32 %f<414>; .reg .b32 %r<136>; .reg .b64 %rd<77>; ld.param.u64 %rd4, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_2]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_3]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd7, [getmagnetoelasticforce_param_6]; ld.param.f32 %f412, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd8, [getmagnetoelasticforce_param_8]; ld.param.f32 %f413, [getmagnetoelasticforce_param_9]; ld.param.f32 %f157, [getmagnetoelasticforce_param_10]; ld.param.f32 %f158, [getmagnetoelasticforce_param_11]; ld.param.f32 %f159, [getmagnetoelasticforce_param_12]; ld.param.u32 %r55, [getmagnetoelasticforce_param_13]; ld.param.u32 %r56, [getmagnetoelasticforce_param_14]; ld.param.u32 %r57, [getmagnetoelasticforce_param_15]; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; cvta.to.global.u64 %rd1, %rd11; cvta.to.global.u64 %rd2, %rd10; cvta.to.global.u64 %rd3, %rd9; mov.u32 %r58, %ntid.x; mov.u32 %r59, %ctaid.x; mov.u32 %r60, %tid.x; mad.lo.s32 %r1, %r58, %r59, %r60; mov.u32 %r61, %ntid.y; mov.u32 %r62, %ctaid.y; mov.u32 %r63, %tid.y; mad.lo.s32 %r2, %r61, %r62, %r63; mov.u32 %r64, %ntid.z; mov.u32 %r65, %ctaid.z; mov.u32 %r66, %tid.z; mad.lo.s32 %r3, %r64, %r65, %r66; setp.ge.s32 %p4, %r2, %r56; setp.ge.s32 %p5, %r1, %r55; or.pred %p6, %p4, %p5; setp.ge.s32 %p7, %r3, 
%r57; or.pred %p8, %p6, %p7; @%p8 bra BB0_108; mul.lo.s32 %r4, %r3, %r56; add.s32 %r67, %r4, %r2; mul.lo.s32 %r5, %r67, %r55; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd12, %r6, 4; add.s64 %rd13, %rd3, %rd12; ld.global.nc.f32 %f1, [%rd13]; add.s64 %rd14, %rd2, %rd12; ld.global.nc.f32 %f2, [%rd14]; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f3, [%rd15]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r7, %r1, -2; @%p9 bra BB0_3; rem.s32 %r68, %r7, %r55; add.s32 %r69, %r68, %r55; rem.s32 %r124, %r69, %r55; bra.uni BB0_4; BB0_3: mov.u32 %r70, 0; max.s32 %r124, %r7, %r70; BB0_4: setp.lt.s32 %p11, %r7, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p12 bra BB0_6; add.s32 %r71, %r124, %r5; mul.wide.s32 %rd16, %r71, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f7, [%rd17]; add.s64 %rd18, %rd2, %rd16; ld.global.nc.f32 %f8, [%rd18]; add.s64 %rd19, %rd1, %rd16; ld.global.nc.f32 %f9, [%rd19]; BB0_6: add.s32 %r11, %r1, -1; @%p9 bra BB0_8; rem.s32 %r72, %r11, %r55; add.s32 %r73, %r72, %r55; rem.s32 %r125, %r73, %r55; bra.uni BB0_9; BB0_8: mov.u32 %r74, 0; max.s32 %r125, %r11, %r74; BB0_9: setp.lt.s32 %p14, %r11, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_11; add.s32 %r75, %r125, %r5; mul.wide.s32 %rd20, %r75, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f13, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f14, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f15, [%rd23]; BB0_11: add.s32 %r15, %r1, 1; @%p9 bra BB0_13; rem.s32 %r76, %r15, %r55; add.s32 %r77, %r76, %r55; rem.s32 %r126, %r77, %r55; bra.uni BB0_14; BB0_13: add.s32 %r78, %r55, -1; min.s32 %r126, %r15, %r78; BB0_14: setp.ge.s32 %p18, %r15, %r55; mov.f32 %f19, 0f00000000; and.pred %p20, %p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_16; add.s32 %r79, %r126, %r5; mul.wide.s32 %rd24, %r79, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f19, [%rd25]; add.s64 
%rd26, %rd2, %rd24; ld.global.nc.f32 %f20, [%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f21, [%rd27]; BB0_16: add.s32 %r19, %r1, 2; @%p9 bra BB0_18; rem.s32 %r80, %r19, %r55; add.s32 %r81, %r80, %r55; rem.s32 %r127, %r81, %r55; bra.uni BB0_19; BB0_18: add.s32 %r82, %r55, -1; min.s32 %r127, %r19, %r82; BB0_19: setp.ge.s32 %p22, %r19, %r55; mov.f32 %f25, 0f00000000; and.pred %p24, %p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_21; add.s32 %r83, %r127, %r5; mul.wide.s32 %rd28, %r83, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f25, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f26, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f27, [%rd31]; BB0_21: mul.f32 %f172, %f20, %f20; fma.rn.f32 %f173, %f19, %f19, %f172; fma.rn.f32 %f28, %f21, %f21, %f173; setp.neu.f32 %p25, %f28, 0f00000000; @%p25 bra BB0_23; mul.f32 %f177, %f14, %f14; fma.rn.f32 %f178, %f13, %f13, %f177; fma.rn.f32 %f179, %f15, %f15, %f178; setp.eq.f32 %p26, %f179, 0f00000000; mov.f32 %f379, 0f00000000; mov.f32 %f380, %f379; mov.f32 %f381, %f379; @%p26 bra BB0_35; BB0_23: mul.f32 %f180, %f8, %f8; fma.rn.f32 %f181, %f7, %f7, %f180; fma.rn.f32 %f29, %f9, %f9, %f181; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f182, %f26, %f26; fma.rn.f32 %f183, %f25, %f25, %f182; fma.rn.f32 %f30, %f27, %f27, %f183; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_25; mul.f32 %f184, %f14, %f14; fma.rn.f32 %f185, %f13, %f13, %f184; fma.rn.f32 %f186, %f15, %f15, %f185; setp.neu.f32 %p32, %f186, 0f00000000; @%p32 bra BB0_34; bra.uni BB0_25; BB0_34: sub.f32 %f215, %f19, %f13; mul.f32 %f379, %f215, 0f3F000000; sub.f32 %f216, %f20, %f14; mul.f32 %f380, %f216, 0f3F000000; sub.f32 %f217, %f21, %f15; mul.f32 %f381, %f217, 0f3F000000; bra.uni BB0_35; BB0_25: or.pred %p34, %p25, %p27; @%p34 bra BB0_27; bra.uni BB0_26; BB0_27: mul.f32 %f187, %f14, %f14; fma.rn.f32 %f188, %f13, %f13, %f187; fma.rn.f32 
%f34, %f15, %f15, %f188; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, %p35, %p28; @%p37 bra BB0_29; bra.uni BB0_28; BB0_29: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_31; bra.uni BB0_30; BB0_31: setp.eq.f32 %p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_33; bra.uni BB0_32; BB0_33: sub.f32 %f206, %f19, %f13; sub.f32 %f207, %f20, %f14; sub.f32 %f208, %f21, %f15; sub.f32 %f209, %f7, %f25; mul.f32 %f210, %f209, 0f3DAAAAAB; sub.f32 %f211, %f8, %f26; mul.f32 %f212, %f211, 0f3DAAAAAB; sub.f32 %f213, %f9, %f27; mul.f32 %f214, %f213, 0f3DAAAAAB; fma.rn.f32 %f379, %f206, 0f3F2AAAAB, %f210; fma.rn.f32 %f380, %f207, 0f3F2AAAAB, %f212; fma.rn.f32 %f381, %f208, 0f3F2AAAAB, %f214; bra.uni BB0_35; BB0_26: sub.f32 %f379, %f1, %f13; sub.f32 %f380, %f2, %f14; sub.f32 %f381, %f3, %f15; bra.uni BB0_35; BB0_28: sub.f32 %f379, %f19, %f1; sub.f32 %f380, %f20, %f2; sub.f32 %f381, %f21, %f3; bra.uni BB0_35; BB0_30: mul.f32 %f189, %f13, 0fC0000000; fma.rn.f32 %f190, %f7, 0f3F000000, %f189; add.f32 %f191, %f14, %f14; mul.f32 %f192, %f8, 0f3F000000; sub.f32 %f193, %f192, %f191; add.f32 %f194, %f15, %f15; mul.f32 %f195, %f9, 0f3F000000; sub.f32 %f196, %f195, %f194; fma.rn.f32 %f379, %f1, 0f3FC00000, %f190; fma.rn.f32 %f380, %f2, 0f3FC00000, %f193; fma.rn.f32 %f381, %f3, 0f3FC00000, %f196; bra.uni BB0_35; BB0_32: mul.f32 %f197, %f25, 0fBF000000; fma.rn.f32 %f198, %f19, 0f40000000, %f197; mul.f32 %f199, %f26, 0fBF000000; fma.rn.f32 %f200, %f20, 0f40000000, %f199; mul.f32 %f201, %f27, 0fBF000000; fma.rn.f32 %f202, %f21, 0f40000000, %f201; mul.f32 %f203, %f1, 0f3FC00000; sub.f32 %f379, %f198, %f203; mul.f32 %f204, %f2, 0f3FC00000; sub.f32 %f380, %f200, %f204; mul.f32 %f205, %f3, 0f3FC00000; sub.f32 %f381, %f202, %f205; BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra BB0_37; rem.s32 %r84, %r23, %r56; add.s32 %r85, %r84, %r56; rem.s32 %r128, %r85, %r56; bra.uni BB0_38; BB0_37: mov.u32 %r86, 0; max.s32 
%r128, %r23, %r86; BB0_38: setp.lt.s32 %p45, %r23, 0; mov.f32 %f56, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_40; add.s32 %r87, %r128, %r4; mad.lo.s32 %r88, %r87, %r55, %r1; mul.wide.s32 %rd32, %r88, 4; add.s64 %rd33, %rd3, %rd32; ld.global.nc.f32 %f56, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.nc.f32 %f57, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f58, [%rd35]; BB0_40: add.s32 %r27, %r2, -1; @%p43 bra BB0_42; rem.s32 %r89, %r27, %r56; add.s32 %r90, %r89, %r56; rem.s32 %r129, %r90, %r56; bra.uni BB0_43; BB0_42: mov.u32 %r91, 0; max.s32 %r129, %r27, %r91; BB0_43: setp.lt.s32 %p48, %r27, 0; mov.f32 %f62, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_45; add.s32 %r92, %r129, %r4; mad.lo.s32 %r93, %r92, %r55, %r1; mul.wide.s32 %rd36, %r93, 4; add.s64 %rd37, %rd3, %rd36; ld.global.nc.f32 %f62, [%rd37]; add.s64 %rd38, %rd2, %rd36; ld.global.nc.f32 %f63, [%rd38]; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f64, [%rd39]; BB0_45: add.s32 %r31, %r2, 1; @%p43 bra BB0_47; rem.s32 %r94, %r31, %r56; add.s32 %r95, %r94, %r56; rem.s32 %r130, %r95, %r56; bra.uni BB0_48; BB0_47: add.s32 %r96, %r56, -1; min.s32 %r130, %r31, %r96; BB0_48: setp.ge.s32 %p52, %r31, %r56; mov.f32 %f68, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; @%p54 bra BB0_50; add.s32 %r97, %r130, %r4; mad.lo.s32 %r98, %r97, %r55, %r1; mul.wide.s32 %rd40, %r98, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f68, [%rd41]; add.s64 %rd42, %rd2, %rd40; ld.global.nc.f32 %f69, [%rd42]; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f70, [%rd43]; BB0_50: add.s32 %r35, %r2, 2; @%p43 bra BB0_52; rem.s32 %r99, %r35, %r56; add.s32 %r100, %r99, %r56; rem.s32 %r131, %r100, %r56; bra.uni BB0_53; BB0_52: add.s32 %r101, %r56, -1; min.s32 %r131, %r35, %r101; BB0_53: setp.ge.s32 %p56, %r35, %r56; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, 
%f74; @%p58 bra BB0_55; add.s32 %r102, %r131, %r4; mad.lo.s32 %r103, %r102, %r55, %r1; mul.wide.s32 %rd44, %r103, 4; add.s64 %rd45, %rd3, %rd44; ld.global.nc.f32 %f74, [%rd45]; add.s64 %rd46, %rd2, %rd44; ld.global.nc.f32 %f75, [%rd46]; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f76, [%rd47]; BB0_55: mul.f32 %f230, %f69, %f69; fma.rn.f32 %f231, %f68, %f68, %f230; fma.rn.f32 %f77, %f70, %f70, %f231; setp.neu.f32 %p59, %f77, 0f00000000; @%p59 bra BB0_57; mul.f32 %f235, %f63, %f63; fma.rn.f32 %f236, %f62, %f62, %f235; fma.rn.f32 %f237, %f64, %f64, %f236; setp.eq.f32 %p60, %f237, 0f00000000; mov.f32 %f394, 0f00000000; mov.f32 %f395, %f394; mov.f32 %f396, %f394; @%p60 bra BB0_69; BB0_57: mul.f32 %f238, %f57, %f57; fma.rn.f32 %f239, %f56, %f56, %f238; fma.rn.f32 %f78, %f58, %f58, %f239; setp.neu.f32 %p61, %f78, 0f00000000; mul.f32 %f240, %f75, %f75; fma.rn.f32 %f241, %f74, %f74, %f240; fma.rn.f32 %f79, %f76, %f76, %f241; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, %f77, 0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_59; mul.f32 %f242, %f63, %f63; fma.rn.f32 %f243, %f62, %f62, %f242; fma.rn.f32 %f244, %f64, %f64, %f243; setp.neu.f32 %p66, %f244, 0f00000000; @%p66 bra BB0_68; bra.uni BB0_59; BB0_68: sub.f32 %f273, %f68, %f62; mul.f32 %f394, %f273, 0f3F000000; sub.f32 %f274, %f69, %f63; mul.f32 %f395, %f274, 0f3F000000; sub.f32 %f275, %f70, %f64; mul.f32 %f396, %f275, 0f3F000000; bra.uni BB0_69; BB0_59: or.pred %p68, %p59, %p61; @%p68 bra BB0_61; bra.uni BB0_60; BB0_61: mul.f32 %f245, %f63, %f63; fma.rn.f32 %f246, %f62, %f62, %f245; fma.rn.f32 %f83, %f64, %f64, %f246; setp.neu.f32 %p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_63; bra.uni BB0_62; BB0_63: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_65; bra.uni BB0_64; BB0_65: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_67; bra.uni BB0_66; BB0_67: sub.f32 %f264, %f68, %f62; sub.f32 %f265, %f69, %f63; 
sub.f32 %f266, %f70, %f64; sub.f32 %f267, %f56, %f74; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f57, %f75; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f58, %f76; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f394, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f395, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f396, %f266, 0f3F2AAAAB, %f272; bra.uni BB0_69; BB0_60: sub.f32 %f394, %f1, %f62; sub.f32 %f395, %f2, %f63; sub.f32 %f396, %f3, %f64; bra.uni BB0_69; BB0_62: sub.f32 %f394, %f68, %f1; sub.f32 %f395, %f69, %f2; sub.f32 %f396, %f70, %f3; bra.uni BB0_69; BB0_64: mul.f32 %f247, %f62, 0fC0000000; fma.rn.f32 %f248, %f56, 0f3F000000, %f247; add.f32 %f249, %f63, %f63; mul.f32 %f250, %f57, 0f3F000000; sub.f32 %f251, %f250, %f249; add.f32 %f252, %f64, %f64; mul.f32 %f253, %f58, 0f3F000000; sub.f32 %f254, %f253, %f252; fma.rn.f32 %f394, %f1, 0f3FC00000, %f248; fma.rn.f32 %f395, %f2, 0f3FC00000, %f251; fma.rn.f32 %f396, %f3, 0f3FC00000, %f254; bra.uni BB0_69; BB0_66: mul.f32 %f255, %f74, 0fBF000000; fma.rn.f32 %f256, %f68, 0f40000000, %f255; mul.f32 %f257, %f75, 0fBF000000; fma.rn.f32 %f258, %f69, 0f40000000, %f257; mul.f32 %f259, %f76, 0fBF000000; fma.rn.f32 %f260, %f70, 0f40000000, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f394, %f256, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f395, %f258, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f396, %f260, %f263; BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p77, %rs3, 0; add.s32 %r39, %r3, -2; @%p77 bra BB0_71; rem.s32 %r104, %r39, %r57; add.s32 %r105, %r104, %r57; rem.s32 %r132, %r105, %r57; bra.uni BB0_72; BB0_71: mov.u32 %r106, 0; max.s32 %r132, %r39, %r106; BB0_72: setp.lt.s32 %p79, %r39, 0; mov.f32 %f105, 0f00000000; and.pred %p80, %p79, %p77; mov.f32 %f106, %f105; mov.f32 %f107, %f105; @%p80 bra BB0_74; mad.lo.s32 %r107, %r132, %r56, %r2; mad.lo.s32 %r108, %r107, %r55, %r1; mul.wide.s32 %rd48, %r108, 4; add.s64 %rd49, %rd3, %rd48; ld.global.nc.f32 %f105, [%rd49]; add.s64 %rd50, %rd2, %rd48; ld.global.nc.f32 
%f106, [%rd50]; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f107, [%rd51]; BB0_74: add.s32 %r43, %r3, -1; @%p77 bra BB0_76; rem.s32 %r109, %r43, %r57; add.s32 %r110, %r109, %r57; rem.s32 %r133, %r110, %r57; bra.uni BB0_77; BB0_76: mov.u32 %r111, 0; max.s32 %r133, %r43, %r111; BB0_77: setp.lt.s32 %p82, %r43, 0; mov.f32 %f111, 0f00000000; and.pred %p84, %p82, %p77; mov.f32 %f112, %f111; mov.f32 %f113, %f111; @%p84 bra BB0_79; mad.lo.s32 %r112, %r133, %r56, %r2; mad.lo.s32 %r113, %r112, %r55, %r1; mul.wide.s32 %rd52, %r113, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f111, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f112, [%rd54]; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f113, [%rd55]; BB0_79: add.s32 %r47, %r3, 1; @%p77 bra BB0_81; rem.s32 %r114, %r47, %r57; add.s32 %r115, %r114, %r57; rem.s32 %r134, %r115, %r57; bra.uni BB0_82; BB0_81: add.s32 %r116, %r57, -1; min.s32 %r134, %r47, %r116; BB0_82: setp.ge.s32 %p86, %r47, %r57; mov.f32 %f117, 0f00000000; and.pred %p88, %p86, %p77; mov.f32 %f118, %f117; mov.f32 %f119, %f117; @%p88 bra BB0_84; mad.lo.s32 %r117, %r134, %r56, %r2; mad.lo.s32 %r118, %r117, %r55, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f117, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 %f118, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f119, [%rd59]; BB0_84: add.s32 %r51, %r3, 2; @%p77 bra BB0_86; rem.s32 %r119, %r51, %r57; add.s32 %r120, %r119, %r57; rem.s32 %r135, %r120, %r57; bra.uni BB0_87; BB0_86: add.s32 %r121, %r57, -1; min.s32 %r135, %r51, %r121; BB0_87: setp.ge.s32 %p90, %r51, %r57; mov.f32 %f123, 0f00000000; and.pred %p92, %p90, %p77; mov.f32 %f124, %f123; mov.f32 %f125, %f123; @%p92 bra BB0_89; mad.lo.s32 %r122, %r135, %r56, %r2; mad.lo.s32 %r123, %r122, %r55, %r1; mul.wide.s32 %rd60, %r123, 4; add.s64 %rd61, %rd3, %rd60; ld.global.nc.f32 %f123, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f124, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f125, 
[%rd63]; BB0_89: mul.f32 %f288, %f118, %f118; fma.rn.f32 %f289, %f117, %f117, %f288; fma.rn.f32 %f126, %f119, %f119, %f289; setp.neu.f32 %p93, %f126, 0f00000000; @%p93 bra BB0_91; mul.f32 %f293, %f112, %f112; fma.rn.f32 %f294, %f111, %f111, %f293; fma.rn.f32 %f295, %f113, %f113, %f294; setp.eq.f32 %p94, %f295, 0f00000000; mov.f32 %f409, 0f00000000; mov.f32 %f410, %f409; mov.f32 %f411, %f409; @%p94 bra BB0_103; BB0_91: mul.f32 %f296, %f106, %f106; fma.rn.f32 %f297, %f105, %f105, %f296; fma.rn.f32 %f127, %f107, %f107, %f297; setp.neu.f32 %p95, %f127, 0f00000000; mul.f32 %f298, %f124, %f124; fma.rn.f32 %f299, %f123, %f123, %f298; fma.rn.f32 %f128, %f125, %f125, %f299; setp.neu.f32 %p96, %f128, 0f00000000; and.pred %p97, %p95, %p96; setp.eq.f32 %p98, %f126, 0f00000000; or.pred %p99, %p97, %p98; @%p99 bra BB0_93; mul.f32 %f300, %f112, %f112; fma.rn.f32 %f301, %f111, %f111, %f300; fma.rn.f32 %f302, %f113, %f113, %f301; setp.neu.f32 %p100, %f302, 0f00000000; @%p100 bra BB0_102; bra.uni BB0_93; BB0_102: sub.f32 %f331, %f117, %f111; mul.f32 %f409, %f331, 0f3F000000; sub.f32 %f332, %f118, %f112; mul.f32 %f410, %f332, 0f3F000000; sub.f32 %f333, %f119, %f113; mul.f32 %f411, %f333, 0f3F000000; bra.uni BB0_103; BB0_93: or.pred %p102, %p93, %p95; @%p102 bra BB0_95; bra.uni BB0_94; BB0_95: mul.f32 %f303, %f112, %f112; fma.rn.f32 %f304, %f111, %f111, %f303; fma.rn.f32 %f132, %f113, %f113, %f304; setp.neu.f32 %p103, %f132, 0f00000000; or.pred %p105, %p103, %p96; @%p105 bra BB0_97; bra.uni BB0_96; BB0_97: setp.eq.f32 %p106, %f127, 0f00000000; or.pred %p107, %p106, %p93; @%p107 bra BB0_99; bra.uni BB0_98; BB0_99: setp.eq.f32 %p109, %f128, 0f00000000; or.pred %p110, %p109, %p103; @%p110 bra BB0_101; bra.uni BB0_100; BB0_101: sub.f32 %f322, %f117, %f111; sub.f32 %f323, %f118, %f112; sub.f32 %f324, %f119, %f113; sub.f32 %f325, %f105, %f123; mul.f32 %f326, %f325, 0f3DAAAAAB; sub.f32 %f327, %f106, %f124; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f107, %f125; mul.f32 %f330, %f329, 
0f3DAAAAAB; fma.rn.f32 %f409, %f322, 0f3F2AAAAB, %f326; fma.rn.f32 %f410, %f323, 0f3F2AAAAB, %f328; fma.rn.f32 %f411, %f324, 0f3F2AAAAB, %f330; bra.uni BB0_103; BB0_94: sub.f32 %f409, %f1, %f111; sub.f32 %f410, %f2, %f112; sub.f32 %f411, %f3, %f113; bra.uni BB0_103; BB0_96: sub.f32 %f409, %f117, %f1; sub.f32 %f410, %f118, %f2; sub.f32 %f411, %f119, %f3; bra.uni BB0_103; BB0_98: mul.f32 %f305, %f111, 0fC0000000; fma.rn.f32 %f306, %f105, 0f3F000000, %f305; add.f32 %f307, %f112, %f112; mul.f32 %f308, %f106, 0f3F000000; sub.f32 %f309, %f308, %f307; add.f32 %f310, %f113, %f113; mul.f32 %f311, %f107, 0f3F000000; sub.f32 %f312, %f311, %f310; fma.rn.f32 %f409, %f1, 0f3FC00000, %f306; fma.rn.f32 %f410, %f2, 0f3FC00000, %f309; fma.rn.f32 %f411, %f3, 0f3FC00000, %f312; bra.uni BB0_103; BB0_100: mul.f32 %f313, %f123, 0fBF000000; fma.rn.f32 %f314, %f117, 0f40000000, %f313; mul.f32 %f315, %f124, 0fBF000000; fma.rn.f32 %f316, %f118, 0f40000000, %f315; mul.f32 %f317, %f125, 0fBF000000; fma.rn.f32 %f318, %f119, 0f40000000, %f317; mul.f32 %f319, %f1, 0f3FC00000; sub.f32 %f409, %f314, %f319; mul.f32 %f320, %f2, 0f3FC00000; sub.f32 %f410, %f316, %f320; mul.f32 %f321, %f3, 0f3FC00000; sub.f32 %f411, %f318, %f321; BB0_103: setp.eq.s64 %p111, %rd7, 0; @%p111 bra BB0_105; cvta.to.global.u64 %rd64, %rd7; add.s64 %rd66, %rd64, %rd12; ld.global.nc.f32 %f334, [%rd66]; mul.f32 %f412, %f334, %f412; BB0_105: setp.eq.s64 %p112, %rd8, 0; @%p112 bra BB0_107; cvta.to.global.u64 %rd67, %rd8; add.s64 %rd69, %rd67, %rd12; ld.global.nc.f32 %f335, [%rd69]; mul.f32 %f413, %f335, %f413; BB0_107: cvta.to.global.u64 %rd70, %rd6; cvta.to.global.u64 %rd71, %rd5; cvta.to.global.u64 %rd72, %rd4; mul.f32 %f336, %f379, %f157; mul.f32 %f337, %f395, %f158; mul.f32 %f338, %f411, %f159; add.f32 %f339, %f412, %f412; mul.f32 %f340, %f1, %f339; add.f32 %f341, %f337, %f338; mul.f32 %f342, %f1, %f341; mul.f32 %f343, %f394, %f158; fma.rn.f32 %f344, %f2, %f343, %f342; mul.f32 %f345, %f409, %f159; fma.rn.f32 %f346, %f3, 
%f345, %f344; mul.f32 %f347, %f346, %f413; fma.rn.f32 %f348, %f336, %f340, %f347; add.s64 %rd74, %rd72, %rd12; st.global.f32 [%rd74], %f348; mul.f32 %f349, %f2, %f339; add.f32 %f350, %f336, %f338; mul.f32 %f351, %f2, %f350; mul.f32 %f352, %f380, %f157; fma.rn.f32 %f353, %f1, %f352, %f351; mul.f32 %f354, %f410, %f159; fma.rn.f32 %f355, %f3, %f354, %f353; mul.f32 %f356, %f355, %f413; fma.rn.f32 %f357, %f337, %f349, %f356; add.s64 %rd75, %rd71, %rd12; st.global.f32 [%rd75], %f357; mul.f32 %f358, %f3, %f339; mul.f32 %f359, %f396, %f158; mul.f32 %f360, %f2, %f359; mul.f32 %f361, %f381, %f157; fma.rn.f32 %f362, %f1, %f361, %f360; add.f32 %f363, %f336, %f337; fma.rn.f32 %f364, %f3, %f363, %f362; mul.f32 %f365, %f364, %f413; fma.rn.f32 %f366, %f338, %f358, %f365; add.s64 %rd76, %rd70, %rd12; st.global.f32 [%rd76], %f366; BB0_108: ret; } ` getmagnetoelasticforce_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<113>; .reg .b16 %rs<19>; .reg .f32 %f<414>; .reg .b32 %r<136>; .reg .b64 %rd<77>; ld.param.u64 %rd4, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd6, 
[getmagnetoelasticforce_param_2]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_3]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd7, [getmagnetoelasticforce_param_6]; ld.param.f32 %f412, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd8, [getmagnetoelasticforce_param_8]; ld.param.f32 %f413, [getmagnetoelasticforce_param_9]; ld.param.f32 %f157, [getmagnetoelasticforce_param_10]; ld.param.f32 %f158, [getmagnetoelasticforce_param_11]; ld.param.f32 %f159, [getmagnetoelasticforce_param_12]; ld.param.u32 %r55, [getmagnetoelasticforce_param_13]; ld.param.u32 %r56, [getmagnetoelasticforce_param_14]; ld.param.u32 %r57, [getmagnetoelasticforce_param_15]; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; cvta.to.global.u64 %rd1, %rd11; cvta.to.global.u64 %rd2, %rd10; cvta.to.global.u64 %rd3, %rd9; mov.u32 %r58, %ntid.x; mov.u32 %r59, %ctaid.x; mov.u32 %r60, %tid.x; mad.lo.s32 %r1, %r58, %r59, %r60; mov.u32 %r61, %ntid.y; mov.u32 %r62, %ctaid.y; mov.u32 %r63, %tid.y; mad.lo.s32 %r2, %r61, %r62, %r63; mov.u32 %r64, %ntid.z; mov.u32 %r65, %ctaid.z; mov.u32 %r66, %tid.z; mad.lo.s32 %r3, %r64, %r65, %r66; setp.ge.s32 %p4, %r2, %r56; setp.ge.s32 %p5, %r1, %r55; or.pred %p6, %p4, %p5; setp.ge.s32 %p7, %r3, %r57; or.pred %p8, %p6, %p7; @%p8 bra BB0_108; mul.lo.s32 %r4, %r3, %r56; add.s32 %r67, %r4, %r2; mul.lo.s32 %r5, %r67, %r55; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd12, %r6, 4; add.s64 %rd13, %rd3, %rd12; ld.global.nc.f32 %f1, [%rd13]; add.s64 %rd14, %rd2, %rd12; ld.global.nc.f32 %f2, [%rd14]; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f3, [%rd15]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r7, %r1, -2; @%p9 bra BB0_3; rem.s32 %r68, %r7, %r55; add.s32 %r69, %r68, %r55; rem.s32 %r124, %r69, %r55; bra.uni BB0_4; BB0_3: mov.u32 %r70, 0; max.s32 %r124, %r7, %r70; BB0_4: setp.lt.s32 %p11, %r7, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; 
@%p12 bra BB0_6; add.s32 %r71, %r124, %r5; mul.wide.s32 %rd16, %r71, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f7, [%rd17]; add.s64 %rd18, %rd2, %rd16; ld.global.nc.f32 %f8, [%rd18]; add.s64 %rd19, %rd1, %rd16; ld.global.nc.f32 %f9, [%rd19]; BB0_6: add.s32 %r11, %r1, -1; @%p9 bra BB0_8; rem.s32 %r72, %r11, %r55; add.s32 %r73, %r72, %r55; rem.s32 %r125, %r73, %r55; bra.uni BB0_9; BB0_8: mov.u32 %r74, 0; max.s32 %r125, %r11, %r74; BB0_9: setp.lt.s32 %p14, %r11, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_11; add.s32 %r75, %r125, %r5; mul.wide.s32 %rd20, %r75, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f13, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f14, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f15, [%rd23]; BB0_11: add.s32 %r15, %r1, 1; @%p9 bra BB0_13; rem.s32 %r76, %r15, %r55; add.s32 %r77, %r76, %r55; rem.s32 %r126, %r77, %r55; bra.uni BB0_14; BB0_13: add.s32 %r78, %r55, -1; min.s32 %r126, %r15, %r78; BB0_14: setp.ge.s32 %p18, %r15, %r55; mov.f32 %f19, 0f00000000; and.pred %p20, %p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_16; add.s32 %r79, %r126, %r5; mul.wide.s32 %rd24, %r79, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f19, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.nc.f32 %f20, [%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f21, [%rd27]; BB0_16: add.s32 %r19, %r1, 2; @%p9 bra BB0_18; rem.s32 %r80, %r19, %r55; add.s32 %r81, %r80, %r55; rem.s32 %r127, %r81, %r55; bra.uni BB0_19; BB0_18: add.s32 %r82, %r55, -1; min.s32 %r127, %r19, %r82; BB0_19: setp.ge.s32 %p22, %r19, %r55; mov.f32 %f25, 0f00000000; and.pred %p24, %p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_21; add.s32 %r83, %r127, %r5; mul.wide.s32 %rd28, %r83, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f25, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f26, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f27, [%rd31]; BB0_21: 
mul.f32 %f172, %f20, %f20; fma.rn.f32 %f173, %f19, %f19, %f172; fma.rn.f32 %f28, %f21, %f21, %f173; setp.neu.f32 %p25, %f28, 0f00000000; @%p25 bra BB0_23; mul.f32 %f177, %f14, %f14; fma.rn.f32 %f178, %f13, %f13, %f177; fma.rn.f32 %f179, %f15, %f15, %f178; setp.eq.f32 %p26, %f179, 0f00000000; mov.f32 %f379, 0f00000000; mov.f32 %f380, %f379; mov.f32 %f381, %f379; @%p26 bra BB0_35; BB0_23: mul.f32 %f180, %f8, %f8; fma.rn.f32 %f181, %f7, %f7, %f180; fma.rn.f32 %f29, %f9, %f9, %f181; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f182, %f26, %f26; fma.rn.f32 %f183, %f25, %f25, %f182; fma.rn.f32 %f30, %f27, %f27, %f183; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_25; mul.f32 %f184, %f14, %f14; fma.rn.f32 %f185, %f13, %f13, %f184; fma.rn.f32 %f186, %f15, %f15, %f185; setp.neu.f32 %p32, %f186, 0f00000000; @%p32 bra BB0_34; bra.uni BB0_25; BB0_34: sub.f32 %f215, %f19, %f13; mul.f32 %f379, %f215, 0f3F000000; sub.f32 %f216, %f20, %f14; mul.f32 %f380, %f216, 0f3F000000; sub.f32 %f217, %f21, %f15; mul.f32 %f381, %f217, 0f3F000000; bra.uni BB0_35; BB0_25: or.pred %p34, %p25, %p27; @%p34 bra BB0_27; bra.uni BB0_26; BB0_27: mul.f32 %f187, %f14, %f14; fma.rn.f32 %f188, %f13, %f13, %f187; fma.rn.f32 %f34, %f15, %f15, %f188; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, %p35, %p28; @%p37 bra BB0_29; bra.uni BB0_28; BB0_29: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_31; bra.uni BB0_30; BB0_31: setp.eq.f32 %p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_33; bra.uni BB0_32; BB0_33: sub.f32 %f206, %f19, %f13; sub.f32 %f207, %f20, %f14; sub.f32 %f208, %f21, %f15; sub.f32 %f209, %f7, %f25; mul.f32 %f210, %f209, 0f3DAAAAAB; sub.f32 %f211, %f8, %f26; mul.f32 %f212, %f211, 0f3DAAAAAB; sub.f32 %f213, %f9, %f27; mul.f32 %f214, %f213, 0f3DAAAAAB; fma.rn.f32 %f379, %f206, 0f3F2AAAAB, %f210; fma.rn.f32 %f380, %f207, 0f3F2AAAAB, %f212; fma.rn.f32 
%f381, %f208, 0f3F2AAAAB, %f214; bra.uni BB0_35; BB0_26: sub.f32 %f379, %f1, %f13; sub.f32 %f380, %f2, %f14; sub.f32 %f381, %f3, %f15; bra.uni BB0_35; BB0_28: sub.f32 %f379, %f19, %f1; sub.f32 %f380, %f20, %f2; sub.f32 %f381, %f21, %f3; bra.uni BB0_35; BB0_30: mul.f32 %f189, %f13, 0fC0000000; fma.rn.f32 %f190, %f7, 0f3F000000, %f189; add.f32 %f191, %f14, %f14; mul.f32 %f192, %f8, 0f3F000000; sub.f32 %f193, %f192, %f191; add.f32 %f194, %f15, %f15; mul.f32 %f195, %f9, 0f3F000000; sub.f32 %f196, %f195, %f194; fma.rn.f32 %f379, %f1, 0f3FC00000, %f190; fma.rn.f32 %f380, %f2, 0f3FC00000, %f193; fma.rn.f32 %f381, %f3, 0f3FC00000, %f196; bra.uni BB0_35; BB0_32: mul.f32 %f197, %f25, 0fBF000000; fma.rn.f32 %f198, %f19, 0f40000000, %f197; mul.f32 %f199, %f26, 0fBF000000; fma.rn.f32 %f200, %f20, 0f40000000, %f199; mul.f32 %f201, %f27, 0fBF000000; fma.rn.f32 %f202, %f21, 0f40000000, %f201; mul.f32 %f203, %f1, 0f3FC00000; sub.f32 %f379, %f198, %f203; mul.f32 %f204, %f2, 0f3FC00000; sub.f32 %f380, %f200, %f204; mul.f32 %f205, %f3, 0f3FC00000; sub.f32 %f381, %f202, %f205; BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra BB0_37; rem.s32 %r84, %r23, %r56; add.s32 %r85, %r84, %r56; rem.s32 %r128, %r85, %r56; bra.uni BB0_38; BB0_37: mov.u32 %r86, 0; max.s32 %r128, %r23, %r86; BB0_38: setp.lt.s32 %p45, %r23, 0; mov.f32 %f56, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_40; add.s32 %r87, %r128, %r4; mad.lo.s32 %r88, %r87, %r55, %r1; mul.wide.s32 %rd32, %r88, 4; add.s64 %rd33, %rd3, %rd32; ld.global.nc.f32 %f56, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.nc.f32 %f57, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f58, [%rd35]; BB0_40: add.s32 %r27, %r2, -1; @%p43 bra BB0_42; rem.s32 %r89, %r27, %r56; add.s32 %r90, %r89, %r56; rem.s32 %r129, %r90, %r56; bra.uni BB0_43; BB0_42: mov.u32 %r91, 0; max.s32 %r129, %r27, %r91; BB0_43: setp.lt.s32 %p48, %r27, 0; mov.f32 %f62, 0f00000000; and.pred 
%p50, %p48, %p43; mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_45; add.s32 %r92, %r129, %r4; mad.lo.s32 %r93, %r92, %r55, %r1; mul.wide.s32 %rd36, %r93, 4; add.s64 %rd37, %rd3, %rd36; ld.global.nc.f32 %f62, [%rd37]; add.s64 %rd38, %rd2, %rd36; ld.global.nc.f32 %f63, [%rd38]; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f64, [%rd39]; BB0_45: add.s32 %r31, %r2, 1; @%p43 bra BB0_47; rem.s32 %r94, %r31, %r56; add.s32 %r95, %r94, %r56; rem.s32 %r130, %r95, %r56; bra.uni BB0_48; BB0_47: add.s32 %r96, %r56, -1; min.s32 %r130, %r31, %r96; BB0_48: setp.ge.s32 %p52, %r31, %r56; mov.f32 %f68, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; @%p54 bra BB0_50; add.s32 %r97, %r130, %r4; mad.lo.s32 %r98, %r97, %r55, %r1; mul.wide.s32 %rd40, %r98, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f68, [%rd41]; add.s64 %rd42, %rd2, %rd40; ld.global.nc.f32 %f69, [%rd42]; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f70, [%rd43]; BB0_50: add.s32 %r35, %r2, 2; @%p43 bra BB0_52; rem.s32 %r99, %r35, %r56; add.s32 %r100, %r99, %r56; rem.s32 %r131, %r100, %r56; bra.uni BB0_53; BB0_52: add.s32 %r101, %r56, -1; min.s32 %r131, %r35, %r101; BB0_53: setp.ge.s32 %p56, %r35, %r56; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p58 bra BB0_55; add.s32 %r102, %r131, %r4; mad.lo.s32 %r103, %r102, %r55, %r1; mul.wide.s32 %rd44, %r103, 4; add.s64 %rd45, %rd3, %rd44; ld.global.nc.f32 %f74, [%rd45]; add.s64 %rd46, %rd2, %rd44; ld.global.nc.f32 %f75, [%rd46]; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f76, [%rd47]; BB0_55: mul.f32 %f230, %f69, %f69; fma.rn.f32 %f231, %f68, %f68, %f230; fma.rn.f32 %f77, %f70, %f70, %f231; setp.neu.f32 %p59, %f77, 0f00000000; @%p59 bra BB0_57; mul.f32 %f235, %f63, %f63; fma.rn.f32 %f236, %f62, %f62, %f235; fma.rn.f32 %f237, %f64, %f64, %f236; setp.eq.f32 %p60, %f237, 0f00000000; mov.f32 %f394, 0f00000000; mov.f32 %f395, %f394; mov.f32 %f396, %f394; @%p60 bra BB0_69; BB0_57: mul.f32 
%f238, %f57, %f57; fma.rn.f32 %f239, %f56, %f56, %f238; fma.rn.f32 %f78, %f58, %f58, %f239; setp.neu.f32 %p61, %f78, 0f00000000; mul.f32 %f240, %f75, %f75; fma.rn.f32 %f241, %f74, %f74, %f240; fma.rn.f32 %f79, %f76, %f76, %f241; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, %f77, 0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_59; mul.f32 %f242, %f63, %f63; fma.rn.f32 %f243, %f62, %f62, %f242; fma.rn.f32 %f244, %f64, %f64, %f243; setp.neu.f32 %p66, %f244, 0f00000000; @%p66 bra BB0_68; bra.uni BB0_59; BB0_68: sub.f32 %f273, %f68, %f62; mul.f32 %f394, %f273, 0f3F000000; sub.f32 %f274, %f69, %f63; mul.f32 %f395, %f274, 0f3F000000; sub.f32 %f275, %f70, %f64; mul.f32 %f396, %f275, 0f3F000000; bra.uni BB0_69; BB0_59: or.pred %p68, %p59, %p61; @%p68 bra BB0_61; bra.uni BB0_60; BB0_61: mul.f32 %f245, %f63, %f63; fma.rn.f32 %f246, %f62, %f62, %f245; fma.rn.f32 %f83, %f64, %f64, %f246; setp.neu.f32 %p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_63; bra.uni BB0_62; BB0_63: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_65; bra.uni BB0_64; BB0_65: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_67; bra.uni BB0_66; BB0_67: sub.f32 %f264, %f68, %f62; sub.f32 %f265, %f69, %f63; sub.f32 %f266, %f70, %f64; sub.f32 %f267, %f56, %f74; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f57, %f75; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f58, %f76; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f394, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f395, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f396, %f266, 0f3F2AAAAB, %f272; bra.uni BB0_69; BB0_60: sub.f32 %f394, %f1, %f62; sub.f32 %f395, %f2, %f63; sub.f32 %f396, %f3, %f64; bra.uni BB0_69; BB0_62: sub.f32 %f394, %f68, %f1; sub.f32 %f395, %f69, %f2; sub.f32 %f396, %f70, %f3; bra.uni BB0_69; BB0_64: mul.f32 %f247, %f62, 0fC0000000; fma.rn.f32 %f248, %f56, 0f3F000000, %f247; add.f32 %f249, %f63, %f63; mul.f32 %f250, %f57, 
0f3F000000; sub.f32 %f251, %f250, %f249; add.f32 %f252, %f64, %f64; mul.f32 %f253, %f58, 0f3F000000; sub.f32 %f254, %f253, %f252; fma.rn.f32 %f394, %f1, 0f3FC00000, %f248; fma.rn.f32 %f395, %f2, 0f3FC00000, %f251; fma.rn.f32 %f396, %f3, 0f3FC00000, %f254; bra.uni BB0_69; BB0_66: mul.f32 %f255, %f74, 0fBF000000; fma.rn.f32 %f256, %f68, 0f40000000, %f255; mul.f32 %f257, %f75, 0fBF000000; fma.rn.f32 %f258, %f69, 0f40000000, %f257; mul.f32 %f259, %f76, 0fBF000000; fma.rn.f32 %f260, %f70, 0f40000000, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f394, %f256, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f395, %f258, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f396, %f260, %f263; BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p77, %rs3, 0; add.s32 %r39, %r3, -2; @%p77 bra BB0_71; rem.s32 %r104, %r39, %r57; add.s32 %r105, %r104, %r57; rem.s32 %r132, %r105, %r57; bra.uni BB0_72; BB0_71: mov.u32 %r106, 0; max.s32 %r132, %r39, %r106; BB0_72: setp.lt.s32 %p79, %r39, 0; mov.f32 %f105, 0f00000000; and.pred %p80, %p79, %p77; mov.f32 %f106, %f105; mov.f32 %f107, %f105; @%p80 bra BB0_74; mad.lo.s32 %r107, %r132, %r56, %r2; mad.lo.s32 %r108, %r107, %r55, %r1; mul.wide.s32 %rd48, %r108, 4; add.s64 %rd49, %rd3, %rd48; ld.global.nc.f32 %f105, [%rd49]; add.s64 %rd50, %rd2, %rd48; ld.global.nc.f32 %f106, [%rd50]; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f107, [%rd51]; BB0_74: add.s32 %r43, %r3, -1; @%p77 bra BB0_76; rem.s32 %r109, %r43, %r57; add.s32 %r110, %r109, %r57; rem.s32 %r133, %r110, %r57; bra.uni BB0_77; BB0_76: mov.u32 %r111, 0; max.s32 %r133, %r43, %r111; BB0_77: setp.lt.s32 %p82, %r43, 0; mov.f32 %f111, 0f00000000; and.pred %p84, %p82, %p77; mov.f32 %f112, %f111; mov.f32 %f113, %f111; @%p84 bra BB0_79; mad.lo.s32 %r112, %r133, %r56, %r2; mad.lo.s32 %r113, %r112, %r55, %r1; mul.wide.s32 %rd52, %r113, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f111, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f112, [%rd54]; add.s64 %rd55, %rd1, %rd52; 
ld.global.nc.f32 %f113, [%rd55]; BB0_79: add.s32 %r47, %r3, 1; @%p77 bra BB0_81; rem.s32 %r114, %r47, %r57; add.s32 %r115, %r114, %r57; rem.s32 %r134, %r115, %r57; bra.uni BB0_82; BB0_81: add.s32 %r116, %r57, -1; min.s32 %r134, %r47, %r116; BB0_82: setp.ge.s32 %p86, %r47, %r57; mov.f32 %f117, 0f00000000; and.pred %p88, %p86, %p77; mov.f32 %f118, %f117; mov.f32 %f119, %f117; @%p88 bra BB0_84; mad.lo.s32 %r117, %r134, %r56, %r2; mad.lo.s32 %r118, %r117, %r55, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f117, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 %f118, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f119, [%rd59]; BB0_84: add.s32 %r51, %r3, 2; @%p77 bra BB0_86; rem.s32 %r119, %r51, %r57; add.s32 %r120, %r119, %r57; rem.s32 %r135, %r120, %r57; bra.uni BB0_87; BB0_86: add.s32 %r121, %r57, -1; min.s32 %r135, %r51, %r121; BB0_87: setp.ge.s32 %p90, %r51, %r57; mov.f32 %f123, 0f00000000; and.pred %p92, %p90, %p77; mov.f32 %f124, %f123; mov.f32 %f125, %f123; @%p92 bra BB0_89; mad.lo.s32 %r122, %r135, %r56, %r2; mad.lo.s32 %r123, %r122, %r55, %r1; mul.wide.s32 %rd60, %r123, 4; add.s64 %rd61, %rd3, %rd60; ld.global.nc.f32 %f123, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f124, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f125, [%rd63]; BB0_89: mul.f32 %f288, %f118, %f118; fma.rn.f32 %f289, %f117, %f117, %f288; fma.rn.f32 %f126, %f119, %f119, %f289; setp.neu.f32 %p93, %f126, 0f00000000; @%p93 bra BB0_91; mul.f32 %f293, %f112, %f112; fma.rn.f32 %f294, %f111, %f111, %f293; fma.rn.f32 %f295, %f113, %f113, %f294; setp.eq.f32 %p94, %f295, 0f00000000; mov.f32 %f409, 0f00000000; mov.f32 %f410, %f409; mov.f32 %f411, %f409; @%p94 bra BB0_103; BB0_91: mul.f32 %f296, %f106, %f106; fma.rn.f32 %f297, %f105, %f105, %f296; fma.rn.f32 %f127, %f107, %f107, %f297; setp.neu.f32 %p95, %f127, 0f00000000; mul.f32 %f298, %f124, %f124; fma.rn.f32 %f299, %f123, %f123, %f298; fma.rn.f32 %f128, %f125, %f125, %f299; 
setp.neu.f32 %p96, %f128, 0f00000000; and.pred %p97, %p95, %p96; setp.eq.f32 %p98, %f126, 0f00000000; or.pred %p99, %p97, %p98; @%p99 bra BB0_93; mul.f32 %f300, %f112, %f112; fma.rn.f32 %f301, %f111, %f111, %f300; fma.rn.f32 %f302, %f113, %f113, %f301; setp.neu.f32 %p100, %f302, 0f00000000; @%p100 bra BB0_102; bra.uni BB0_93; BB0_102: sub.f32 %f331, %f117, %f111; mul.f32 %f409, %f331, 0f3F000000; sub.f32 %f332, %f118, %f112; mul.f32 %f410, %f332, 0f3F000000; sub.f32 %f333, %f119, %f113; mul.f32 %f411, %f333, 0f3F000000; bra.uni BB0_103; BB0_93: or.pred %p102, %p93, %p95; @%p102 bra BB0_95; bra.uni BB0_94; BB0_95: mul.f32 %f303, %f112, %f112; fma.rn.f32 %f304, %f111, %f111, %f303; fma.rn.f32 %f132, %f113, %f113, %f304; setp.neu.f32 %p103, %f132, 0f00000000; or.pred %p105, %p103, %p96; @%p105 bra BB0_97; bra.uni BB0_96; BB0_97: setp.eq.f32 %p106, %f127, 0f00000000; or.pred %p107, %p106, %p93; @%p107 bra BB0_99; bra.uni BB0_98; BB0_99: setp.eq.f32 %p109, %f128, 0f00000000; or.pred %p110, %p109, %p103; @%p110 bra BB0_101; bra.uni BB0_100; BB0_101: sub.f32 %f322, %f117, %f111; sub.f32 %f323, %f118, %f112; sub.f32 %f324, %f119, %f113; sub.f32 %f325, %f105, %f123; mul.f32 %f326, %f325, 0f3DAAAAAB; sub.f32 %f327, %f106, %f124; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f107, %f125; mul.f32 %f330, %f329, 0f3DAAAAAB; fma.rn.f32 %f409, %f322, 0f3F2AAAAB, %f326; fma.rn.f32 %f410, %f323, 0f3F2AAAAB, %f328; fma.rn.f32 %f411, %f324, 0f3F2AAAAB, %f330; bra.uni BB0_103; BB0_94: sub.f32 %f409, %f1, %f111; sub.f32 %f410, %f2, %f112; sub.f32 %f411, %f3, %f113; bra.uni BB0_103; BB0_96: sub.f32 %f409, %f117, %f1; sub.f32 %f410, %f118, %f2; sub.f32 %f411, %f119, %f3; bra.uni BB0_103; BB0_98: mul.f32 %f305, %f111, 0fC0000000; fma.rn.f32 %f306, %f105, 0f3F000000, %f305; add.f32 %f307, %f112, %f112; mul.f32 %f308, %f106, 0f3F000000; sub.f32 %f309, %f308, %f307; add.f32 %f310, %f113, %f113; mul.f32 %f311, %f107, 0f3F000000; sub.f32 %f312, %f311, %f310; fma.rn.f32 %f409, %f1, 
0f3FC00000, %f306; fma.rn.f32 %f410, %f2, 0f3FC00000, %f309; fma.rn.f32 %f411, %f3, 0f3FC00000, %f312; bra.uni BB0_103; BB0_100: mul.f32 %f313, %f123, 0fBF000000; fma.rn.f32 %f314, %f117, 0f40000000, %f313; mul.f32 %f315, %f124, 0fBF000000; fma.rn.f32 %f316, %f118, 0f40000000, %f315; mul.f32 %f317, %f125, 0fBF000000; fma.rn.f32 %f318, %f119, 0f40000000, %f317; mul.f32 %f319, %f1, 0f3FC00000; sub.f32 %f409, %f314, %f319; mul.f32 %f320, %f2, 0f3FC00000; sub.f32 %f410, %f316, %f320; mul.f32 %f321, %f3, 0f3FC00000; sub.f32 %f411, %f318, %f321; BB0_103: setp.eq.s64 %p111, %rd7, 0; @%p111 bra BB0_105; cvta.to.global.u64 %rd64, %rd7; add.s64 %rd66, %rd64, %rd12; ld.global.nc.f32 %f334, [%rd66]; mul.f32 %f412, %f334, %f412; BB0_105: setp.eq.s64 %p112, %rd8, 0; @%p112 bra BB0_107; cvta.to.global.u64 %rd67, %rd8; add.s64 %rd69, %rd67, %rd12; ld.global.nc.f32 %f335, [%rd69]; mul.f32 %f413, %f335, %f413; BB0_107: cvta.to.global.u64 %rd70, %rd6; cvta.to.global.u64 %rd71, %rd5; cvta.to.global.u64 %rd72, %rd4; mul.f32 %f336, %f379, %f157; mul.f32 %f337, %f395, %f158; mul.f32 %f338, %f411, %f159; add.f32 %f339, %f412, %f412; mul.f32 %f340, %f1, %f339; add.f32 %f341, %f337, %f338; mul.f32 %f342, %f1, %f341; mul.f32 %f343, %f394, %f158; fma.rn.f32 %f344, %f2, %f343, %f342; mul.f32 %f345, %f409, %f159; fma.rn.f32 %f346, %f3, %f345, %f344; mul.f32 %f347, %f346, %f413; fma.rn.f32 %f348, %f336, %f340, %f347; add.s64 %rd74, %rd72, %rd12; st.global.f32 [%rd74], %f348; mul.f32 %f349, %f2, %f339; add.f32 %f350, %f336, %f338; mul.f32 %f351, %f2, %f350; mul.f32 %f352, %f380, %f157; fma.rn.f32 %f353, %f1, %f352, %f351; mul.f32 %f354, %f410, %f159; fma.rn.f32 %f355, %f3, %f354, %f353; mul.f32 %f356, %f355, %f413; fma.rn.f32 %f357, %f337, %f349, %f356; add.s64 %rd75, %rd71, %rd12; st.global.f32 [%rd75], %f357; mul.f32 %f358, %f3, %f339; mul.f32 %f359, %f396, %f158; mul.f32 %f360, %f2, %f359; mul.f32 %f361, %f381, %f157; fma.rn.f32 %f362, %f1, %f361, %f360; add.f32 %f363, %f336, %f337; fma.rn.f32 
%f364, %f3, %f363, %f362; mul.f32 %f365, %f364, %f413; fma.rn.f32 %f366, %f338, %f358, %f365; add.s64 %rd76, %rd70, %rd12; st.global.f32 [%rd76], %f366; BB0_108: ret; } ` getmagnetoelasticforce_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<113>; .reg .b16 %rs<19>; .reg .f32 %f<414>; .reg .b32 %r<136>; .reg .b64 %rd<77>; ld.param.u64 %rd4, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_2]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_3]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd7, [getmagnetoelasticforce_param_6]; ld.param.f32 %f412, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd8, [getmagnetoelasticforce_param_8]; ld.param.f32 %f413, [getmagnetoelasticforce_param_9]; ld.param.f32 %f157, [getmagnetoelasticforce_param_10]; ld.param.f32 %f158, [getmagnetoelasticforce_param_11]; ld.param.f32 %f159, [getmagnetoelasticforce_param_12]; ld.param.u32 %r55, [getmagnetoelasticforce_param_13]; ld.param.u32 %r56, [getmagnetoelasticforce_param_14]; ld.param.u32 
%r57, [getmagnetoelasticforce_param_15]; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; cvta.to.global.u64 %rd1, %rd11; cvta.to.global.u64 %rd2, %rd10; cvta.to.global.u64 %rd3, %rd9; mov.u32 %r58, %ntid.x; mov.u32 %r59, %ctaid.x; mov.u32 %r60, %tid.x; mad.lo.s32 %r1, %r58, %r59, %r60; mov.u32 %r61, %ntid.y; mov.u32 %r62, %ctaid.y; mov.u32 %r63, %tid.y; mad.lo.s32 %r2, %r61, %r62, %r63; mov.u32 %r64, %ntid.z; mov.u32 %r65, %ctaid.z; mov.u32 %r66, %tid.z; mad.lo.s32 %r3, %r64, %r65, %r66; setp.ge.s32 %p4, %r2, %r56; setp.ge.s32 %p5, %r1, %r55; or.pred %p6, %p4, %p5; setp.ge.s32 %p7, %r3, %r57; or.pred %p8, %p6, %p7; @%p8 bra BB0_108; mul.lo.s32 %r4, %r3, %r56; add.s32 %r67, %r4, %r2; mul.lo.s32 %r5, %r67, %r55; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd12, %r6, 4; add.s64 %rd13, %rd3, %rd12; ld.global.nc.f32 %f1, [%rd13]; add.s64 %rd14, %rd2, %rd12; ld.global.nc.f32 %f2, [%rd14]; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f3, [%rd15]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r7, %r1, -2; @%p9 bra BB0_3; rem.s32 %r68, %r7, %r55; add.s32 %r69, %r68, %r55; rem.s32 %r124, %r69, %r55; bra.uni BB0_4; BB0_3: mov.u32 %r70, 0; max.s32 %r124, %r7, %r70; BB0_4: setp.lt.s32 %p11, %r7, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p12 bra BB0_6; add.s32 %r71, %r124, %r5; mul.wide.s32 %rd16, %r71, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f7, [%rd17]; add.s64 %rd18, %rd2, %rd16; ld.global.nc.f32 %f8, [%rd18]; add.s64 %rd19, %rd1, %rd16; ld.global.nc.f32 %f9, [%rd19]; BB0_6: add.s32 %r11, %r1, -1; @%p9 bra BB0_8; rem.s32 %r72, %r11, %r55; add.s32 %r73, %r72, %r55; rem.s32 %r125, %r73, %r55; bra.uni BB0_9; BB0_8: mov.u32 %r74, 0; max.s32 %r125, %r11, %r74; BB0_9: setp.lt.s32 %p14, %r11, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_11; add.s32 %r75, %r125, %r5; mul.wide.s32 %rd20, %r75, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f13, 
[%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f14, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f15, [%rd23]; BB0_11: add.s32 %r15, %r1, 1; @%p9 bra BB0_13; rem.s32 %r76, %r15, %r55; add.s32 %r77, %r76, %r55; rem.s32 %r126, %r77, %r55; bra.uni BB0_14; BB0_13: add.s32 %r78, %r55, -1; min.s32 %r126, %r15, %r78; BB0_14: setp.ge.s32 %p18, %r15, %r55; mov.f32 %f19, 0f00000000; and.pred %p20, %p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_16; add.s32 %r79, %r126, %r5; mul.wide.s32 %rd24, %r79, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f19, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.nc.f32 %f20, [%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f21, [%rd27]; BB0_16: add.s32 %r19, %r1, 2; @%p9 bra BB0_18; rem.s32 %r80, %r19, %r55; add.s32 %r81, %r80, %r55; rem.s32 %r127, %r81, %r55; bra.uni BB0_19; BB0_18: add.s32 %r82, %r55, -1; min.s32 %r127, %r19, %r82; BB0_19: setp.ge.s32 %p22, %r19, %r55; mov.f32 %f25, 0f00000000; and.pred %p24, %p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_21; add.s32 %r83, %r127, %r5; mul.wide.s32 %rd28, %r83, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f25, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f26, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f27, [%rd31]; BB0_21: mul.f32 %f172, %f20, %f20; fma.rn.f32 %f173, %f19, %f19, %f172; fma.rn.f32 %f28, %f21, %f21, %f173; setp.neu.f32 %p25, %f28, 0f00000000; @%p25 bra BB0_23; mul.f32 %f177, %f14, %f14; fma.rn.f32 %f178, %f13, %f13, %f177; fma.rn.f32 %f179, %f15, %f15, %f178; setp.eq.f32 %p26, %f179, 0f00000000; mov.f32 %f379, 0f00000000; mov.f32 %f380, %f379; mov.f32 %f381, %f379; @%p26 bra BB0_35; BB0_23: mul.f32 %f180, %f8, %f8; fma.rn.f32 %f181, %f7, %f7, %f180; fma.rn.f32 %f29, %f9, %f9, %f181; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f182, %f26, %f26; fma.rn.f32 %f183, %f25, %f25, %f182; fma.rn.f32 %f30, %f27, %f27, %f183; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; 
setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_25; mul.f32 %f184, %f14, %f14; fma.rn.f32 %f185, %f13, %f13, %f184; fma.rn.f32 %f186, %f15, %f15, %f185; setp.neu.f32 %p32, %f186, 0f00000000; @%p32 bra BB0_34; bra.uni BB0_25; BB0_34: sub.f32 %f215, %f19, %f13; mul.f32 %f379, %f215, 0f3F000000; sub.f32 %f216, %f20, %f14; mul.f32 %f380, %f216, 0f3F000000; sub.f32 %f217, %f21, %f15; mul.f32 %f381, %f217, 0f3F000000; bra.uni BB0_35; BB0_25: or.pred %p34, %p25, %p27; @%p34 bra BB0_27; bra.uni BB0_26; BB0_27: mul.f32 %f187, %f14, %f14; fma.rn.f32 %f188, %f13, %f13, %f187; fma.rn.f32 %f34, %f15, %f15, %f188; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, %p35, %p28; @%p37 bra BB0_29; bra.uni BB0_28; BB0_29: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_31; bra.uni BB0_30; BB0_31: setp.eq.f32 %p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_33; bra.uni BB0_32; BB0_33: sub.f32 %f206, %f19, %f13; sub.f32 %f207, %f20, %f14; sub.f32 %f208, %f21, %f15; sub.f32 %f209, %f7, %f25; mul.f32 %f210, %f209, 0f3DAAAAAB; sub.f32 %f211, %f8, %f26; mul.f32 %f212, %f211, 0f3DAAAAAB; sub.f32 %f213, %f9, %f27; mul.f32 %f214, %f213, 0f3DAAAAAB; fma.rn.f32 %f379, %f206, 0f3F2AAAAB, %f210; fma.rn.f32 %f380, %f207, 0f3F2AAAAB, %f212; fma.rn.f32 %f381, %f208, 0f3F2AAAAB, %f214; bra.uni BB0_35; BB0_26: sub.f32 %f379, %f1, %f13; sub.f32 %f380, %f2, %f14; sub.f32 %f381, %f3, %f15; bra.uni BB0_35; BB0_28: sub.f32 %f379, %f19, %f1; sub.f32 %f380, %f20, %f2; sub.f32 %f381, %f21, %f3; bra.uni BB0_35; BB0_30: mul.f32 %f189, %f13, 0fC0000000; fma.rn.f32 %f190, %f7, 0f3F000000, %f189; add.f32 %f191, %f14, %f14; mul.f32 %f192, %f8, 0f3F000000; sub.f32 %f193, %f192, %f191; add.f32 %f194, %f15, %f15; mul.f32 %f195, %f9, 0f3F000000; sub.f32 %f196, %f195, %f194; fma.rn.f32 %f379, %f1, 0f3FC00000, %f190; fma.rn.f32 %f380, %f2, 0f3FC00000, %f193; fma.rn.f32 %f381, %f3, 0f3FC00000, %f196; bra.uni BB0_35; BB0_32: mul.f32 %f197, %f25, 
0fBF000000; fma.rn.f32 %f198, %f19, 0f40000000, %f197; mul.f32 %f199, %f26, 0fBF000000; fma.rn.f32 %f200, %f20, 0f40000000, %f199; mul.f32 %f201, %f27, 0fBF000000; fma.rn.f32 %f202, %f21, 0f40000000, %f201; mul.f32 %f203, %f1, 0f3FC00000; sub.f32 %f379, %f198, %f203; mul.f32 %f204, %f2, 0f3FC00000; sub.f32 %f380, %f200, %f204; mul.f32 %f205, %f3, 0f3FC00000; sub.f32 %f381, %f202, %f205; BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra BB0_37; rem.s32 %r84, %r23, %r56; add.s32 %r85, %r84, %r56; rem.s32 %r128, %r85, %r56; bra.uni BB0_38; BB0_37: mov.u32 %r86, 0; max.s32 %r128, %r23, %r86; BB0_38: setp.lt.s32 %p45, %r23, 0; mov.f32 %f56, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_40; add.s32 %r87, %r128, %r4; mad.lo.s32 %r88, %r87, %r55, %r1; mul.wide.s32 %rd32, %r88, 4; add.s64 %rd33, %rd3, %rd32; ld.global.nc.f32 %f56, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.nc.f32 %f57, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f58, [%rd35]; BB0_40: add.s32 %r27, %r2, -1; @%p43 bra BB0_42; rem.s32 %r89, %r27, %r56; add.s32 %r90, %r89, %r56; rem.s32 %r129, %r90, %r56; bra.uni BB0_43; BB0_42: mov.u32 %r91, 0; max.s32 %r129, %r27, %r91; BB0_43: setp.lt.s32 %p48, %r27, 0; mov.f32 %f62, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_45; add.s32 %r92, %r129, %r4; mad.lo.s32 %r93, %r92, %r55, %r1; mul.wide.s32 %rd36, %r93, 4; add.s64 %rd37, %rd3, %rd36; ld.global.nc.f32 %f62, [%rd37]; add.s64 %rd38, %rd2, %rd36; ld.global.nc.f32 %f63, [%rd38]; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f64, [%rd39]; BB0_45: add.s32 %r31, %r2, 1; @%p43 bra BB0_47; rem.s32 %r94, %r31, %r56; add.s32 %r95, %r94, %r56; rem.s32 %r130, %r95, %r56; bra.uni BB0_48; BB0_47: add.s32 %r96, %r56, -1; min.s32 %r130, %r31, %r96; BB0_48: setp.ge.s32 %p52, %r31, %r56; mov.f32 %f68, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; 
@%p54 bra BB0_50; add.s32 %r97, %r130, %r4; mad.lo.s32 %r98, %r97, %r55, %r1; mul.wide.s32 %rd40, %r98, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f68, [%rd41]; add.s64 %rd42, %rd2, %rd40; ld.global.nc.f32 %f69, [%rd42]; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f70, [%rd43]; BB0_50: add.s32 %r35, %r2, 2; @%p43 bra BB0_52; rem.s32 %r99, %r35, %r56; add.s32 %r100, %r99, %r56; rem.s32 %r131, %r100, %r56; bra.uni BB0_53; BB0_52: add.s32 %r101, %r56, -1; min.s32 %r131, %r35, %r101; BB0_53: setp.ge.s32 %p56, %r35, %r56; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p58 bra BB0_55; add.s32 %r102, %r131, %r4; mad.lo.s32 %r103, %r102, %r55, %r1; mul.wide.s32 %rd44, %r103, 4; add.s64 %rd45, %rd3, %rd44; ld.global.nc.f32 %f74, [%rd45]; add.s64 %rd46, %rd2, %rd44; ld.global.nc.f32 %f75, [%rd46]; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f76, [%rd47]; BB0_55: mul.f32 %f230, %f69, %f69; fma.rn.f32 %f231, %f68, %f68, %f230; fma.rn.f32 %f77, %f70, %f70, %f231; setp.neu.f32 %p59, %f77, 0f00000000; @%p59 bra BB0_57; mul.f32 %f235, %f63, %f63; fma.rn.f32 %f236, %f62, %f62, %f235; fma.rn.f32 %f237, %f64, %f64, %f236; setp.eq.f32 %p60, %f237, 0f00000000; mov.f32 %f394, 0f00000000; mov.f32 %f395, %f394; mov.f32 %f396, %f394; @%p60 bra BB0_69; BB0_57: mul.f32 %f238, %f57, %f57; fma.rn.f32 %f239, %f56, %f56, %f238; fma.rn.f32 %f78, %f58, %f58, %f239; setp.neu.f32 %p61, %f78, 0f00000000; mul.f32 %f240, %f75, %f75; fma.rn.f32 %f241, %f74, %f74, %f240; fma.rn.f32 %f79, %f76, %f76, %f241; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, %f77, 0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_59; mul.f32 %f242, %f63, %f63; fma.rn.f32 %f243, %f62, %f62, %f242; fma.rn.f32 %f244, %f64, %f64, %f243; setp.neu.f32 %p66, %f244, 0f00000000; @%p66 bra BB0_68; bra.uni BB0_59; BB0_68: sub.f32 %f273, %f68, %f62; mul.f32 %f394, %f273, 0f3F000000; sub.f32 %f274, %f69, %f63; mul.f32 %f395, %f274, 0f3F000000; 
sub.f32 %f275, %f70, %f64; mul.f32 %f396, %f275, 0f3F000000; bra.uni BB0_69; BB0_59: or.pred %p68, %p59, %p61; @%p68 bra BB0_61; bra.uni BB0_60; BB0_61: mul.f32 %f245, %f63, %f63; fma.rn.f32 %f246, %f62, %f62, %f245; fma.rn.f32 %f83, %f64, %f64, %f246; setp.neu.f32 %p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_63; bra.uni BB0_62; BB0_63: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_65; bra.uni BB0_64; BB0_65: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_67; bra.uni BB0_66; BB0_67: sub.f32 %f264, %f68, %f62; sub.f32 %f265, %f69, %f63; sub.f32 %f266, %f70, %f64; sub.f32 %f267, %f56, %f74; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f57, %f75; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f58, %f76; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f394, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f395, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f396, %f266, 0f3F2AAAAB, %f272; bra.uni BB0_69; BB0_60: sub.f32 %f394, %f1, %f62; sub.f32 %f395, %f2, %f63; sub.f32 %f396, %f3, %f64; bra.uni BB0_69; BB0_62: sub.f32 %f394, %f68, %f1; sub.f32 %f395, %f69, %f2; sub.f32 %f396, %f70, %f3; bra.uni BB0_69; BB0_64: mul.f32 %f247, %f62, 0fC0000000; fma.rn.f32 %f248, %f56, 0f3F000000, %f247; add.f32 %f249, %f63, %f63; mul.f32 %f250, %f57, 0f3F000000; sub.f32 %f251, %f250, %f249; add.f32 %f252, %f64, %f64; mul.f32 %f253, %f58, 0f3F000000; sub.f32 %f254, %f253, %f252; fma.rn.f32 %f394, %f1, 0f3FC00000, %f248; fma.rn.f32 %f395, %f2, 0f3FC00000, %f251; fma.rn.f32 %f396, %f3, 0f3FC00000, %f254; bra.uni BB0_69; BB0_66: mul.f32 %f255, %f74, 0fBF000000; fma.rn.f32 %f256, %f68, 0f40000000, %f255; mul.f32 %f257, %f75, 0fBF000000; fma.rn.f32 %f258, %f69, 0f40000000, %f257; mul.f32 %f259, %f76, 0fBF000000; fma.rn.f32 %f260, %f70, 0f40000000, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f394, %f256, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f395, %f258, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f396, %f260, 
%f263; BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p77, %rs3, 0; add.s32 %r39, %r3, -2; @%p77 bra BB0_71; rem.s32 %r104, %r39, %r57; add.s32 %r105, %r104, %r57; rem.s32 %r132, %r105, %r57; bra.uni BB0_72; BB0_71: mov.u32 %r106, 0; max.s32 %r132, %r39, %r106; BB0_72: setp.lt.s32 %p79, %r39, 0; mov.f32 %f105, 0f00000000; and.pred %p80, %p79, %p77; mov.f32 %f106, %f105; mov.f32 %f107, %f105; @%p80 bra BB0_74; mad.lo.s32 %r107, %r132, %r56, %r2; mad.lo.s32 %r108, %r107, %r55, %r1; mul.wide.s32 %rd48, %r108, 4; add.s64 %rd49, %rd3, %rd48; ld.global.nc.f32 %f105, [%rd49]; add.s64 %rd50, %rd2, %rd48; ld.global.nc.f32 %f106, [%rd50]; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f107, [%rd51]; BB0_74: add.s32 %r43, %r3, -1; @%p77 bra BB0_76; rem.s32 %r109, %r43, %r57; add.s32 %r110, %r109, %r57; rem.s32 %r133, %r110, %r57; bra.uni BB0_77; BB0_76: mov.u32 %r111, 0; max.s32 %r133, %r43, %r111; BB0_77: setp.lt.s32 %p82, %r43, 0; mov.f32 %f111, 0f00000000; and.pred %p84, %p82, %p77; mov.f32 %f112, %f111; mov.f32 %f113, %f111; @%p84 bra BB0_79; mad.lo.s32 %r112, %r133, %r56, %r2; mad.lo.s32 %r113, %r112, %r55, %r1; mul.wide.s32 %rd52, %r113, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f111, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f112, [%rd54]; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f113, [%rd55]; BB0_79: add.s32 %r47, %r3, 1; @%p77 bra BB0_81; rem.s32 %r114, %r47, %r57; add.s32 %r115, %r114, %r57; rem.s32 %r134, %r115, %r57; bra.uni BB0_82; BB0_81: add.s32 %r116, %r57, -1; min.s32 %r134, %r47, %r116; BB0_82: setp.ge.s32 %p86, %r47, %r57; mov.f32 %f117, 0f00000000; and.pred %p88, %p86, %p77; mov.f32 %f118, %f117; mov.f32 %f119, %f117; @%p88 bra BB0_84; mad.lo.s32 %r117, %r134, %r56, %r2; mad.lo.s32 %r118, %r117, %r55, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f117, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 %f118, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f119, [%rd59]; BB0_84: add.s32 
%r51, %r3, 2; @%p77 bra BB0_86; rem.s32 %r119, %r51, %r57; add.s32 %r120, %r119, %r57; rem.s32 %r135, %r120, %r57; bra.uni BB0_87; BB0_86: add.s32 %r121, %r57, -1; min.s32 %r135, %r51, %r121; BB0_87: setp.ge.s32 %p90, %r51, %r57; mov.f32 %f123, 0f00000000; and.pred %p92, %p90, %p77; mov.f32 %f124, %f123; mov.f32 %f125, %f123; @%p92 bra BB0_89; mad.lo.s32 %r122, %r135, %r56, %r2; mad.lo.s32 %r123, %r122, %r55, %r1; mul.wide.s32 %rd60, %r123, 4; add.s64 %rd61, %rd3, %rd60; ld.global.nc.f32 %f123, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f124, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f125, [%rd63]; BB0_89: mul.f32 %f288, %f118, %f118; fma.rn.f32 %f289, %f117, %f117, %f288; fma.rn.f32 %f126, %f119, %f119, %f289; setp.neu.f32 %p93, %f126, 0f00000000; @%p93 bra BB0_91; mul.f32 %f293, %f112, %f112; fma.rn.f32 %f294, %f111, %f111, %f293; fma.rn.f32 %f295, %f113, %f113, %f294; setp.eq.f32 %p94, %f295, 0f00000000; mov.f32 %f409, 0f00000000; mov.f32 %f410, %f409; mov.f32 %f411, %f409; @%p94 bra BB0_103; BB0_91: mul.f32 %f296, %f106, %f106; fma.rn.f32 %f297, %f105, %f105, %f296; fma.rn.f32 %f127, %f107, %f107, %f297; setp.neu.f32 %p95, %f127, 0f00000000; mul.f32 %f298, %f124, %f124; fma.rn.f32 %f299, %f123, %f123, %f298; fma.rn.f32 %f128, %f125, %f125, %f299; setp.neu.f32 %p96, %f128, 0f00000000; and.pred %p97, %p95, %p96; setp.eq.f32 %p98, %f126, 0f00000000; or.pred %p99, %p97, %p98; @%p99 bra BB0_93; mul.f32 %f300, %f112, %f112; fma.rn.f32 %f301, %f111, %f111, %f300; fma.rn.f32 %f302, %f113, %f113, %f301; setp.neu.f32 %p100, %f302, 0f00000000; @%p100 bra BB0_102; bra.uni BB0_93; BB0_102: sub.f32 %f331, %f117, %f111; mul.f32 %f409, %f331, 0f3F000000; sub.f32 %f332, %f118, %f112; mul.f32 %f410, %f332, 0f3F000000; sub.f32 %f333, %f119, %f113; mul.f32 %f411, %f333, 0f3F000000; bra.uni BB0_103; BB0_93: or.pred %p102, %p93, %p95; @%p102 bra BB0_95; bra.uni BB0_94; BB0_95: mul.f32 %f303, %f112, %f112; fma.rn.f32 %f304, %f111, %f111, %f303; fma.rn.f32 
%f132, %f113, %f113, %f304; setp.neu.f32 %p103, %f132, 0f00000000; or.pred %p105, %p103, %p96; @%p105 bra BB0_97; bra.uni BB0_96; BB0_97: setp.eq.f32 %p106, %f127, 0f00000000; or.pred %p107, %p106, %p93; @%p107 bra BB0_99; bra.uni BB0_98; BB0_99: setp.eq.f32 %p109, %f128, 0f00000000; or.pred %p110, %p109, %p103; @%p110 bra BB0_101; bra.uni BB0_100; BB0_101: sub.f32 %f322, %f117, %f111; sub.f32 %f323, %f118, %f112; sub.f32 %f324, %f119, %f113; sub.f32 %f325, %f105, %f123; mul.f32 %f326, %f325, 0f3DAAAAAB; sub.f32 %f327, %f106, %f124; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f107, %f125; mul.f32 %f330, %f329, 0f3DAAAAAB; fma.rn.f32 %f409, %f322, 0f3F2AAAAB, %f326; fma.rn.f32 %f410, %f323, 0f3F2AAAAB, %f328; fma.rn.f32 %f411, %f324, 0f3F2AAAAB, %f330; bra.uni BB0_103; BB0_94: sub.f32 %f409, %f1, %f111; sub.f32 %f410, %f2, %f112; sub.f32 %f411, %f3, %f113; bra.uni BB0_103; BB0_96: sub.f32 %f409, %f117, %f1; sub.f32 %f410, %f118, %f2; sub.f32 %f411, %f119, %f3; bra.uni BB0_103; BB0_98: mul.f32 %f305, %f111, 0fC0000000; fma.rn.f32 %f306, %f105, 0f3F000000, %f305; add.f32 %f307, %f112, %f112; mul.f32 %f308, %f106, 0f3F000000; sub.f32 %f309, %f308, %f307; add.f32 %f310, %f113, %f113; mul.f32 %f311, %f107, 0f3F000000; sub.f32 %f312, %f311, %f310; fma.rn.f32 %f409, %f1, 0f3FC00000, %f306; fma.rn.f32 %f410, %f2, 0f3FC00000, %f309; fma.rn.f32 %f411, %f3, 0f3FC00000, %f312; bra.uni BB0_103; BB0_100: mul.f32 %f313, %f123, 0fBF000000; fma.rn.f32 %f314, %f117, 0f40000000, %f313; mul.f32 %f315, %f124, 0fBF000000; fma.rn.f32 %f316, %f118, 0f40000000, %f315; mul.f32 %f317, %f125, 0fBF000000; fma.rn.f32 %f318, %f119, 0f40000000, %f317; mul.f32 %f319, %f1, 0f3FC00000; sub.f32 %f409, %f314, %f319; mul.f32 %f320, %f2, 0f3FC00000; sub.f32 %f410, %f316, %f320; mul.f32 %f321, %f3, 0f3FC00000; sub.f32 %f411, %f318, %f321; BB0_103: setp.eq.s64 %p111, %rd7, 0; @%p111 bra BB0_105; cvta.to.global.u64 %rd64, %rd7; add.s64 %rd66, %rd64, %rd12; ld.global.nc.f32 %f334, [%rd66]; mul.f32 
%f412, %f334, %f412; BB0_105: setp.eq.s64 %p112, %rd8, 0; @%p112 bra BB0_107; cvta.to.global.u64 %rd67, %rd8; add.s64 %rd69, %rd67, %rd12; ld.global.nc.f32 %f335, [%rd69]; mul.f32 %f413, %f335, %f413; BB0_107: cvta.to.global.u64 %rd70, %rd6; cvta.to.global.u64 %rd71, %rd5; cvta.to.global.u64 %rd72, %rd4; mul.f32 %f336, %f379, %f157; mul.f32 %f337, %f395, %f158; mul.f32 %f338, %f411, %f159; add.f32 %f339, %f412, %f412; mul.f32 %f340, %f1, %f339; add.f32 %f341, %f337, %f338; mul.f32 %f342, %f1, %f341; mul.f32 %f343, %f394, %f158; fma.rn.f32 %f344, %f2, %f343, %f342; mul.f32 %f345, %f409, %f159; fma.rn.f32 %f346, %f3, %f345, %f344; mul.f32 %f347, %f346, %f413; fma.rn.f32 %f348, %f336, %f340, %f347; add.s64 %rd74, %rd72, %rd12; st.global.f32 [%rd74], %f348; mul.f32 %f349, %f2, %f339; add.f32 %f350, %f336, %f338; mul.f32 %f351, %f2, %f350; mul.f32 %f352, %f380, %f157; fma.rn.f32 %f353, %f1, %f352, %f351; mul.f32 %f354, %f410, %f159; fma.rn.f32 %f355, %f3, %f354, %f353; mul.f32 %f356, %f355, %f413; fma.rn.f32 %f357, %f337, %f349, %f356; add.s64 %rd75, %rd71, %rd12; st.global.f32 [%rd75], %f357; mul.f32 %f358, %f3, %f339; mul.f32 %f359, %f396, %f158; mul.f32 %f360, %f2, %f359; mul.f32 %f361, %f381, %f157; fma.rn.f32 %f362, %f1, %f361, %f360; add.f32 %f363, %f336, %f337; fma.rn.f32 %f364, %f3, %f363, %f362; mul.f32 %f365, %f364, %f413; fma.rn.f32 %f366, %f338, %f358, %f365; add.s64 %rd76, %rd70, %rd12; st.global.f32 [%rd76], %f366; BB0_108: ret; } ` getmagnetoelasticforce_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 
getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<113>; .reg .b16 %rs<19>; .reg .f32 %f<414>; .reg .b32 %r<136>; .reg .b64 %rd<77>; ld.param.u64 %rd4, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_2]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_3]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd7, [getmagnetoelasticforce_param_6]; ld.param.f32 %f412, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd8, [getmagnetoelasticforce_param_8]; ld.param.f32 %f413, [getmagnetoelasticforce_param_9]; ld.param.f32 %f157, [getmagnetoelasticforce_param_10]; ld.param.f32 %f158, [getmagnetoelasticforce_param_11]; ld.param.f32 %f159, [getmagnetoelasticforce_param_12]; ld.param.u32 %r55, [getmagnetoelasticforce_param_13]; ld.param.u32 %r56, [getmagnetoelasticforce_param_14]; ld.param.u32 %r57, [getmagnetoelasticforce_param_15]; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; cvta.to.global.u64 %rd1, %rd11; cvta.to.global.u64 %rd2, %rd10; cvta.to.global.u64 %rd3, %rd9; mov.u32 %r58, %ntid.x; mov.u32 %r59, %ctaid.x; mov.u32 %r60, %tid.x; mad.lo.s32 %r1, %r58, %r59, %r60; mov.u32 %r61, %ntid.y; mov.u32 %r62, %ctaid.y; mov.u32 %r63, %tid.y; mad.lo.s32 %r2, %r61, %r62, %r63; mov.u32 %r64, %ntid.z; mov.u32 %r65, %ctaid.z; mov.u32 %r66, %tid.z; mad.lo.s32 %r3, %r64, %r65, %r66; setp.ge.s32 %p4, %r2, %r56; setp.ge.s32 %p5, %r1, %r55; or.pred %p6, %p4, %p5; setp.ge.s32 %p7, %r3, %r57; or.pred %p8, %p6, %p7; @%p8 bra BB0_108; mul.lo.s32 %r4, %r3, %r56; add.s32 %r67, 
%r4, %r2; mul.lo.s32 %r5, %r67, %r55; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd12, %r6, 4; add.s64 %rd13, %rd3, %rd12; ld.global.nc.f32 %f1, [%rd13]; add.s64 %rd14, %rd2, %rd12; ld.global.nc.f32 %f2, [%rd14]; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f3, [%rd15]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r7, %r1, -2; @%p9 bra BB0_3; rem.s32 %r68, %r7, %r55; add.s32 %r69, %r68, %r55; rem.s32 %r124, %r69, %r55; bra.uni BB0_4; BB0_3: mov.u32 %r70, 0; max.s32 %r124, %r7, %r70; BB0_4: setp.lt.s32 %p11, %r7, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p12 bra BB0_6; add.s32 %r71, %r124, %r5; mul.wide.s32 %rd16, %r71, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f7, [%rd17]; add.s64 %rd18, %rd2, %rd16; ld.global.nc.f32 %f8, [%rd18]; add.s64 %rd19, %rd1, %rd16; ld.global.nc.f32 %f9, [%rd19]; BB0_6: add.s32 %r11, %r1, -1; @%p9 bra BB0_8; rem.s32 %r72, %r11, %r55; add.s32 %r73, %r72, %r55; rem.s32 %r125, %r73, %r55; bra.uni BB0_9; BB0_8: mov.u32 %r74, 0; max.s32 %r125, %r11, %r74; BB0_9: setp.lt.s32 %p14, %r11, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_11; add.s32 %r75, %r125, %r5; mul.wide.s32 %rd20, %r75, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f13, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f14, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f15, [%rd23]; BB0_11: add.s32 %r15, %r1, 1; @%p9 bra BB0_13; rem.s32 %r76, %r15, %r55; add.s32 %r77, %r76, %r55; rem.s32 %r126, %r77, %r55; bra.uni BB0_14; BB0_13: add.s32 %r78, %r55, -1; min.s32 %r126, %r15, %r78; BB0_14: setp.ge.s32 %p18, %r15, %r55; mov.f32 %f19, 0f00000000; and.pred %p20, %p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_16; add.s32 %r79, %r126, %r5; mul.wide.s32 %rd24, %r79, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f19, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.nc.f32 %f20, [%rd26]; add.s64 %rd27, %rd1, %rd24; 
ld.global.nc.f32 %f21, [%rd27]; BB0_16: add.s32 %r19, %r1, 2; @%p9 bra BB0_18; rem.s32 %r80, %r19, %r55; add.s32 %r81, %r80, %r55; rem.s32 %r127, %r81, %r55; bra.uni BB0_19; BB0_18: add.s32 %r82, %r55, -1; min.s32 %r127, %r19, %r82; BB0_19: setp.ge.s32 %p22, %r19, %r55; mov.f32 %f25, 0f00000000; and.pred %p24, %p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_21; add.s32 %r83, %r127, %r5; mul.wide.s32 %rd28, %r83, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f25, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f26, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f27, [%rd31]; BB0_21: mul.f32 %f172, %f20, %f20; fma.rn.f32 %f173, %f19, %f19, %f172; fma.rn.f32 %f28, %f21, %f21, %f173; setp.neu.f32 %p25, %f28, 0f00000000; @%p25 bra BB0_23; mul.f32 %f177, %f14, %f14; fma.rn.f32 %f178, %f13, %f13, %f177; fma.rn.f32 %f179, %f15, %f15, %f178; setp.eq.f32 %p26, %f179, 0f00000000; mov.f32 %f379, 0f00000000; mov.f32 %f380, %f379; mov.f32 %f381, %f379; @%p26 bra BB0_35; BB0_23: mul.f32 %f180, %f8, %f8; fma.rn.f32 %f181, %f7, %f7, %f180; fma.rn.f32 %f29, %f9, %f9, %f181; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f182, %f26, %f26; fma.rn.f32 %f183, %f25, %f25, %f182; fma.rn.f32 %f30, %f27, %f27, %f183; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_25; mul.f32 %f184, %f14, %f14; fma.rn.f32 %f185, %f13, %f13, %f184; fma.rn.f32 %f186, %f15, %f15, %f185; setp.neu.f32 %p32, %f186, 0f00000000; @%p32 bra BB0_34; bra.uni BB0_25; BB0_34: sub.f32 %f215, %f19, %f13; mul.f32 %f379, %f215, 0f3F000000; sub.f32 %f216, %f20, %f14; mul.f32 %f380, %f216, 0f3F000000; sub.f32 %f217, %f21, %f15; mul.f32 %f381, %f217, 0f3F000000; bra.uni BB0_35; BB0_25: or.pred %p34, %p25, %p27; @%p34 bra BB0_27; bra.uni BB0_26; BB0_27: mul.f32 %f187, %f14, %f14; fma.rn.f32 %f188, %f13, %f13, %f187; fma.rn.f32 %f34, %f15, %f15, %f188; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, 
%p35, %p28; @%p37 bra BB0_29; bra.uni BB0_28; BB0_29: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_31; bra.uni BB0_30; BB0_31: setp.eq.f32 %p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_33; bra.uni BB0_32; BB0_33: sub.f32 %f206, %f19, %f13; sub.f32 %f207, %f20, %f14; sub.f32 %f208, %f21, %f15; sub.f32 %f209, %f7, %f25; mul.f32 %f210, %f209, 0f3DAAAAAB; sub.f32 %f211, %f8, %f26; mul.f32 %f212, %f211, 0f3DAAAAAB; sub.f32 %f213, %f9, %f27; mul.f32 %f214, %f213, 0f3DAAAAAB; fma.rn.f32 %f379, %f206, 0f3F2AAAAB, %f210; fma.rn.f32 %f380, %f207, 0f3F2AAAAB, %f212; fma.rn.f32 %f381, %f208, 0f3F2AAAAB, %f214; bra.uni BB0_35; BB0_26: sub.f32 %f379, %f1, %f13; sub.f32 %f380, %f2, %f14; sub.f32 %f381, %f3, %f15; bra.uni BB0_35; BB0_28: sub.f32 %f379, %f19, %f1; sub.f32 %f380, %f20, %f2; sub.f32 %f381, %f21, %f3; bra.uni BB0_35; BB0_30: mul.f32 %f189, %f13, 0fC0000000; fma.rn.f32 %f190, %f7, 0f3F000000, %f189; add.f32 %f191, %f14, %f14; mul.f32 %f192, %f8, 0f3F000000; sub.f32 %f193, %f192, %f191; add.f32 %f194, %f15, %f15; mul.f32 %f195, %f9, 0f3F000000; sub.f32 %f196, %f195, %f194; fma.rn.f32 %f379, %f1, 0f3FC00000, %f190; fma.rn.f32 %f380, %f2, 0f3FC00000, %f193; fma.rn.f32 %f381, %f3, 0f3FC00000, %f196; bra.uni BB0_35; BB0_32: mul.f32 %f197, %f25, 0fBF000000; fma.rn.f32 %f198, %f19, 0f40000000, %f197; mul.f32 %f199, %f26, 0fBF000000; fma.rn.f32 %f200, %f20, 0f40000000, %f199; mul.f32 %f201, %f27, 0fBF000000; fma.rn.f32 %f202, %f21, 0f40000000, %f201; mul.f32 %f203, %f1, 0f3FC00000; sub.f32 %f379, %f198, %f203; mul.f32 %f204, %f2, 0f3FC00000; sub.f32 %f380, %f200, %f204; mul.f32 %f205, %f3, 0f3FC00000; sub.f32 %f381, %f202, %f205; BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra BB0_37; rem.s32 %r84, %r23, %r56; add.s32 %r85, %r84, %r56; rem.s32 %r128, %r85, %r56; bra.uni BB0_38; BB0_37: mov.u32 %r86, 0; max.s32 %r128, %r23, %r86; BB0_38: setp.lt.s32 %p45, %r23, 0; mov.f32 %f56, 
0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_40; add.s32 %r87, %r128, %r4; mad.lo.s32 %r88, %r87, %r55, %r1; mul.wide.s32 %rd32, %r88, 4; add.s64 %rd33, %rd3, %rd32; ld.global.nc.f32 %f56, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.nc.f32 %f57, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f58, [%rd35]; BB0_40: add.s32 %r27, %r2, -1; @%p43 bra BB0_42; rem.s32 %r89, %r27, %r56; add.s32 %r90, %r89, %r56; rem.s32 %r129, %r90, %r56; bra.uni BB0_43; BB0_42: mov.u32 %r91, 0; max.s32 %r129, %r27, %r91; BB0_43: setp.lt.s32 %p48, %r27, 0; mov.f32 %f62, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_45; add.s32 %r92, %r129, %r4; mad.lo.s32 %r93, %r92, %r55, %r1; mul.wide.s32 %rd36, %r93, 4; add.s64 %rd37, %rd3, %rd36; ld.global.nc.f32 %f62, [%rd37]; add.s64 %rd38, %rd2, %rd36; ld.global.nc.f32 %f63, [%rd38]; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f64, [%rd39]; BB0_45: add.s32 %r31, %r2, 1; @%p43 bra BB0_47; rem.s32 %r94, %r31, %r56; add.s32 %r95, %r94, %r56; rem.s32 %r130, %r95, %r56; bra.uni BB0_48; BB0_47: add.s32 %r96, %r56, -1; min.s32 %r130, %r31, %r96; BB0_48: setp.ge.s32 %p52, %r31, %r56; mov.f32 %f68, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; @%p54 bra BB0_50; add.s32 %r97, %r130, %r4; mad.lo.s32 %r98, %r97, %r55, %r1; mul.wide.s32 %rd40, %r98, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f68, [%rd41]; add.s64 %rd42, %rd2, %rd40; ld.global.nc.f32 %f69, [%rd42]; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f70, [%rd43]; BB0_50: add.s32 %r35, %r2, 2; @%p43 bra BB0_52; rem.s32 %r99, %r35, %r56; add.s32 %r100, %r99, %r56; rem.s32 %r131, %r100, %r56; bra.uni BB0_53; BB0_52: add.s32 %r101, %r56, -1; min.s32 %r131, %r35, %r101; BB0_53: setp.ge.s32 %p56, %r35, %r56; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p58 bra BB0_55; add.s32 %r102, %r131, %r4; mad.lo.s32 %r103, 
%r102, %r55, %r1; mul.wide.s32 %rd44, %r103, 4; add.s64 %rd45, %rd3, %rd44; ld.global.nc.f32 %f74, [%rd45]; add.s64 %rd46, %rd2, %rd44; ld.global.nc.f32 %f75, [%rd46]; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f76, [%rd47]; BB0_55: mul.f32 %f230, %f69, %f69; fma.rn.f32 %f231, %f68, %f68, %f230; fma.rn.f32 %f77, %f70, %f70, %f231; setp.neu.f32 %p59, %f77, 0f00000000; @%p59 bra BB0_57; mul.f32 %f235, %f63, %f63; fma.rn.f32 %f236, %f62, %f62, %f235; fma.rn.f32 %f237, %f64, %f64, %f236; setp.eq.f32 %p60, %f237, 0f00000000; mov.f32 %f394, 0f00000000; mov.f32 %f395, %f394; mov.f32 %f396, %f394; @%p60 bra BB0_69; BB0_57: mul.f32 %f238, %f57, %f57; fma.rn.f32 %f239, %f56, %f56, %f238; fma.rn.f32 %f78, %f58, %f58, %f239; setp.neu.f32 %p61, %f78, 0f00000000; mul.f32 %f240, %f75, %f75; fma.rn.f32 %f241, %f74, %f74, %f240; fma.rn.f32 %f79, %f76, %f76, %f241; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, %f77, 0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_59; mul.f32 %f242, %f63, %f63; fma.rn.f32 %f243, %f62, %f62, %f242; fma.rn.f32 %f244, %f64, %f64, %f243; setp.neu.f32 %p66, %f244, 0f00000000; @%p66 bra BB0_68; bra.uni BB0_59; BB0_68: sub.f32 %f273, %f68, %f62; mul.f32 %f394, %f273, 0f3F000000; sub.f32 %f274, %f69, %f63; mul.f32 %f395, %f274, 0f3F000000; sub.f32 %f275, %f70, %f64; mul.f32 %f396, %f275, 0f3F000000; bra.uni BB0_69; BB0_59: or.pred %p68, %p59, %p61; @%p68 bra BB0_61; bra.uni BB0_60; BB0_61: mul.f32 %f245, %f63, %f63; fma.rn.f32 %f246, %f62, %f62, %f245; fma.rn.f32 %f83, %f64, %f64, %f246; setp.neu.f32 %p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_63; bra.uni BB0_62; BB0_63: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_65; bra.uni BB0_64; BB0_65: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_67; bra.uni BB0_66; BB0_67: sub.f32 %f264, %f68, %f62; sub.f32 %f265, %f69, %f63; sub.f32 %f266, %f70, %f64; sub.f32 %f267, %f56, %f74; mul.f32 %f268, 
%f267, 0f3DAAAAAB; sub.f32 %f269, %f57, %f75; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f58, %f76; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f394, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f395, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f396, %f266, 0f3F2AAAAB, %f272; bra.uni BB0_69; BB0_60: sub.f32 %f394, %f1, %f62; sub.f32 %f395, %f2, %f63; sub.f32 %f396, %f3, %f64; bra.uni BB0_69; BB0_62: sub.f32 %f394, %f68, %f1; sub.f32 %f395, %f69, %f2; sub.f32 %f396, %f70, %f3; bra.uni BB0_69; BB0_64: mul.f32 %f247, %f62, 0fC0000000; fma.rn.f32 %f248, %f56, 0f3F000000, %f247; add.f32 %f249, %f63, %f63; mul.f32 %f250, %f57, 0f3F000000; sub.f32 %f251, %f250, %f249; add.f32 %f252, %f64, %f64; mul.f32 %f253, %f58, 0f3F000000; sub.f32 %f254, %f253, %f252; fma.rn.f32 %f394, %f1, 0f3FC00000, %f248; fma.rn.f32 %f395, %f2, 0f3FC00000, %f251; fma.rn.f32 %f396, %f3, 0f3FC00000, %f254; bra.uni BB0_69; BB0_66: mul.f32 %f255, %f74, 0fBF000000; fma.rn.f32 %f256, %f68, 0f40000000, %f255; mul.f32 %f257, %f75, 0fBF000000; fma.rn.f32 %f258, %f69, 0f40000000, %f257; mul.f32 %f259, %f76, 0fBF000000; fma.rn.f32 %f260, %f70, 0f40000000, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f394, %f256, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f395, %f258, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f396, %f260, %f263; BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p77, %rs3, 0; add.s32 %r39, %r3, -2; @%p77 bra BB0_71; rem.s32 %r104, %r39, %r57; add.s32 %r105, %r104, %r57; rem.s32 %r132, %r105, %r57; bra.uni BB0_72; BB0_71: mov.u32 %r106, 0; max.s32 %r132, %r39, %r106; BB0_72: setp.lt.s32 %p79, %r39, 0; mov.f32 %f105, 0f00000000; and.pred %p80, %p79, %p77; mov.f32 %f106, %f105; mov.f32 %f107, %f105; @%p80 bra BB0_74; mad.lo.s32 %r107, %r132, %r56, %r2; mad.lo.s32 %r108, %r107, %r55, %r1; mul.wide.s32 %rd48, %r108, 4; add.s64 %rd49, %rd3, %rd48; ld.global.nc.f32 %f105, [%rd49]; add.s64 %rd50, %rd2, %rd48; ld.global.nc.f32 %f106, [%rd50]; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f107, 
[%rd51]; BB0_74: add.s32 %r43, %r3, -1; @%p77 bra BB0_76; rem.s32 %r109, %r43, %r57; add.s32 %r110, %r109, %r57; rem.s32 %r133, %r110, %r57; bra.uni BB0_77; BB0_76: mov.u32 %r111, 0; max.s32 %r133, %r43, %r111; BB0_77: setp.lt.s32 %p82, %r43, 0; mov.f32 %f111, 0f00000000; and.pred %p84, %p82, %p77; mov.f32 %f112, %f111; mov.f32 %f113, %f111; @%p84 bra BB0_79; mad.lo.s32 %r112, %r133, %r56, %r2; mad.lo.s32 %r113, %r112, %r55, %r1; mul.wide.s32 %rd52, %r113, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f111, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f112, [%rd54]; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f113, [%rd55]; BB0_79: add.s32 %r47, %r3, 1; @%p77 bra BB0_81; rem.s32 %r114, %r47, %r57; add.s32 %r115, %r114, %r57; rem.s32 %r134, %r115, %r57; bra.uni BB0_82; BB0_81: add.s32 %r116, %r57, -1; min.s32 %r134, %r47, %r116; BB0_82: setp.ge.s32 %p86, %r47, %r57; mov.f32 %f117, 0f00000000; and.pred %p88, %p86, %p77; mov.f32 %f118, %f117; mov.f32 %f119, %f117; @%p88 bra BB0_84; mad.lo.s32 %r117, %r134, %r56, %r2; mad.lo.s32 %r118, %r117, %r55, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f117, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 %f118, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f119, [%rd59]; BB0_84: add.s32 %r51, %r3, 2; @%p77 bra BB0_86; rem.s32 %r119, %r51, %r57; add.s32 %r120, %r119, %r57; rem.s32 %r135, %r120, %r57; bra.uni BB0_87; BB0_86: add.s32 %r121, %r57, -1; min.s32 %r135, %r51, %r121; BB0_87: setp.ge.s32 %p90, %r51, %r57; mov.f32 %f123, 0f00000000; and.pred %p92, %p90, %p77; mov.f32 %f124, %f123; mov.f32 %f125, %f123; @%p92 bra BB0_89; mad.lo.s32 %r122, %r135, %r56, %r2; mad.lo.s32 %r123, %r122, %r55, %r1; mul.wide.s32 %rd60, %r123, 4; add.s64 %rd61, %rd3, %rd60; ld.global.nc.f32 %f123, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f124, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f125, [%rd63]; BB0_89: mul.f32 %f288, %f118, %f118; fma.rn.f32 %f289, 
%f117, %f117, %f288; fma.rn.f32 %f126, %f119, %f119, %f289; setp.neu.f32 %p93, %f126, 0f00000000; @%p93 bra BB0_91; mul.f32 %f293, %f112, %f112; fma.rn.f32 %f294, %f111, %f111, %f293; fma.rn.f32 %f295, %f113, %f113, %f294; setp.eq.f32 %p94, %f295, 0f00000000; mov.f32 %f409, 0f00000000; mov.f32 %f410, %f409; mov.f32 %f411, %f409; @%p94 bra BB0_103; BB0_91: mul.f32 %f296, %f106, %f106; fma.rn.f32 %f297, %f105, %f105, %f296; fma.rn.f32 %f127, %f107, %f107, %f297; setp.neu.f32 %p95, %f127, 0f00000000; mul.f32 %f298, %f124, %f124; fma.rn.f32 %f299, %f123, %f123, %f298; fma.rn.f32 %f128, %f125, %f125, %f299; setp.neu.f32 %p96, %f128, 0f00000000; and.pred %p97, %p95, %p96; setp.eq.f32 %p98, %f126, 0f00000000; or.pred %p99, %p97, %p98; @%p99 bra BB0_93; mul.f32 %f300, %f112, %f112; fma.rn.f32 %f301, %f111, %f111, %f300; fma.rn.f32 %f302, %f113, %f113, %f301; setp.neu.f32 %p100, %f302, 0f00000000; @%p100 bra BB0_102; bra.uni BB0_93; BB0_102: sub.f32 %f331, %f117, %f111; mul.f32 %f409, %f331, 0f3F000000; sub.f32 %f332, %f118, %f112; mul.f32 %f410, %f332, 0f3F000000; sub.f32 %f333, %f119, %f113; mul.f32 %f411, %f333, 0f3F000000; bra.uni BB0_103; BB0_93: or.pred %p102, %p93, %p95; @%p102 bra BB0_95; bra.uni BB0_94; BB0_95: mul.f32 %f303, %f112, %f112; fma.rn.f32 %f304, %f111, %f111, %f303; fma.rn.f32 %f132, %f113, %f113, %f304; setp.neu.f32 %p103, %f132, 0f00000000; or.pred %p105, %p103, %p96; @%p105 bra BB0_97; bra.uni BB0_96; BB0_97: setp.eq.f32 %p106, %f127, 0f00000000; or.pred %p107, %p106, %p93; @%p107 bra BB0_99; bra.uni BB0_98; BB0_99: setp.eq.f32 %p109, %f128, 0f00000000; or.pred %p110, %p109, %p103; @%p110 bra BB0_101; bra.uni BB0_100; BB0_101: sub.f32 %f322, %f117, %f111; sub.f32 %f323, %f118, %f112; sub.f32 %f324, %f119, %f113; sub.f32 %f325, %f105, %f123; mul.f32 %f326, %f325, 0f3DAAAAAB; sub.f32 %f327, %f106, %f124; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f107, %f125; mul.f32 %f330, %f329, 0f3DAAAAAB; fma.rn.f32 %f409, %f322, 0f3F2AAAAB, %f326; 
fma.rn.f32 %f410, %f323, 0f3F2AAAAB, %f328; fma.rn.f32 %f411, %f324, 0f3F2AAAAB, %f330; bra.uni BB0_103; BB0_94: sub.f32 %f409, %f1, %f111; sub.f32 %f410, %f2, %f112; sub.f32 %f411, %f3, %f113; bra.uni BB0_103; BB0_96: sub.f32 %f409, %f117, %f1; sub.f32 %f410, %f118, %f2; sub.f32 %f411, %f119, %f3; bra.uni BB0_103; BB0_98: mul.f32 %f305, %f111, 0fC0000000; fma.rn.f32 %f306, %f105, 0f3F000000, %f305; add.f32 %f307, %f112, %f112; mul.f32 %f308, %f106, 0f3F000000; sub.f32 %f309, %f308, %f307; add.f32 %f310, %f113, %f113; mul.f32 %f311, %f107, 0f3F000000; sub.f32 %f312, %f311, %f310; fma.rn.f32 %f409, %f1, 0f3FC00000, %f306; fma.rn.f32 %f410, %f2, 0f3FC00000, %f309; fma.rn.f32 %f411, %f3, 0f3FC00000, %f312; bra.uni BB0_103; BB0_100: mul.f32 %f313, %f123, 0fBF000000; fma.rn.f32 %f314, %f117, 0f40000000, %f313; mul.f32 %f315, %f124, 0fBF000000; fma.rn.f32 %f316, %f118, 0f40000000, %f315; mul.f32 %f317, %f125, 0fBF000000; fma.rn.f32 %f318, %f119, 0f40000000, %f317; mul.f32 %f319, %f1, 0f3FC00000; sub.f32 %f409, %f314, %f319; mul.f32 %f320, %f2, 0f3FC00000; sub.f32 %f410, %f316, %f320; mul.f32 %f321, %f3, 0f3FC00000; sub.f32 %f411, %f318, %f321; BB0_103: setp.eq.s64 %p111, %rd7, 0; @%p111 bra BB0_105; cvta.to.global.u64 %rd64, %rd7; add.s64 %rd66, %rd64, %rd12; ld.global.nc.f32 %f334, [%rd66]; mul.f32 %f412, %f334, %f412; BB0_105: setp.eq.s64 %p112, %rd8, 0; @%p112 bra BB0_107; cvta.to.global.u64 %rd67, %rd8; add.s64 %rd69, %rd67, %rd12; ld.global.nc.f32 %f335, [%rd69]; mul.f32 %f413, %f335, %f413; BB0_107: cvta.to.global.u64 %rd70, %rd6; cvta.to.global.u64 %rd71, %rd5; cvta.to.global.u64 %rd72, %rd4; mul.f32 %f336, %f379, %f157; mul.f32 %f337, %f395, %f158; mul.f32 %f338, %f411, %f159; add.f32 %f339, %f412, %f412; mul.f32 %f340, %f1, %f339; add.f32 %f341, %f337, %f338; mul.f32 %f342, %f1, %f341; mul.f32 %f343, %f394, %f158; fma.rn.f32 %f344, %f2, %f343, %f342; mul.f32 %f345, %f409, %f159; fma.rn.f32 %f346, %f3, %f345, %f344; mul.f32 %f347, %f346, %f413; fma.rn.f32 %f348, 
%f336, %f340, %f347; add.s64 %rd74, %rd72, %rd12; st.global.f32 [%rd74], %f348; mul.f32 %f349, %f2, %f339; add.f32 %f350, %f336, %f338; mul.f32 %f351, %f2, %f350; mul.f32 %f352, %f380, %f157; fma.rn.f32 %f353, %f1, %f352, %f351; mul.f32 %f354, %f410, %f159; fma.rn.f32 %f355, %f3, %f354, %f353; mul.f32 %f356, %f355, %f413; fma.rn.f32 %f357, %f337, %f349, %f356; add.s64 %rd75, %rd71, %rd12; st.global.f32 [%rd75], %f357; mul.f32 %f358, %f3, %f339; mul.f32 %f359, %f396, %f158; mul.f32 %f360, %f2, %f359; mul.f32 %f361, %f381, %f157; fma.rn.f32 %f362, %f1, %f361, %f360; add.f32 %f363, %f336, %f337; fma.rn.f32 %f364, %f3, %f363, %f362; mul.f32 %f365, %f364, %f413; fma.rn.f32 %f366, %f338, %f358, %f365; add.s64 %rd76, %rd70, %rd12; st.global.f32 [%rd76], %f366; BB0_108: ret; } ` getmagnetoelasticforce_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<113>; .reg .b16 %rs<19>; .reg .f32 %f<414>; .reg .b32 %r<136>; .reg .b64 %rd<77>; ld.param.u64 %rd4, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_2]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_3]; 
ld.param.u64 %rd10, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd7, [getmagnetoelasticforce_param_6]; ld.param.f32 %f412, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd8, [getmagnetoelasticforce_param_8]; ld.param.f32 %f413, [getmagnetoelasticforce_param_9]; ld.param.f32 %f157, [getmagnetoelasticforce_param_10]; ld.param.f32 %f158, [getmagnetoelasticforce_param_11]; ld.param.f32 %f159, [getmagnetoelasticforce_param_12]; ld.param.u32 %r55, [getmagnetoelasticforce_param_13]; ld.param.u32 %r56, [getmagnetoelasticforce_param_14]; ld.param.u32 %r57, [getmagnetoelasticforce_param_15]; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; cvta.to.global.u64 %rd1, %rd11; cvta.to.global.u64 %rd2, %rd10; cvta.to.global.u64 %rd3, %rd9; mov.u32 %r58, %ntid.x; mov.u32 %r59, %ctaid.x; mov.u32 %r60, %tid.x; mad.lo.s32 %r1, %r58, %r59, %r60; mov.u32 %r61, %ntid.y; mov.u32 %r62, %ctaid.y; mov.u32 %r63, %tid.y; mad.lo.s32 %r2, %r61, %r62, %r63; mov.u32 %r64, %ntid.z; mov.u32 %r65, %ctaid.z; mov.u32 %r66, %tid.z; mad.lo.s32 %r3, %r64, %r65, %r66; setp.ge.s32 %p4, %r2, %r56; setp.ge.s32 %p5, %r1, %r55; or.pred %p6, %p4, %p5; setp.ge.s32 %p7, %r3, %r57; or.pred %p8, %p6, %p7; @%p8 bra BB0_108; mul.lo.s32 %r4, %r3, %r56; add.s32 %r67, %r4, %r2; mul.lo.s32 %r5, %r67, %r55; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd12, %r6, 4; add.s64 %rd13, %rd3, %rd12; ld.global.nc.f32 %f1, [%rd13]; add.s64 %rd14, %rd2, %rd12; ld.global.nc.f32 %f2, [%rd14]; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f3, [%rd15]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r7, %r1, -2; @%p9 bra BB0_3; rem.s32 %r68, %r7, %r55; add.s32 %r69, %r68, %r55; rem.s32 %r124, %r69, %r55; bra.uni BB0_4; BB0_3: mov.u32 %r70, 0; max.s32 %r124, %r7, %r70; BB0_4: setp.lt.s32 %p11, %r7, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p12 bra BB0_6; add.s32 %r71, %r124, %r5; mul.wide.s32 %rd16, %r71, 4; add.s64 %rd17, 
%rd3, %rd16; ld.global.nc.f32 %f7, [%rd17]; add.s64 %rd18, %rd2, %rd16; ld.global.nc.f32 %f8, [%rd18]; add.s64 %rd19, %rd1, %rd16; ld.global.nc.f32 %f9, [%rd19]; BB0_6: add.s32 %r11, %r1, -1; @%p9 bra BB0_8; rem.s32 %r72, %r11, %r55; add.s32 %r73, %r72, %r55; rem.s32 %r125, %r73, %r55; bra.uni BB0_9; BB0_8: mov.u32 %r74, 0; max.s32 %r125, %r11, %r74; BB0_9: setp.lt.s32 %p14, %r11, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_11; add.s32 %r75, %r125, %r5; mul.wide.s32 %rd20, %r75, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f13, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f14, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f15, [%rd23]; BB0_11: add.s32 %r15, %r1, 1; @%p9 bra BB0_13; rem.s32 %r76, %r15, %r55; add.s32 %r77, %r76, %r55; rem.s32 %r126, %r77, %r55; bra.uni BB0_14; BB0_13: add.s32 %r78, %r55, -1; min.s32 %r126, %r15, %r78; BB0_14: setp.ge.s32 %p18, %r15, %r55; mov.f32 %f19, 0f00000000; and.pred %p20, %p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_16; add.s32 %r79, %r126, %r5; mul.wide.s32 %rd24, %r79, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f19, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.nc.f32 %f20, [%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f21, [%rd27]; BB0_16: add.s32 %r19, %r1, 2; @%p9 bra BB0_18; rem.s32 %r80, %r19, %r55; add.s32 %r81, %r80, %r55; rem.s32 %r127, %r81, %r55; bra.uni BB0_19; BB0_18: add.s32 %r82, %r55, -1; min.s32 %r127, %r19, %r82; BB0_19: setp.ge.s32 %p22, %r19, %r55; mov.f32 %f25, 0f00000000; and.pred %p24, %p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_21; add.s32 %r83, %r127, %r5; mul.wide.s32 %rd28, %r83, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f25, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f26, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f27, [%rd31]; BB0_21: mul.f32 %f172, %f20, %f20; fma.rn.f32 %f173, %f19, %f19, %f172; fma.rn.f32 %f28, %f21, 
%f21, %f173; setp.neu.f32 %p25, %f28, 0f00000000; @%p25 bra BB0_23; mul.f32 %f177, %f14, %f14; fma.rn.f32 %f178, %f13, %f13, %f177; fma.rn.f32 %f179, %f15, %f15, %f178; setp.eq.f32 %p26, %f179, 0f00000000; mov.f32 %f379, 0f00000000; mov.f32 %f380, %f379; mov.f32 %f381, %f379; @%p26 bra BB0_35; BB0_23: mul.f32 %f180, %f8, %f8; fma.rn.f32 %f181, %f7, %f7, %f180; fma.rn.f32 %f29, %f9, %f9, %f181; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f182, %f26, %f26; fma.rn.f32 %f183, %f25, %f25, %f182; fma.rn.f32 %f30, %f27, %f27, %f183; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_25; mul.f32 %f184, %f14, %f14; fma.rn.f32 %f185, %f13, %f13, %f184; fma.rn.f32 %f186, %f15, %f15, %f185; setp.neu.f32 %p32, %f186, 0f00000000; @%p32 bra BB0_34; bra.uni BB0_25; BB0_34: sub.f32 %f215, %f19, %f13; mul.f32 %f379, %f215, 0f3F000000; sub.f32 %f216, %f20, %f14; mul.f32 %f380, %f216, 0f3F000000; sub.f32 %f217, %f21, %f15; mul.f32 %f381, %f217, 0f3F000000; bra.uni BB0_35; BB0_25: or.pred %p34, %p25, %p27; @%p34 bra BB0_27; bra.uni BB0_26; BB0_27: mul.f32 %f187, %f14, %f14; fma.rn.f32 %f188, %f13, %f13, %f187; fma.rn.f32 %f34, %f15, %f15, %f188; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, %p35, %p28; @%p37 bra BB0_29; bra.uni BB0_28; BB0_29: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_31; bra.uni BB0_30; BB0_31: setp.eq.f32 %p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_33; bra.uni BB0_32; BB0_33: sub.f32 %f206, %f19, %f13; sub.f32 %f207, %f20, %f14; sub.f32 %f208, %f21, %f15; sub.f32 %f209, %f7, %f25; mul.f32 %f210, %f209, 0f3DAAAAAB; sub.f32 %f211, %f8, %f26; mul.f32 %f212, %f211, 0f3DAAAAAB; sub.f32 %f213, %f9, %f27; mul.f32 %f214, %f213, 0f3DAAAAAB; fma.rn.f32 %f379, %f206, 0f3F2AAAAB, %f210; fma.rn.f32 %f380, %f207, 0f3F2AAAAB, %f212; fma.rn.f32 %f381, %f208, 0f3F2AAAAB, %f214; bra.uni BB0_35; BB0_26: sub.f32 %f379, %f1, %f13; sub.f32 
%f380, %f2, %f14; sub.f32 %f381, %f3, %f15; bra.uni BB0_35; BB0_28: sub.f32 %f379, %f19, %f1; sub.f32 %f380, %f20, %f2; sub.f32 %f381, %f21, %f3; bra.uni BB0_35; BB0_30: mul.f32 %f189, %f13, 0fC0000000; fma.rn.f32 %f190, %f7, 0f3F000000, %f189; add.f32 %f191, %f14, %f14; mul.f32 %f192, %f8, 0f3F000000; sub.f32 %f193, %f192, %f191; add.f32 %f194, %f15, %f15; mul.f32 %f195, %f9, 0f3F000000; sub.f32 %f196, %f195, %f194; fma.rn.f32 %f379, %f1, 0f3FC00000, %f190; fma.rn.f32 %f380, %f2, 0f3FC00000, %f193; fma.rn.f32 %f381, %f3, 0f3FC00000, %f196; bra.uni BB0_35; BB0_32: mul.f32 %f197, %f25, 0fBF000000; fma.rn.f32 %f198, %f19, 0f40000000, %f197; mul.f32 %f199, %f26, 0fBF000000; fma.rn.f32 %f200, %f20, 0f40000000, %f199; mul.f32 %f201, %f27, 0fBF000000; fma.rn.f32 %f202, %f21, 0f40000000, %f201; mul.f32 %f203, %f1, 0f3FC00000; sub.f32 %f379, %f198, %f203; mul.f32 %f204, %f2, 0f3FC00000; sub.f32 %f380, %f200, %f204; mul.f32 %f205, %f3, 0f3FC00000; sub.f32 %f381, %f202, %f205; BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra BB0_37; rem.s32 %r84, %r23, %r56; add.s32 %r85, %r84, %r56; rem.s32 %r128, %r85, %r56; bra.uni BB0_38; BB0_37: mov.u32 %r86, 0; max.s32 %r128, %r23, %r86; BB0_38: setp.lt.s32 %p45, %r23, 0; mov.f32 %f56, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_40; add.s32 %r87, %r128, %r4; mad.lo.s32 %r88, %r87, %r55, %r1; mul.wide.s32 %rd32, %r88, 4; add.s64 %rd33, %rd3, %rd32; ld.global.nc.f32 %f56, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.nc.f32 %f57, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f58, [%rd35]; BB0_40: add.s32 %r27, %r2, -1; @%p43 bra BB0_42; rem.s32 %r89, %r27, %r56; add.s32 %r90, %r89, %r56; rem.s32 %r129, %r90, %r56; bra.uni BB0_43; BB0_42: mov.u32 %r91, 0; max.s32 %r129, %r27, %r91; BB0_43: setp.lt.s32 %p48, %r27, 0; mov.f32 %f62, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_45; add.s32 %r92, 
%r129, %r4; mad.lo.s32 %r93, %r92, %r55, %r1; mul.wide.s32 %rd36, %r93, 4; add.s64 %rd37, %rd3, %rd36; ld.global.nc.f32 %f62, [%rd37]; add.s64 %rd38, %rd2, %rd36; ld.global.nc.f32 %f63, [%rd38]; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f64, [%rd39]; BB0_45: add.s32 %r31, %r2, 1; @%p43 bra BB0_47; rem.s32 %r94, %r31, %r56; add.s32 %r95, %r94, %r56; rem.s32 %r130, %r95, %r56; bra.uni BB0_48; BB0_47: add.s32 %r96, %r56, -1; min.s32 %r130, %r31, %r96; BB0_48: setp.ge.s32 %p52, %r31, %r56; mov.f32 %f68, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; @%p54 bra BB0_50; add.s32 %r97, %r130, %r4; mad.lo.s32 %r98, %r97, %r55, %r1; mul.wide.s32 %rd40, %r98, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f68, [%rd41]; add.s64 %rd42, %rd2, %rd40; ld.global.nc.f32 %f69, [%rd42]; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f70, [%rd43]; BB0_50: add.s32 %r35, %r2, 2; @%p43 bra BB0_52; rem.s32 %r99, %r35, %r56; add.s32 %r100, %r99, %r56; rem.s32 %r131, %r100, %r56; bra.uni BB0_53; BB0_52: add.s32 %r101, %r56, -1; min.s32 %r131, %r35, %r101; BB0_53: setp.ge.s32 %p56, %r35, %r56; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p58 bra BB0_55; add.s32 %r102, %r131, %r4; mad.lo.s32 %r103, %r102, %r55, %r1; mul.wide.s32 %rd44, %r103, 4; add.s64 %rd45, %rd3, %rd44; ld.global.nc.f32 %f74, [%rd45]; add.s64 %rd46, %rd2, %rd44; ld.global.nc.f32 %f75, [%rd46]; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f76, [%rd47]; BB0_55: mul.f32 %f230, %f69, %f69; fma.rn.f32 %f231, %f68, %f68, %f230; fma.rn.f32 %f77, %f70, %f70, %f231; setp.neu.f32 %p59, %f77, 0f00000000; @%p59 bra BB0_57; mul.f32 %f235, %f63, %f63; fma.rn.f32 %f236, %f62, %f62, %f235; fma.rn.f32 %f237, %f64, %f64, %f236; setp.eq.f32 %p60, %f237, 0f00000000; mov.f32 %f394, 0f00000000; mov.f32 %f395, %f394; mov.f32 %f396, %f394; @%p60 bra BB0_69; BB0_57: mul.f32 %f238, %f57, %f57; fma.rn.f32 %f239, %f56, %f56, %f238; fma.rn.f32 %f78, %f58, %f58, 
%f239; setp.neu.f32 %p61, %f78, 0f00000000; mul.f32 %f240, %f75, %f75; fma.rn.f32 %f241, %f74, %f74, %f240; fma.rn.f32 %f79, %f76, %f76, %f241; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, %f77, 0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_59; mul.f32 %f242, %f63, %f63; fma.rn.f32 %f243, %f62, %f62, %f242; fma.rn.f32 %f244, %f64, %f64, %f243; setp.neu.f32 %p66, %f244, 0f00000000; @%p66 bra BB0_68; bra.uni BB0_59; BB0_68: sub.f32 %f273, %f68, %f62; mul.f32 %f394, %f273, 0f3F000000; sub.f32 %f274, %f69, %f63; mul.f32 %f395, %f274, 0f3F000000; sub.f32 %f275, %f70, %f64; mul.f32 %f396, %f275, 0f3F000000; bra.uni BB0_69; BB0_59: or.pred %p68, %p59, %p61; @%p68 bra BB0_61; bra.uni BB0_60; BB0_61: mul.f32 %f245, %f63, %f63; fma.rn.f32 %f246, %f62, %f62, %f245; fma.rn.f32 %f83, %f64, %f64, %f246; setp.neu.f32 %p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_63; bra.uni BB0_62; BB0_63: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_65; bra.uni BB0_64; BB0_65: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_67; bra.uni BB0_66; BB0_67: sub.f32 %f264, %f68, %f62; sub.f32 %f265, %f69, %f63; sub.f32 %f266, %f70, %f64; sub.f32 %f267, %f56, %f74; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f57, %f75; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f58, %f76; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f394, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f395, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f396, %f266, 0f3F2AAAAB, %f272; bra.uni BB0_69; BB0_60: sub.f32 %f394, %f1, %f62; sub.f32 %f395, %f2, %f63; sub.f32 %f396, %f3, %f64; bra.uni BB0_69; BB0_62: sub.f32 %f394, %f68, %f1; sub.f32 %f395, %f69, %f2; sub.f32 %f396, %f70, %f3; bra.uni BB0_69; BB0_64: mul.f32 %f247, %f62, 0fC0000000; fma.rn.f32 %f248, %f56, 0f3F000000, %f247; add.f32 %f249, %f63, %f63; mul.f32 %f250, %f57, 0f3F000000; sub.f32 %f251, %f250, %f249; add.f32 %f252, %f64, %f64; mul.f32 %f253, %f58, 
0f3F000000; sub.f32 %f254, %f253, %f252; fma.rn.f32 %f394, %f1, 0f3FC00000, %f248; fma.rn.f32 %f395, %f2, 0f3FC00000, %f251; fma.rn.f32 %f396, %f3, 0f3FC00000, %f254; bra.uni BB0_69; BB0_66: mul.f32 %f255, %f74, 0fBF000000; fma.rn.f32 %f256, %f68, 0f40000000, %f255; mul.f32 %f257, %f75, 0fBF000000; fma.rn.f32 %f258, %f69, 0f40000000, %f257; mul.f32 %f259, %f76, 0fBF000000; fma.rn.f32 %f260, %f70, 0f40000000, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f394, %f256, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f395, %f258, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f396, %f260, %f263; BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p77, %rs3, 0; add.s32 %r39, %r3, -2; @%p77 bra BB0_71; rem.s32 %r104, %r39, %r57; add.s32 %r105, %r104, %r57; rem.s32 %r132, %r105, %r57; bra.uni BB0_72; BB0_71: mov.u32 %r106, 0; max.s32 %r132, %r39, %r106; BB0_72: setp.lt.s32 %p79, %r39, 0; mov.f32 %f105, 0f00000000; and.pred %p80, %p79, %p77; mov.f32 %f106, %f105; mov.f32 %f107, %f105; @%p80 bra BB0_74; mad.lo.s32 %r107, %r132, %r56, %r2; mad.lo.s32 %r108, %r107, %r55, %r1; mul.wide.s32 %rd48, %r108, 4; add.s64 %rd49, %rd3, %rd48; ld.global.nc.f32 %f105, [%rd49]; add.s64 %rd50, %rd2, %rd48; ld.global.nc.f32 %f106, [%rd50]; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f107, [%rd51]; BB0_74: add.s32 %r43, %r3, -1; @%p77 bra BB0_76; rem.s32 %r109, %r43, %r57; add.s32 %r110, %r109, %r57; rem.s32 %r133, %r110, %r57; bra.uni BB0_77; BB0_76: mov.u32 %r111, 0; max.s32 %r133, %r43, %r111; BB0_77: setp.lt.s32 %p82, %r43, 0; mov.f32 %f111, 0f00000000; and.pred %p84, %p82, %p77; mov.f32 %f112, %f111; mov.f32 %f113, %f111; @%p84 bra BB0_79; mad.lo.s32 %r112, %r133, %r56, %r2; mad.lo.s32 %r113, %r112, %r55, %r1; mul.wide.s32 %rd52, %r113, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f111, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f112, [%rd54]; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f113, [%rd55]; BB0_79: add.s32 %r47, %r3, 1; @%p77 bra BB0_81; rem.s32 %r114, %r47, 
%r57; add.s32 %r115, %r114, %r57; rem.s32 %r134, %r115, %r57; bra.uni BB0_82; BB0_81: add.s32 %r116, %r57, -1; min.s32 %r134, %r47, %r116; BB0_82: setp.ge.s32 %p86, %r47, %r57; mov.f32 %f117, 0f00000000; and.pred %p88, %p86, %p77; mov.f32 %f118, %f117; mov.f32 %f119, %f117; @%p88 bra BB0_84; mad.lo.s32 %r117, %r134, %r56, %r2; mad.lo.s32 %r118, %r117, %r55, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f117, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 %f118, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f119, [%rd59]; BB0_84: add.s32 %r51, %r3, 2; @%p77 bra BB0_86; rem.s32 %r119, %r51, %r57; add.s32 %r120, %r119, %r57; rem.s32 %r135, %r120, %r57; bra.uni BB0_87; BB0_86: add.s32 %r121, %r57, -1; min.s32 %r135, %r51, %r121; BB0_87: setp.ge.s32 %p90, %r51, %r57; mov.f32 %f123, 0f00000000; and.pred %p92, %p90, %p77; mov.f32 %f124, %f123; mov.f32 %f125, %f123; @%p92 bra BB0_89; mad.lo.s32 %r122, %r135, %r56, %r2; mad.lo.s32 %r123, %r122, %r55, %r1; mul.wide.s32 %rd60, %r123, 4; add.s64 %rd61, %rd3, %rd60; ld.global.nc.f32 %f123, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f124, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f125, [%rd63]; BB0_89: mul.f32 %f288, %f118, %f118; fma.rn.f32 %f289, %f117, %f117, %f288; fma.rn.f32 %f126, %f119, %f119, %f289; setp.neu.f32 %p93, %f126, 0f00000000; @%p93 bra BB0_91; mul.f32 %f293, %f112, %f112; fma.rn.f32 %f294, %f111, %f111, %f293; fma.rn.f32 %f295, %f113, %f113, %f294; setp.eq.f32 %p94, %f295, 0f00000000; mov.f32 %f409, 0f00000000; mov.f32 %f410, %f409; mov.f32 %f411, %f409; @%p94 bra BB0_103; BB0_91: mul.f32 %f296, %f106, %f106; fma.rn.f32 %f297, %f105, %f105, %f296; fma.rn.f32 %f127, %f107, %f107, %f297; setp.neu.f32 %p95, %f127, 0f00000000; mul.f32 %f298, %f124, %f124; fma.rn.f32 %f299, %f123, %f123, %f298; fma.rn.f32 %f128, %f125, %f125, %f299; setp.neu.f32 %p96, %f128, 0f00000000; and.pred %p97, %p95, %p96; setp.eq.f32 %p98, %f126, 0f00000000; or.pred 
%p99, %p97, %p98; @%p99 bra BB0_93; mul.f32 %f300, %f112, %f112; fma.rn.f32 %f301, %f111, %f111, %f300; fma.rn.f32 %f302, %f113, %f113, %f301; setp.neu.f32 %p100, %f302, 0f00000000; @%p100 bra BB0_102; bra.uni BB0_93; BB0_102: sub.f32 %f331, %f117, %f111; mul.f32 %f409, %f331, 0f3F000000; sub.f32 %f332, %f118, %f112; mul.f32 %f410, %f332, 0f3F000000; sub.f32 %f333, %f119, %f113; mul.f32 %f411, %f333, 0f3F000000; bra.uni BB0_103; BB0_93: or.pred %p102, %p93, %p95; @%p102 bra BB0_95; bra.uni BB0_94; BB0_95: mul.f32 %f303, %f112, %f112; fma.rn.f32 %f304, %f111, %f111, %f303; fma.rn.f32 %f132, %f113, %f113, %f304; setp.neu.f32 %p103, %f132, 0f00000000; or.pred %p105, %p103, %p96; @%p105 bra BB0_97; bra.uni BB0_96; BB0_97: setp.eq.f32 %p106, %f127, 0f00000000; or.pred %p107, %p106, %p93; @%p107 bra BB0_99; bra.uni BB0_98; BB0_99: setp.eq.f32 %p109, %f128, 0f00000000; or.pred %p110, %p109, %p103; @%p110 bra BB0_101; bra.uni BB0_100; BB0_101: sub.f32 %f322, %f117, %f111; sub.f32 %f323, %f118, %f112; sub.f32 %f324, %f119, %f113; sub.f32 %f325, %f105, %f123; mul.f32 %f326, %f325, 0f3DAAAAAB; sub.f32 %f327, %f106, %f124; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f107, %f125; mul.f32 %f330, %f329, 0f3DAAAAAB; fma.rn.f32 %f409, %f322, 0f3F2AAAAB, %f326; fma.rn.f32 %f410, %f323, 0f3F2AAAAB, %f328; fma.rn.f32 %f411, %f324, 0f3F2AAAAB, %f330; bra.uni BB0_103; BB0_94: sub.f32 %f409, %f1, %f111; sub.f32 %f410, %f2, %f112; sub.f32 %f411, %f3, %f113; bra.uni BB0_103; BB0_96: sub.f32 %f409, %f117, %f1; sub.f32 %f410, %f118, %f2; sub.f32 %f411, %f119, %f3; bra.uni BB0_103; BB0_98: mul.f32 %f305, %f111, 0fC0000000; fma.rn.f32 %f306, %f105, 0f3F000000, %f305; add.f32 %f307, %f112, %f112; mul.f32 %f308, %f106, 0f3F000000; sub.f32 %f309, %f308, %f307; add.f32 %f310, %f113, %f113; mul.f32 %f311, %f107, 0f3F000000; sub.f32 %f312, %f311, %f310; fma.rn.f32 %f409, %f1, 0f3FC00000, %f306; fma.rn.f32 %f410, %f2, 0f3FC00000, %f309; fma.rn.f32 %f411, %f3, 0f3FC00000, %f312; bra.uni BB0_103; 
BB0_100: mul.f32 %f313, %f123, 0fBF000000; fma.rn.f32 %f314, %f117, 0f40000000, %f313; mul.f32 %f315, %f124, 0fBF000000; fma.rn.f32 %f316, %f118, 0f40000000, %f315; mul.f32 %f317, %f125, 0fBF000000; fma.rn.f32 %f318, %f119, 0f40000000, %f317; mul.f32 %f319, %f1, 0f3FC00000; sub.f32 %f409, %f314, %f319; mul.f32 %f320, %f2, 0f3FC00000; sub.f32 %f410, %f316, %f320; mul.f32 %f321, %f3, 0f3FC00000; sub.f32 %f411, %f318, %f321; BB0_103: setp.eq.s64 %p111, %rd7, 0; @%p111 bra BB0_105; cvta.to.global.u64 %rd64, %rd7; add.s64 %rd66, %rd64, %rd12; ld.global.nc.f32 %f334, [%rd66]; mul.f32 %f412, %f334, %f412; BB0_105: setp.eq.s64 %p112, %rd8, 0; @%p112 bra BB0_107; cvta.to.global.u64 %rd67, %rd8; add.s64 %rd69, %rd67, %rd12; ld.global.nc.f32 %f335, [%rd69]; mul.f32 %f413, %f335, %f413; BB0_107: cvta.to.global.u64 %rd70, %rd6; cvta.to.global.u64 %rd71, %rd5; cvta.to.global.u64 %rd72, %rd4; mul.f32 %f336, %f379, %f157; mul.f32 %f337, %f395, %f158; mul.f32 %f338, %f411, %f159; add.f32 %f339, %f412, %f412; mul.f32 %f340, %f1, %f339; add.f32 %f341, %f337, %f338; mul.f32 %f342, %f1, %f341; mul.f32 %f343, %f394, %f158; fma.rn.f32 %f344, %f2, %f343, %f342; mul.f32 %f345, %f409, %f159; fma.rn.f32 %f346, %f3, %f345, %f344; mul.f32 %f347, %f346, %f413; fma.rn.f32 %f348, %f336, %f340, %f347; add.s64 %rd74, %rd72, %rd12; st.global.f32 [%rd74], %f348; mul.f32 %f349, %f2, %f339; add.f32 %f350, %f336, %f338; mul.f32 %f351, %f2, %f350; mul.f32 %f352, %f380, %f157; fma.rn.f32 %f353, %f1, %f352, %f351; mul.f32 %f354, %f410, %f159; fma.rn.f32 %f355, %f3, %f354, %f353; mul.f32 %f356, %f355, %f413; fma.rn.f32 %f357, %f337, %f349, %f356; add.s64 %rd75, %rd71, %rd12; st.global.f32 [%rd75], %f357; mul.f32 %f358, %f3, %f339; mul.f32 %f359, %f396, %f158; mul.f32 %f360, %f2, %f359; mul.f32 %f361, %f381, %f157; fma.rn.f32 %f362, %f1, %f361, %f360; add.f32 %f363, %f336, %f337; fma.rn.f32 %f364, %f3, %f363, %f362; mul.f32 %f365, %f364, %f413; fma.rn.f32 %f366, %f338, %f358, %f365; add.s64 %rd76, %rd70, 
%rd12; st.global.f32 [%rd76], %f366; BB0_108: ret; } ` getmagnetoelasticforce_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<113>; .reg .b16 %rs<19>; .reg .f32 %f<414>; .reg .b32 %r<136>; .reg .b64 %rd<77>; ld.param.u64 %rd4, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_2]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_3]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd7, [getmagnetoelasticforce_param_6]; ld.param.f32 %f412, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd8, [getmagnetoelasticforce_param_8]; ld.param.f32 %f413, [getmagnetoelasticforce_param_9]; ld.param.f32 %f157, [getmagnetoelasticforce_param_10]; ld.param.f32 %f158, [getmagnetoelasticforce_param_11]; ld.param.f32 %f159, [getmagnetoelasticforce_param_12]; ld.param.u32 %r55, [getmagnetoelasticforce_param_13]; ld.param.u32 %r56, [getmagnetoelasticforce_param_14]; ld.param.u32 %r57, [getmagnetoelasticforce_param_15]; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; cvta.to.global.u64 %rd1, 
%rd11; cvta.to.global.u64 %rd2, %rd10; cvta.to.global.u64 %rd3, %rd9; mov.u32 %r58, %ntid.x; mov.u32 %r59, %ctaid.x; mov.u32 %r60, %tid.x; mad.lo.s32 %r1, %r58, %r59, %r60; mov.u32 %r61, %ntid.y; mov.u32 %r62, %ctaid.y; mov.u32 %r63, %tid.y; mad.lo.s32 %r2, %r61, %r62, %r63; mov.u32 %r64, %ntid.z; mov.u32 %r65, %ctaid.z; mov.u32 %r66, %tid.z; mad.lo.s32 %r3, %r64, %r65, %r66; setp.ge.s32 %p4, %r2, %r56; setp.ge.s32 %p5, %r1, %r55; or.pred %p6, %p4, %p5; setp.ge.s32 %p7, %r3, %r57; or.pred %p8, %p6, %p7; @%p8 bra BB0_108; mul.lo.s32 %r4, %r3, %r56; add.s32 %r67, %r4, %r2; mul.lo.s32 %r5, %r67, %r55; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd12, %r6, 4; add.s64 %rd13, %rd3, %rd12; ld.global.nc.f32 %f1, [%rd13]; add.s64 %rd14, %rd2, %rd12; ld.global.nc.f32 %f2, [%rd14]; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f3, [%rd15]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r7, %r1, -2; @%p9 bra BB0_3; rem.s32 %r68, %r7, %r55; add.s32 %r69, %r68, %r55; rem.s32 %r124, %r69, %r55; bra.uni BB0_4; BB0_3: mov.u32 %r70, 0; max.s32 %r124, %r7, %r70; BB0_4: setp.lt.s32 %p11, %r7, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p12 bra BB0_6; add.s32 %r71, %r124, %r5; mul.wide.s32 %rd16, %r71, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f7, [%rd17]; add.s64 %rd18, %rd2, %rd16; ld.global.nc.f32 %f8, [%rd18]; add.s64 %rd19, %rd1, %rd16; ld.global.nc.f32 %f9, [%rd19]; BB0_6: add.s32 %r11, %r1, -1; @%p9 bra BB0_8; rem.s32 %r72, %r11, %r55; add.s32 %r73, %r72, %r55; rem.s32 %r125, %r73, %r55; bra.uni BB0_9; BB0_8: mov.u32 %r74, 0; max.s32 %r125, %r11, %r74; BB0_9: setp.lt.s32 %p14, %r11, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_11; add.s32 %r75, %r125, %r5; mul.wide.s32 %rd20, %r75, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f13, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f14, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f15, 
[%rd23]; BB0_11: add.s32 %r15, %r1, 1; @%p9 bra BB0_13; rem.s32 %r76, %r15, %r55; add.s32 %r77, %r76, %r55; rem.s32 %r126, %r77, %r55; bra.uni BB0_14; BB0_13: add.s32 %r78, %r55, -1; min.s32 %r126, %r15, %r78; BB0_14: setp.ge.s32 %p18, %r15, %r55; mov.f32 %f19, 0f00000000; and.pred %p20, %p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_16; add.s32 %r79, %r126, %r5; mul.wide.s32 %rd24, %r79, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f19, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.nc.f32 %f20, [%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f21, [%rd27]; BB0_16: add.s32 %r19, %r1, 2; @%p9 bra BB0_18; rem.s32 %r80, %r19, %r55; add.s32 %r81, %r80, %r55; rem.s32 %r127, %r81, %r55; bra.uni BB0_19; BB0_18: add.s32 %r82, %r55, -1; min.s32 %r127, %r19, %r82; BB0_19: setp.ge.s32 %p22, %r19, %r55; mov.f32 %f25, 0f00000000; and.pred %p24, %p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_21; add.s32 %r83, %r127, %r5; mul.wide.s32 %rd28, %r83, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f25, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f26, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f27, [%rd31]; BB0_21: mul.f32 %f172, %f20, %f20; fma.rn.f32 %f173, %f19, %f19, %f172; fma.rn.f32 %f28, %f21, %f21, %f173; setp.neu.f32 %p25, %f28, 0f00000000; @%p25 bra BB0_23; mul.f32 %f177, %f14, %f14; fma.rn.f32 %f178, %f13, %f13, %f177; fma.rn.f32 %f179, %f15, %f15, %f178; setp.eq.f32 %p26, %f179, 0f00000000; mov.f32 %f379, 0f00000000; mov.f32 %f380, %f379; mov.f32 %f381, %f379; @%p26 bra BB0_35; BB0_23: mul.f32 %f180, %f8, %f8; fma.rn.f32 %f181, %f7, %f7, %f180; fma.rn.f32 %f29, %f9, %f9, %f181; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f182, %f26, %f26; fma.rn.f32 %f183, %f25, %f25, %f182; fma.rn.f32 %f30, %f27, %f27, %f183; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_25; mul.f32 %f184, %f14, %f14; fma.rn.f32 %f185, 
%f13, %f13, %f184; fma.rn.f32 %f186, %f15, %f15, %f185; setp.neu.f32 %p32, %f186, 0f00000000; @%p32 bra BB0_34; bra.uni BB0_25; BB0_34: sub.f32 %f215, %f19, %f13; mul.f32 %f379, %f215, 0f3F000000; sub.f32 %f216, %f20, %f14; mul.f32 %f380, %f216, 0f3F000000; sub.f32 %f217, %f21, %f15; mul.f32 %f381, %f217, 0f3F000000; bra.uni BB0_35; BB0_25: or.pred %p34, %p25, %p27; @%p34 bra BB0_27; bra.uni BB0_26; BB0_27: mul.f32 %f187, %f14, %f14; fma.rn.f32 %f188, %f13, %f13, %f187; fma.rn.f32 %f34, %f15, %f15, %f188; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, %p35, %p28; @%p37 bra BB0_29; bra.uni BB0_28; BB0_29: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_31; bra.uni BB0_30; BB0_31: setp.eq.f32 %p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_33; bra.uni BB0_32; BB0_33: sub.f32 %f206, %f19, %f13; sub.f32 %f207, %f20, %f14; sub.f32 %f208, %f21, %f15; sub.f32 %f209, %f7, %f25; mul.f32 %f210, %f209, 0f3DAAAAAB; sub.f32 %f211, %f8, %f26; mul.f32 %f212, %f211, 0f3DAAAAAB; sub.f32 %f213, %f9, %f27; mul.f32 %f214, %f213, 0f3DAAAAAB; fma.rn.f32 %f379, %f206, 0f3F2AAAAB, %f210; fma.rn.f32 %f380, %f207, 0f3F2AAAAB, %f212; fma.rn.f32 %f381, %f208, 0f3F2AAAAB, %f214; bra.uni BB0_35; BB0_26: sub.f32 %f379, %f1, %f13; sub.f32 %f380, %f2, %f14; sub.f32 %f381, %f3, %f15; bra.uni BB0_35; BB0_28: sub.f32 %f379, %f19, %f1; sub.f32 %f380, %f20, %f2; sub.f32 %f381, %f21, %f3; bra.uni BB0_35; BB0_30: mul.f32 %f189, %f13, 0fC0000000; fma.rn.f32 %f190, %f7, 0f3F000000, %f189; add.f32 %f191, %f14, %f14; mul.f32 %f192, %f8, 0f3F000000; sub.f32 %f193, %f192, %f191; add.f32 %f194, %f15, %f15; mul.f32 %f195, %f9, 0f3F000000; sub.f32 %f196, %f195, %f194; fma.rn.f32 %f379, %f1, 0f3FC00000, %f190; fma.rn.f32 %f380, %f2, 0f3FC00000, %f193; fma.rn.f32 %f381, %f3, 0f3FC00000, %f196; bra.uni BB0_35; BB0_32: mul.f32 %f197, %f25, 0fBF000000; fma.rn.f32 %f198, %f19, 0f40000000, %f197; mul.f32 %f199, %f26, 0fBF000000; fma.rn.f32 %f200, %f20, 0f40000000, %f199; 
mul.f32 %f201, %f27, 0fBF000000; fma.rn.f32 %f202, %f21, 0f40000000, %f201; mul.f32 %f203, %f1, 0f3FC00000; sub.f32 %f379, %f198, %f203; mul.f32 %f204, %f2, 0f3FC00000; sub.f32 %f380, %f200, %f204; mul.f32 %f205, %f3, 0f3FC00000; sub.f32 %f381, %f202, %f205; BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra BB0_37; rem.s32 %r84, %r23, %r56; add.s32 %r85, %r84, %r56; rem.s32 %r128, %r85, %r56; bra.uni BB0_38; BB0_37: mov.u32 %r86, 0; max.s32 %r128, %r23, %r86; BB0_38: setp.lt.s32 %p45, %r23, 0; mov.f32 %f56, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_40; add.s32 %r87, %r128, %r4; mad.lo.s32 %r88, %r87, %r55, %r1; mul.wide.s32 %rd32, %r88, 4; add.s64 %rd33, %rd3, %rd32; ld.global.nc.f32 %f56, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.nc.f32 %f57, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f58, [%rd35]; BB0_40: add.s32 %r27, %r2, -1; @%p43 bra BB0_42; rem.s32 %r89, %r27, %r56; add.s32 %r90, %r89, %r56; rem.s32 %r129, %r90, %r56; bra.uni BB0_43; BB0_42: mov.u32 %r91, 0; max.s32 %r129, %r27, %r91; BB0_43: setp.lt.s32 %p48, %r27, 0; mov.f32 %f62, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_45; add.s32 %r92, %r129, %r4; mad.lo.s32 %r93, %r92, %r55, %r1; mul.wide.s32 %rd36, %r93, 4; add.s64 %rd37, %rd3, %rd36; ld.global.nc.f32 %f62, [%rd37]; add.s64 %rd38, %rd2, %rd36; ld.global.nc.f32 %f63, [%rd38]; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f64, [%rd39]; BB0_45: add.s32 %r31, %r2, 1; @%p43 bra BB0_47; rem.s32 %r94, %r31, %r56; add.s32 %r95, %r94, %r56; rem.s32 %r130, %r95, %r56; bra.uni BB0_48; BB0_47: add.s32 %r96, %r56, -1; min.s32 %r130, %r31, %r96; BB0_48: setp.ge.s32 %p52, %r31, %r56; mov.f32 %f68, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; @%p54 bra BB0_50; add.s32 %r97, %r130, %r4; mad.lo.s32 %r98, %r97, %r55, %r1; mul.wide.s32 %rd40, %r98, 4; add.s64 %rd41, %rd3, %rd40; 
ld.global.nc.f32 %f68, [%rd41]; add.s64 %rd42, %rd2, %rd40; ld.global.nc.f32 %f69, [%rd42]; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f70, [%rd43]; BB0_50: add.s32 %r35, %r2, 2; @%p43 bra BB0_52; rem.s32 %r99, %r35, %r56; add.s32 %r100, %r99, %r56; rem.s32 %r131, %r100, %r56; bra.uni BB0_53; BB0_52: add.s32 %r101, %r56, -1; min.s32 %r131, %r35, %r101; BB0_53: setp.ge.s32 %p56, %r35, %r56; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p58 bra BB0_55; add.s32 %r102, %r131, %r4; mad.lo.s32 %r103, %r102, %r55, %r1; mul.wide.s32 %rd44, %r103, 4; add.s64 %rd45, %rd3, %rd44; ld.global.nc.f32 %f74, [%rd45]; add.s64 %rd46, %rd2, %rd44; ld.global.nc.f32 %f75, [%rd46]; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f76, [%rd47]; BB0_55: mul.f32 %f230, %f69, %f69; fma.rn.f32 %f231, %f68, %f68, %f230; fma.rn.f32 %f77, %f70, %f70, %f231; setp.neu.f32 %p59, %f77, 0f00000000; @%p59 bra BB0_57; mul.f32 %f235, %f63, %f63; fma.rn.f32 %f236, %f62, %f62, %f235; fma.rn.f32 %f237, %f64, %f64, %f236; setp.eq.f32 %p60, %f237, 0f00000000; mov.f32 %f394, 0f00000000; mov.f32 %f395, %f394; mov.f32 %f396, %f394; @%p60 bra BB0_69; BB0_57: mul.f32 %f238, %f57, %f57; fma.rn.f32 %f239, %f56, %f56, %f238; fma.rn.f32 %f78, %f58, %f58, %f239; setp.neu.f32 %p61, %f78, 0f00000000; mul.f32 %f240, %f75, %f75; fma.rn.f32 %f241, %f74, %f74, %f240; fma.rn.f32 %f79, %f76, %f76, %f241; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, %f77, 0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_59; mul.f32 %f242, %f63, %f63; fma.rn.f32 %f243, %f62, %f62, %f242; fma.rn.f32 %f244, %f64, %f64, %f243; setp.neu.f32 %p66, %f244, 0f00000000; @%p66 bra BB0_68; bra.uni BB0_59; BB0_68: sub.f32 %f273, %f68, %f62; mul.f32 %f394, %f273, 0f3F000000; sub.f32 %f274, %f69, %f63; mul.f32 %f395, %f274, 0f3F000000; sub.f32 %f275, %f70, %f64; mul.f32 %f396, %f275, 0f3F000000; bra.uni BB0_69; BB0_59: or.pred %p68, %p59, %p61; @%p68 bra BB0_61; bra.uni 
BB0_60; BB0_61: mul.f32 %f245, %f63, %f63; fma.rn.f32 %f246, %f62, %f62, %f245; fma.rn.f32 %f83, %f64, %f64, %f246; setp.neu.f32 %p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_63; bra.uni BB0_62; BB0_63: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_65; bra.uni BB0_64; BB0_65: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_67; bra.uni BB0_66; BB0_67: sub.f32 %f264, %f68, %f62; sub.f32 %f265, %f69, %f63; sub.f32 %f266, %f70, %f64; sub.f32 %f267, %f56, %f74; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f57, %f75; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f58, %f76; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f394, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f395, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f396, %f266, 0f3F2AAAAB, %f272; bra.uni BB0_69; BB0_60: sub.f32 %f394, %f1, %f62; sub.f32 %f395, %f2, %f63; sub.f32 %f396, %f3, %f64; bra.uni BB0_69; BB0_62: sub.f32 %f394, %f68, %f1; sub.f32 %f395, %f69, %f2; sub.f32 %f396, %f70, %f3; bra.uni BB0_69; BB0_64: mul.f32 %f247, %f62, 0fC0000000; fma.rn.f32 %f248, %f56, 0f3F000000, %f247; add.f32 %f249, %f63, %f63; mul.f32 %f250, %f57, 0f3F000000; sub.f32 %f251, %f250, %f249; add.f32 %f252, %f64, %f64; mul.f32 %f253, %f58, 0f3F000000; sub.f32 %f254, %f253, %f252; fma.rn.f32 %f394, %f1, 0f3FC00000, %f248; fma.rn.f32 %f395, %f2, 0f3FC00000, %f251; fma.rn.f32 %f396, %f3, 0f3FC00000, %f254; bra.uni BB0_69; BB0_66: mul.f32 %f255, %f74, 0fBF000000; fma.rn.f32 %f256, %f68, 0f40000000, %f255; mul.f32 %f257, %f75, 0fBF000000; fma.rn.f32 %f258, %f69, 0f40000000, %f257; mul.f32 %f259, %f76, 0fBF000000; fma.rn.f32 %f260, %f70, 0f40000000, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f394, %f256, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f395, %f258, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f396, %f260, %f263; BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p77, %rs3, 0; add.s32 %r39, %r3, -2; @%p77 bra BB0_71; rem.s32 %r104, %r39, %r57; 
add.s32 %r105, %r104, %r57; rem.s32 %r132, %r105, %r57; bra.uni BB0_72; BB0_71: mov.u32 %r106, 0; max.s32 %r132, %r39, %r106; BB0_72: setp.lt.s32 %p79, %r39, 0; mov.f32 %f105, 0f00000000; and.pred %p80, %p79, %p77; mov.f32 %f106, %f105; mov.f32 %f107, %f105; @%p80 bra BB0_74; mad.lo.s32 %r107, %r132, %r56, %r2; mad.lo.s32 %r108, %r107, %r55, %r1; mul.wide.s32 %rd48, %r108, 4; add.s64 %rd49, %rd3, %rd48; ld.global.nc.f32 %f105, [%rd49]; add.s64 %rd50, %rd2, %rd48; ld.global.nc.f32 %f106, [%rd50]; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f107, [%rd51]; BB0_74: add.s32 %r43, %r3, -1; @%p77 bra BB0_76; rem.s32 %r109, %r43, %r57; add.s32 %r110, %r109, %r57; rem.s32 %r133, %r110, %r57; bra.uni BB0_77; BB0_76: mov.u32 %r111, 0; max.s32 %r133, %r43, %r111; BB0_77: setp.lt.s32 %p82, %r43, 0; mov.f32 %f111, 0f00000000; and.pred %p84, %p82, %p77; mov.f32 %f112, %f111; mov.f32 %f113, %f111; @%p84 bra BB0_79; mad.lo.s32 %r112, %r133, %r56, %r2; mad.lo.s32 %r113, %r112, %r55, %r1; mul.wide.s32 %rd52, %r113, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f111, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f112, [%rd54]; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f113, [%rd55]; BB0_79: add.s32 %r47, %r3, 1; @%p77 bra BB0_81; rem.s32 %r114, %r47, %r57; add.s32 %r115, %r114, %r57; rem.s32 %r134, %r115, %r57; bra.uni BB0_82; BB0_81: add.s32 %r116, %r57, -1; min.s32 %r134, %r47, %r116; BB0_82: setp.ge.s32 %p86, %r47, %r57; mov.f32 %f117, 0f00000000; and.pred %p88, %p86, %p77; mov.f32 %f118, %f117; mov.f32 %f119, %f117; @%p88 bra BB0_84; mad.lo.s32 %r117, %r134, %r56, %r2; mad.lo.s32 %r118, %r117, %r55, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f117, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 %f118, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f119, [%rd59]; BB0_84: add.s32 %r51, %r3, 2; @%p77 bra BB0_86; rem.s32 %r119, %r51, %r57; add.s32 %r120, %r119, %r57; rem.s32 %r135, %r120, %r57; bra.uni BB0_87; 
BB0_86: add.s32 %r121, %r57, -1; min.s32 %r135, %r51, %r121; BB0_87: setp.ge.s32 %p90, %r51, %r57; mov.f32 %f123, 0f00000000; and.pred %p92, %p90, %p77; mov.f32 %f124, %f123; mov.f32 %f125, %f123; @%p92 bra BB0_89; mad.lo.s32 %r122, %r135, %r56, %r2; mad.lo.s32 %r123, %r122, %r55, %r1; mul.wide.s32 %rd60, %r123, 4; add.s64 %rd61, %rd3, %rd60; ld.global.nc.f32 %f123, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f124, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f125, [%rd63]; BB0_89: mul.f32 %f288, %f118, %f118; fma.rn.f32 %f289, %f117, %f117, %f288; fma.rn.f32 %f126, %f119, %f119, %f289; setp.neu.f32 %p93, %f126, 0f00000000; @%p93 bra BB0_91; mul.f32 %f293, %f112, %f112; fma.rn.f32 %f294, %f111, %f111, %f293; fma.rn.f32 %f295, %f113, %f113, %f294; setp.eq.f32 %p94, %f295, 0f00000000; mov.f32 %f409, 0f00000000; mov.f32 %f410, %f409; mov.f32 %f411, %f409; @%p94 bra BB0_103; BB0_91: mul.f32 %f296, %f106, %f106; fma.rn.f32 %f297, %f105, %f105, %f296; fma.rn.f32 %f127, %f107, %f107, %f297; setp.neu.f32 %p95, %f127, 0f00000000; mul.f32 %f298, %f124, %f124; fma.rn.f32 %f299, %f123, %f123, %f298; fma.rn.f32 %f128, %f125, %f125, %f299; setp.neu.f32 %p96, %f128, 0f00000000; and.pred %p97, %p95, %p96; setp.eq.f32 %p98, %f126, 0f00000000; or.pred %p99, %p97, %p98; @%p99 bra BB0_93; mul.f32 %f300, %f112, %f112; fma.rn.f32 %f301, %f111, %f111, %f300; fma.rn.f32 %f302, %f113, %f113, %f301; setp.neu.f32 %p100, %f302, 0f00000000; @%p100 bra BB0_102; bra.uni BB0_93; BB0_102: sub.f32 %f331, %f117, %f111; mul.f32 %f409, %f331, 0f3F000000; sub.f32 %f332, %f118, %f112; mul.f32 %f410, %f332, 0f3F000000; sub.f32 %f333, %f119, %f113; mul.f32 %f411, %f333, 0f3F000000; bra.uni BB0_103; BB0_93: or.pred %p102, %p93, %p95; @%p102 bra BB0_95; bra.uni BB0_94; BB0_95: mul.f32 %f303, %f112, %f112; fma.rn.f32 %f304, %f111, %f111, %f303; fma.rn.f32 %f132, %f113, %f113, %f304; setp.neu.f32 %p103, %f132, 0f00000000; or.pred %p105, %p103, %p96; @%p105 bra BB0_97; bra.uni BB0_96; 
BB0_97: setp.eq.f32 %p106, %f127, 0f00000000; or.pred %p107, %p106, %p93; @%p107 bra BB0_99; bra.uni BB0_98; BB0_99: setp.eq.f32 %p109, %f128, 0f00000000; or.pred %p110, %p109, %p103; @%p110 bra BB0_101; bra.uni BB0_100; BB0_101: sub.f32 %f322, %f117, %f111; sub.f32 %f323, %f118, %f112; sub.f32 %f324, %f119, %f113; sub.f32 %f325, %f105, %f123; mul.f32 %f326, %f325, 0f3DAAAAAB; sub.f32 %f327, %f106, %f124; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f107, %f125; mul.f32 %f330, %f329, 0f3DAAAAAB; fma.rn.f32 %f409, %f322, 0f3F2AAAAB, %f326; fma.rn.f32 %f410, %f323, 0f3F2AAAAB, %f328; fma.rn.f32 %f411, %f324, 0f3F2AAAAB, %f330; bra.uni BB0_103; BB0_94: sub.f32 %f409, %f1, %f111; sub.f32 %f410, %f2, %f112; sub.f32 %f411, %f3, %f113; bra.uni BB0_103; BB0_96: sub.f32 %f409, %f117, %f1; sub.f32 %f410, %f118, %f2; sub.f32 %f411, %f119, %f3; bra.uni BB0_103; BB0_98: mul.f32 %f305, %f111, 0fC0000000; fma.rn.f32 %f306, %f105, 0f3F000000, %f305; add.f32 %f307, %f112, %f112; mul.f32 %f308, %f106, 0f3F000000; sub.f32 %f309, %f308, %f307; add.f32 %f310, %f113, %f113; mul.f32 %f311, %f107, 0f3F000000; sub.f32 %f312, %f311, %f310; fma.rn.f32 %f409, %f1, 0f3FC00000, %f306; fma.rn.f32 %f410, %f2, 0f3FC00000, %f309; fma.rn.f32 %f411, %f3, 0f3FC00000, %f312; bra.uni BB0_103; BB0_100: mul.f32 %f313, %f123, 0fBF000000; fma.rn.f32 %f314, %f117, 0f40000000, %f313; mul.f32 %f315, %f124, 0fBF000000; fma.rn.f32 %f316, %f118, 0f40000000, %f315; mul.f32 %f317, %f125, 0fBF000000; fma.rn.f32 %f318, %f119, 0f40000000, %f317; mul.f32 %f319, %f1, 0f3FC00000; sub.f32 %f409, %f314, %f319; mul.f32 %f320, %f2, 0f3FC00000; sub.f32 %f410, %f316, %f320; mul.f32 %f321, %f3, 0f3FC00000; sub.f32 %f411, %f318, %f321; BB0_103: setp.eq.s64 %p111, %rd7, 0; @%p111 bra BB0_105; cvta.to.global.u64 %rd64, %rd7; add.s64 %rd66, %rd64, %rd12; ld.global.nc.f32 %f334, [%rd66]; mul.f32 %f412, %f334, %f412; BB0_105: setp.eq.s64 %p112, %rd8, 0; @%p112 bra BB0_107; cvta.to.global.u64 %rd67, %rd8; add.s64 %rd69, %rd67, 
%rd12; ld.global.nc.f32 %f335, [%rd69]; mul.f32 %f413, %f335, %f413; BB0_107: cvta.to.global.u64 %rd70, %rd6; cvta.to.global.u64 %rd71, %rd5; cvta.to.global.u64 %rd72, %rd4; mul.f32 %f336, %f379, %f157; mul.f32 %f337, %f395, %f158; mul.f32 %f338, %f411, %f159; add.f32 %f339, %f412, %f412; mul.f32 %f340, %f1, %f339; add.f32 %f341, %f337, %f338; mul.f32 %f342, %f1, %f341; mul.f32 %f343, %f394, %f158; fma.rn.f32 %f344, %f2, %f343, %f342; mul.f32 %f345, %f409, %f159; fma.rn.f32 %f346, %f3, %f345, %f344; mul.f32 %f347, %f346, %f413; fma.rn.f32 %f348, %f336, %f340, %f347; add.s64 %rd74, %rd72, %rd12; st.global.f32 [%rd74], %f348; mul.f32 %f349, %f2, %f339; add.f32 %f350, %f336, %f338; mul.f32 %f351, %f2, %f350; mul.f32 %f352, %f380, %f157; fma.rn.f32 %f353, %f1, %f352, %f351; mul.f32 %f354, %f410, %f159; fma.rn.f32 %f355, %f3, %f354, %f353; mul.f32 %f356, %f355, %f413; fma.rn.f32 %f357, %f337, %f349, %f356; add.s64 %rd75, %rd71, %rd12; st.global.f32 [%rd75], %f357; mul.f32 %f358, %f3, %f339; mul.f32 %f359, %f396, %f158; mul.f32 %f360, %f2, %f359; mul.f32 %f361, %f381, %f157; fma.rn.f32 %f362, %f1, %f361, %f360; add.f32 %f363, %f336, %f337; fma.rn.f32 %f364, %f3, %f363, %f362; mul.f32 %f365, %f364, %f413; fma.rn.f32 %f366, %f338, %f358, %f365; add.s64 %rd76, %rd70, %rd12; st.global.f32 [%rd76], %f366; BB0_108: ret; } ` getmagnetoelasticforce_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 
getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<113>; .reg .b16 %rs<19>; .reg .f32 %f<414>; .reg .b32 %r<136>; .reg .b64 %rd<77>; ld.param.u64 %rd4, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_2]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_3]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd7, [getmagnetoelasticforce_param_6]; ld.param.f32 %f412, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd8, [getmagnetoelasticforce_param_8]; ld.param.f32 %f413, [getmagnetoelasticforce_param_9]; ld.param.f32 %f157, [getmagnetoelasticforce_param_10]; ld.param.f32 %f158, [getmagnetoelasticforce_param_11]; ld.param.f32 %f159, [getmagnetoelasticforce_param_12]; ld.param.u32 %r55, [getmagnetoelasticforce_param_13]; ld.param.u32 %r56, [getmagnetoelasticforce_param_14]; ld.param.u32 %r57, [getmagnetoelasticforce_param_15]; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; cvta.to.global.u64 %rd1, %rd11; cvta.to.global.u64 %rd2, %rd10; cvta.to.global.u64 %rd3, %rd9; mov.u32 %r58, %ntid.x; mov.u32 %r59, %ctaid.x; mov.u32 %r60, %tid.x; mad.lo.s32 %r1, %r58, %r59, %r60; mov.u32 %r61, %ntid.y; mov.u32 %r62, %ctaid.y; mov.u32 %r63, %tid.y; mad.lo.s32 %r2, %r61, %r62, %r63; mov.u32 %r64, %ntid.z; mov.u32 %r65, %ctaid.z; mov.u32 %r66, %tid.z; mad.lo.s32 %r3, %r64, %r65, %r66; setp.ge.s32 %p4, %r2, %r56; setp.ge.s32 %p5, %r1, %r55; or.pred %p6, %p4, %p5; setp.ge.s32 %p7, %r3, %r57; or.pred %p8, %p6, %p7; @%p8 bra BB0_108; mul.lo.s32 %r4, %r3, %r56; add.s32 %r67, %r4, %r2; mul.lo.s32 %r5, %r67, %r55; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd12, %r6, 4; add.s64 %rd13, %rd3, %rd12; ld.global.nc.f32 
%f1, [%rd13]; add.s64 %rd14, %rd2, %rd12; ld.global.nc.f32 %f2, [%rd14]; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f3, [%rd15]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r7, %r1, -2; @%p9 bra BB0_3; rem.s32 %r68, %r7, %r55; add.s32 %r69, %r68, %r55; rem.s32 %r124, %r69, %r55; bra.uni BB0_4; BB0_3: mov.u32 %r70, 0; max.s32 %r124, %r7, %r70; BB0_4: setp.lt.s32 %p11, %r7, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p12 bra BB0_6; add.s32 %r71, %r124, %r5; mul.wide.s32 %rd16, %r71, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f7, [%rd17]; add.s64 %rd18, %rd2, %rd16; ld.global.nc.f32 %f8, [%rd18]; add.s64 %rd19, %rd1, %rd16; ld.global.nc.f32 %f9, [%rd19]; BB0_6: add.s32 %r11, %r1, -1; @%p9 bra BB0_8; rem.s32 %r72, %r11, %r55; add.s32 %r73, %r72, %r55; rem.s32 %r125, %r73, %r55; bra.uni BB0_9; BB0_8: mov.u32 %r74, 0; max.s32 %r125, %r11, %r74; BB0_9: setp.lt.s32 %p14, %r11, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_11; add.s32 %r75, %r125, %r5; mul.wide.s32 %rd20, %r75, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f13, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f14, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f15, [%rd23]; BB0_11: add.s32 %r15, %r1, 1; @%p9 bra BB0_13; rem.s32 %r76, %r15, %r55; add.s32 %r77, %r76, %r55; rem.s32 %r126, %r77, %r55; bra.uni BB0_14; BB0_13: add.s32 %r78, %r55, -1; min.s32 %r126, %r15, %r78; BB0_14: setp.ge.s32 %p18, %r15, %r55; mov.f32 %f19, 0f00000000; and.pred %p20, %p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_16; add.s32 %r79, %r126, %r5; mul.wide.s32 %rd24, %r79, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f19, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.nc.f32 %f20, [%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f21, [%rd27]; BB0_16: add.s32 %r19, %r1, 2; @%p9 bra BB0_18; rem.s32 %r80, %r19, %r55; add.s32 %r81, %r80, %r55; rem.s32 %r127, 
%r81, %r55; bra.uni BB0_19; BB0_18: add.s32 %r82, %r55, -1; min.s32 %r127, %r19, %r82; BB0_19: setp.ge.s32 %p22, %r19, %r55; mov.f32 %f25, 0f00000000; and.pred %p24, %p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_21; add.s32 %r83, %r127, %r5; mul.wide.s32 %rd28, %r83, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f25, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f26, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f27, [%rd31]; BB0_21: mul.f32 %f172, %f20, %f20; fma.rn.f32 %f173, %f19, %f19, %f172; fma.rn.f32 %f28, %f21, %f21, %f173; setp.neu.f32 %p25, %f28, 0f00000000; @%p25 bra BB0_23; mul.f32 %f177, %f14, %f14; fma.rn.f32 %f178, %f13, %f13, %f177; fma.rn.f32 %f179, %f15, %f15, %f178; setp.eq.f32 %p26, %f179, 0f00000000; mov.f32 %f379, 0f00000000; mov.f32 %f380, %f379; mov.f32 %f381, %f379; @%p26 bra BB0_35; BB0_23: mul.f32 %f180, %f8, %f8; fma.rn.f32 %f181, %f7, %f7, %f180; fma.rn.f32 %f29, %f9, %f9, %f181; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f182, %f26, %f26; fma.rn.f32 %f183, %f25, %f25, %f182; fma.rn.f32 %f30, %f27, %f27, %f183; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_25; mul.f32 %f184, %f14, %f14; fma.rn.f32 %f185, %f13, %f13, %f184; fma.rn.f32 %f186, %f15, %f15, %f185; setp.neu.f32 %p32, %f186, 0f00000000; @%p32 bra BB0_34; bra.uni BB0_25; BB0_34: sub.f32 %f215, %f19, %f13; mul.f32 %f379, %f215, 0f3F000000; sub.f32 %f216, %f20, %f14; mul.f32 %f380, %f216, 0f3F000000; sub.f32 %f217, %f21, %f15; mul.f32 %f381, %f217, 0f3F000000; bra.uni BB0_35; BB0_25: or.pred %p34, %p25, %p27; @%p34 bra BB0_27; bra.uni BB0_26; BB0_27: mul.f32 %f187, %f14, %f14; fma.rn.f32 %f188, %f13, %f13, %f187; fma.rn.f32 %f34, %f15, %f15, %f188; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, %p35, %p28; @%p37 bra BB0_29; bra.uni BB0_28; BB0_29: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_31; bra.uni BB0_30; 
BB0_31: setp.eq.f32 %p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_33; bra.uni BB0_32; BB0_33: sub.f32 %f206, %f19, %f13; sub.f32 %f207, %f20, %f14; sub.f32 %f208, %f21, %f15; sub.f32 %f209, %f7, %f25; mul.f32 %f210, %f209, 0f3DAAAAAB; sub.f32 %f211, %f8, %f26; mul.f32 %f212, %f211, 0f3DAAAAAB; sub.f32 %f213, %f9, %f27; mul.f32 %f214, %f213, 0f3DAAAAAB; fma.rn.f32 %f379, %f206, 0f3F2AAAAB, %f210; fma.rn.f32 %f380, %f207, 0f3F2AAAAB, %f212; fma.rn.f32 %f381, %f208, 0f3F2AAAAB, %f214; bra.uni BB0_35; BB0_26: sub.f32 %f379, %f1, %f13; sub.f32 %f380, %f2, %f14; sub.f32 %f381, %f3, %f15; bra.uni BB0_35; BB0_28: sub.f32 %f379, %f19, %f1; sub.f32 %f380, %f20, %f2; sub.f32 %f381, %f21, %f3; bra.uni BB0_35; BB0_30: mul.f32 %f189, %f13, 0fC0000000; fma.rn.f32 %f190, %f7, 0f3F000000, %f189; add.f32 %f191, %f14, %f14; mul.f32 %f192, %f8, 0f3F000000; sub.f32 %f193, %f192, %f191; add.f32 %f194, %f15, %f15; mul.f32 %f195, %f9, 0f3F000000; sub.f32 %f196, %f195, %f194; fma.rn.f32 %f379, %f1, 0f3FC00000, %f190; fma.rn.f32 %f380, %f2, 0f3FC00000, %f193; fma.rn.f32 %f381, %f3, 0f3FC00000, %f196; bra.uni BB0_35; BB0_32: mul.f32 %f197, %f25, 0fBF000000; fma.rn.f32 %f198, %f19, 0f40000000, %f197; mul.f32 %f199, %f26, 0fBF000000; fma.rn.f32 %f200, %f20, 0f40000000, %f199; mul.f32 %f201, %f27, 0fBF000000; fma.rn.f32 %f202, %f21, 0f40000000, %f201; mul.f32 %f203, %f1, 0f3FC00000; sub.f32 %f379, %f198, %f203; mul.f32 %f204, %f2, 0f3FC00000; sub.f32 %f380, %f200, %f204; mul.f32 %f205, %f3, 0f3FC00000; sub.f32 %f381, %f202, %f205; BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra BB0_37; rem.s32 %r84, %r23, %r56; add.s32 %r85, %r84, %r56; rem.s32 %r128, %r85, %r56; bra.uni BB0_38; BB0_37: mov.u32 %r86, 0; max.s32 %r128, %r23, %r86; BB0_38: setp.lt.s32 %p45, %r23, 0; mov.f32 %f56, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_40; add.s32 %r87, %r128, %r4; mad.lo.s32 %r88, %r87, %r55, %r1; 
mul.wide.s32 %rd32, %r88, 4; add.s64 %rd33, %rd3, %rd32; ld.global.nc.f32 %f56, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.nc.f32 %f57, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f58, [%rd35]; BB0_40: add.s32 %r27, %r2, -1; @%p43 bra BB0_42; rem.s32 %r89, %r27, %r56; add.s32 %r90, %r89, %r56; rem.s32 %r129, %r90, %r56; bra.uni BB0_43; BB0_42: mov.u32 %r91, 0; max.s32 %r129, %r27, %r91; BB0_43: setp.lt.s32 %p48, %r27, 0; mov.f32 %f62, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_45; add.s32 %r92, %r129, %r4; mad.lo.s32 %r93, %r92, %r55, %r1; mul.wide.s32 %rd36, %r93, 4; add.s64 %rd37, %rd3, %rd36; ld.global.nc.f32 %f62, [%rd37]; add.s64 %rd38, %rd2, %rd36; ld.global.nc.f32 %f63, [%rd38]; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f64, [%rd39]; BB0_45: add.s32 %r31, %r2, 1; @%p43 bra BB0_47; rem.s32 %r94, %r31, %r56; add.s32 %r95, %r94, %r56; rem.s32 %r130, %r95, %r56; bra.uni BB0_48; BB0_47: add.s32 %r96, %r56, -1; min.s32 %r130, %r31, %r96; BB0_48: setp.ge.s32 %p52, %r31, %r56; mov.f32 %f68, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; @%p54 bra BB0_50; add.s32 %r97, %r130, %r4; mad.lo.s32 %r98, %r97, %r55, %r1; mul.wide.s32 %rd40, %r98, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f68, [%rd41]; add.s64 %rd42, %rd2, %rd40; ld.global.nc.f32 %f69, [%rd42]; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f70, [%rd43]; BB0_50: add.s32 %r35, %r2, 2; @%p43 bra BB0_52; rem.s32 %r99, %r35, %r56; add.s32 %r100, %r99, %r56; rem.s32 %r131, %r100, %r56; bra.uni BB0_53; BB0_52: add.s32 %r101, %r56, -1; min.s32 %r131, %r35, %r101; BB0_53: setp.ge.s32 %p56, %r35, %r56; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p58 bra BB0_55; add.s32 %r102, %r131, %r4; mad.lo.s32 %r103, %r102, %r55, %r1; mul.wide.s32 %rd44, %r103, 4; add.s64 %rd45, %rd3, %rd44; ld.global.nc.f32 %f74, [%rd45]; add.s64 %rd46, %rd2, %rd44; ld.global.nc.f32 %f75, 
[%rd46]; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f76, [%rd47]; BB0_55: mul.f32 %f230, %f69, %f69; fma.rn.f32 %f231, %f68, %f68, %f230; fma.rn.f32 %f77, %f70, %f70, %f231; setp.neu.f32 %p59, %f77, 0f00000000; @%p59 bra BB0_57; mul.f32 %f235, %f63, %f63; fma.rn.f32 %f236, %f62, %f62, %f235; fma.rn.f32 %f237, %f64, %f64, %f236; setp.eq.f32 %p60, %f237, 0f00000000; mov.f32 %f394, 0f00000000; mov.f32 %f395, %f394; mov.f32 %f396, %f394; @%p60 bra BB0_69; BB0_57: mul.f32 %f238, %f57, %f57; fma.rn.f32 %f239, %f56, %f56, %f238; fma.rn.f32 %f78, %f58, %f58, %f239; setp.neu.f32 %p61, %f78, 0f00000000; mul.f32 %f240, %f75, %f75; fma.rn.f32 %f241, %f74, %f74, %f240; fma.rn.f32 %f79, %f76, %f76, %f241; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, %f77, 0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_59; mul.f32 %f242, %f63, %f63; fma.rn.f32 %f243, %f62, %f62, %f242; fma.rn.f32 %f244, %f64, %f64, %f243; setp.neu.f32 %p66, %f244, 0f00000000; @%p66 bra BB0_68; bra.uni BB0_59; BB0_68: sub.f32 %f273, %f68, %f62; mul.f32 %f394, %f273, 0f3F000000; sub.f32 %f274, %f69, %f63; mul.f32 %f395, %f274, 0f3F000000; sub.f32 %f275, %f70, %f64; mul.f32 %f396, %f275, 0f3F000000; bra.uni BB0_69; BB0_59: or.pred %p68, %p59, %p61; @%p68 bra BB0_61; bra.uni BB0_60; BB0_61: mul.f32 %f245, %f63, %f63; fma.rn.f32 %f246, %f62, %f62, %f245; fma.rn.f32 %f83, %f64, %f64, %f246; setp.neu.f32 %p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_63; bra.uni BB0_62; BB0_63: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_65; bra.uni BB0_64; BB0_65: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_67; bra.uni BB0_66; BB0_67: sub.f32 %f264, %f68, %f62; sub.f32 %f265, %f69, %f63; sub.f32 %f266, %f70, %f64; sub.f32 %f267, %f56, %f74; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f57, %f75; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f58, %f76; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f394, 
%f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f395, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f396, %f266, 0f3F2AAAAB, %f272; bra.uni BB0_69; BB0_60: sub.f32 %f394, %f1, %f62; sub.f32 %f395, %f2, %f63; sub.f32 %f396, %f3, %f64; bra.uni BB0_69; BB0_62: sub.f32 %f394, %f68, %f1; sub.f32 %f395, %f69, %f2; sub.f32 %f396, %f70, %f3; bra.uni BB0_69; BB0_64: mul.f32 %f247, %f62, 0fC0000000; fma.rn.f32 %f248, %f56, 0f3F000000, %f247; add.f32 %f249, %f63, %f63; mul.f32 %f250, %f57, 0f3F000000; sub.f32 %f251, %f250, %f249; add.f32 %f252, %f64, %f64; mul.f32 %f253, %f58, 0f3F000000; sub.f32 %f254, %f253, %f252; fma.rn.f32 %f394, %f1, 0f3FC00000, %f248; fma.rn.f32 %f395, %f2, 0f3FC00000, %f251; fma.rn.f32 %f396, %f3, 0f3FC00000, %f254; bra.uni BB0_69; BB0_66: mul.f32 %f255, %f74, 0fBF000000; fma.rn.f32 %f256, %f68, 0f40000000, %f255; mul.f32 %f257, %f75, 0fBF000000; fma.rn.f32 %f258, %f69, 0f40000000, %f257; mul.f32 %f259, %f76, 0fBF000000; fma.rn.f32 %f260, %f70, 0f40000000, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f394, %f256, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f395, %f258, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f396, %f260, %f263; BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p77, %rs3, 0; add.s32 %r39, %r3, -2; @%p77 bra BB0_71; rem.s32 %r104, %r39, %r57; add.s32 %r105, %r104, %r57; rem.s32 %r132, %r105, %r57; bra.uni BB0_72; BB0_71: mov.u32 %r106, 0; max.s32 %r132, %r39, %r106; BB0_72: setp.lt.s32 %p79, %r39, 0; mov.f32 %f105, 0f00000000; and.pred %p80, %p79, %p77; mov.f32 %f106, %f105; mov.f32 %f107, %f105; @%p80 bra BB0_74; mad.lo.s32 %r107, %r132, %r56, %r2; mad.lo.s32 %r108, %r107, %r55, %r1; mul.wide.s32 %rd48, %r108, 4; add.s64 %rd49, %rd3, %rd48; ld.global.nc.f32 %f105, [%rd49]; add.s64 %rd50, %rd2, %rd48; ld.global.nc.f32 %f106, [%rd50]; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f107, [%rd51]; BB0_74: add.s32 %r43, %r3, -1; @%p77 bra BB0_76; rem.s32 %r109, %r43, %r57; add.s32 %r110, %r109, %r57; rem.s32 %r133, %r110, %r57; bra.uni BB0_77; 
BB0_76: mov.u32 %r111, 0; max.s32 %r133, %r43, %r111; BB0_77: setp.lt.s32 %p82, %r43, 0; mov.f32 %f111, 0f00000000; and.pred %p84, %p82, %p77; mov.f32 %f112, %f111; mov.f32 %f113, %f111; @%p84 bra BB0_79; mad.lo.s32 %r112, %r133, %r56, %r2; mad.lo.s32 %r113, %r112, %r55, %r1; mul.wide.s32 %rd52, %r113, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f111, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f112, [%rd54]; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f113, [%rd55]; BB0_79: add.s32 %r47, %r3, 1; @%p77 bra BB0_81; rem.s32 %r114, %r47, %r57; add.s32 %r115, %r114, %r57; rem.s32 %r134, %r115, %r57; bra.uni BB0_82; BB0_81: add.s32 %r116, %r57, -1; min.s32 %r134, %r47, %r116; BB0_82: setp.ge.s32 %p86, %r47, %r57; mov.f32 %f117, 0f00000000; and.pred %p88, %p86, %p77; mov.f32 %f118, %f117; mov.f32 %f119, %f117; @%p88 bra BB0_84; mad.lo.s32 %r117, %r134, %r56, %r2; mad.lo.s32 %r118, %r117, %r55, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f117, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 %f118, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f119, [%rd59]; BB0_84: add.s32 %r51, %r3, 2; @%p77 bra BB0_86; rem.s32 %r119, %r51, %r57; add.s32 %r120, %r119, %r57; rem.s32 %r135, %r120, %r57; bra.uni BB0_87; BB0_86: add.s32 %r121, %r57, -1; min.s32 %r135, %r51, %r121; BB0_87: setp.ge.s32 %p90, %r51, %r57; mov.f32 %f123, 0f00000000; and.pred %p92, %p90, %p77; mov.f32 %f124, %f123; mov.f32 %f125, %f123; @%p92 bra BB0_89; mad.lo.s32 %r122, %r135, %r56, %r2; mad.lo.s32 %r123, %r122, %r55, %r1; mul.wide.s32 %rd60, %r123, 4; add.s64 %rd61, %rd3, %rd60; ld.global.nc.f32 %f123, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f124, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f125, [%rd63]; BB0_89: mul.f32 %f288, %f118, %f118; fma.rn.f32 %f289, %f117, %f117, %f288; fma.rn.f32 %f126, %f119, %f119, %f289; setp.neu.f32 %p93, %f126, 0f00000000; @%p93 bra BB0_91; mul.f32 %f293, %f112, %f112; fma.rn.f32 
%f294, %f111, %f111, %f293; fma.rn.f32 %f295, %f113, %f113, %f294; setp.eq.f32 %p94, %f295, 0f00000000; mov.f32 %f409, 0f00000000; mov.f32 %f410, %f409; mov.f32 %f411, %f409; @%p94 bra BB0_103; BB0_91: mul.f32 %f296, %f106, %f106; fma.rn.f32 %f297, %f105, %f105, %f296; fma.rn.f32 %f127, %f107, %f107, %f297; setp.neu.f32 %p95, %f127, 0f00000000; mul.f32 %f298, %f124, %f124; fma.rn.f32 %f299, %f123, %f123, %f298; fma.rn.f32 %f128, %f125, %f125, %f299; setp.neu.f32 %p96, %f128, 0f00000000; and.pred %p97, %p95, %p96; setp.eq.f32 %p98, %f126, 0f00000000; or.pred %p99, %p97, %p98; @%p99 bra BB0_93; mul.f32 %f300, %f112, %f112; fma.rn.f32 %f301, %f111, %f111, %f300; fma.rn.f32 %f302, %f113, %f113, %f301; setp.neu.f32 %p100, %f302, 0f00000000; @%p100 bra BB0_102; bra.uni BB0_93; BB0_102: sub.f32 %f331, %f117, %f111; mul.f32 %f409, %f331, 0f3F000000; sub.f32 %f332, %f118, %f112; mul.f32 %f410, %f332, 0f3F000000; sub.f32 %f333, %f119, %f113; mul.f32 %f411, %f333, 0f3F000000; bra.uni BB0_103; BB0_93: or.pred %p102, %p93, %p95; @%p102 bra BB0_95; bra.uni BB0_94; BB0_95: mul.f32 %f303, %f112, %f112; fma.rn.f32 %f304, %f111, %f111, %f303; fma.rn.f32 %f132, %f113, %f113, %f304; setp.neu.f32 %p103, %f132, 0f00000000; or.pred %p105, %p103, %p96; @%p105 bra BB0_97; bra.uni BB0_96; BB0_97: setp.eq.f32 %p106, %f127, 0f00000000; or.pred %p107, %p106, %p93; @%p107 bra BB0_99; bra.uni BB0_98; BB0_99: setp.eq.f32 %p109, %f128, 0f00000000; or.pred %p110, %p109, %p103; @%p110 bra BB0_101; bra.uni BB0_100; BB0_101: sub.f32 %f322, %f117, %f111; sub.f32 %f323, %f118, %f112; sub.f32 %f324, %f119, %f113; sub.f32 %f325, %f105, %f123; mul.f32 %f326, %f325, 0f3DAAAAAB; sub.f32 %f327, %f106, %f124; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f107, %f125; mul.f32 %f330, %f329, 0f3DAAAAAB; fma.rn.f32 %f409, %f322, 0f3F2AAAAB, %f326; fma.rn.f32 %f410, %f323, 0f3F2AAAAB, %f328; fma.rn.f32 %f411, %f324, 0f3F2AAAAB, %f330; bra.uni BB0_103; BB0_94: sub.f32 %f409, %f1, %f111; sub.f32 %f410, %f2, 
%f112; sub.f32 %f411, %f3, %f113; bra.uni BB0_103; BB0_96: sub.f32 %f409, %f117, %f1; sub.f32 %f410, %f118, %f2; sub.f32 %f411, %f119, %f3; bra.uni BB0_103; BB0_98: mul.f32 %f305, %f111, 0fC0000000; fma.rn.f32 %f306, %f105, 0f3F000000, %f305; add.f32 %f307, %f112, %f112; mul.f32 %f308, %f106, 0f3F000000; sub.f32 %f309, %f308, %f307; add.f32 %f310, %f113, %f113; mul.f32 %f311, %f107, 0f3F000000; sub.f32 %f312, %f311, %f310; fma.rn.f32 %f409, %f1, 0f3FC00000, %f306; fma.rn.f32 %f410, %f2, 0f3FC00000, %f309; fma.rn.f32 %f411, %f3, 0f3FC00000, %f312; bra.uni BB0_103; BB0_100: mul.f32 %f313, %f123, 0fBF000000; fma.rn.f32 %f314, %f117, 0f40000000, %f313; mul.f32 %f315, %f124, 0fBF000000; fma.rn.f32 %f316, %f118, 0f40000000, %f315; mul.f32 %f317, %f125, 0fBF000000; fma.rn.f32 %f318, %f119, 0f40000000, %f317; mul.f32 %f319, %f1, 0f3FC00000; sub.f32 %f409, %f314, %f319; mul.f32 %f320, %f2, 0f3FC00000; sub.f32 %f410, %f316, %f320; mul.f32 %f321, %f3, 0f3FC00000; sub.f32 %f411, %f318, %f321; BB0_103: setp.eq.s64 %p111, %rd7, 0; @%p111 bra BB0_105; cvta.to.global.u64 %rd64, %rd7; add.s64 %rd66, %rd64, %rd12; ld.global.nc.f32 %f334, [%rd66]; mul.f32 %f412, %f334, %f412; BB0_105: setp.eq.s64 %p112, %rd8, 0; @%p112 bra BB0_107; cvta.to.global.u64 %rd67, %rd8; add.s64 %rd69, %rd67, %rd12; ld.global.nc.f32 %f335, [%rd69]; mul.f32 %f413, %f335, %f413; BB0_107: cvta.to.global.u64 %rd70, %rd6; cvta.to.global.u64 %rd71, %rd5; cvta.to.global.u64 %rd72, %rd4; mul.f32 %f336, %f379, %f157; mul.f32 %f337, %f395, %f158; mul.f32 %f338, %f411, %f159; add.f32 %f339, %f412, %f412; mul.f32 %f340, %f1, %f339; add.f32 %f341, %f337, %f338; mul.f32 %f342, %f1, %f341; mul.f32 %f343, %f394, %f158; fma.rn.f32 %f344, %f2, %f343, %f342; mul.f32 %f345, %f409, %f159; fma.rn.f32 %f346, %f3, %f345, %f344; mul.f32 %f347, %f346, %f413; fma.rn.f32 %f348, %f336, %f340, %f347; add.s64 %rd74, %rd72, %rd12; st.global.f32 [%rd74], %f348; mul.f32 %f349, %f2, %f339; add.f32 %f350, %f336, %f338; mul.f32 %f351, %f2, 
%f350; mul.f32 %f352, %f380, %f157; fma.rn.f32 %f353, %f1, %f352, %f351; mul.f32 %f354, %f410, %f159; fma.rn.f32 %f355, %f3, %f354, %f353; mul.f32 %f356, %f355, %f413; fma.rn.f32 %f357, %f337, %f349, %f356; add.s64 %rd75, %rd71, %rd12; st.global.f32 [%rd75], %f357; mul.f32 %f358, %f3, %f339; mul.f32 %f359, %f396, %f158; mul.f32 %f360, %f2, %f359; mul.f32 %f361, %f381, %f157; fma.rn.f32 %f362, %f1, %f361, %f360; add.f32 %f363, %f336, %f337; fma.rn.f32 %f364, %f3, %f363, %f362; mul.f32 %f365, %f364, %f413; fma.rn.f32 %f366, %f338, %f358, %f365; add.s64 %rd76, %rd70, %rd12; st.global.f32 [%rd76], %f366; BB0_108: ret; } ` getmagnetoelasticforce_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<113>; .reg .b16 %rs<19>; .reg .f32 %f<414>; .reg .b32 %r<136>; .reg .b64 %rd<77>; ld.param.u64 %rd4, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_2]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_3]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd7, 
[getmagnetoelasticforce_param_6]; ld.param.f32 %f412, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd8, [getmagnetoelasticforce_param_8]; ld.param.f32 %f413, [getmagnetoelasticforce_param_9]; ld.param.f32 %f157, [getmagnetoelasticforce_param_10]; ld.param.f32 %f158, [getmagnetoelasticforce_param_11]; ld.param.f32 %f159, [getmagnetoelasticforce_param_12]; ld.param.u32 %r55, [getmagnetoelasticforce_param_13]; ld.param.u32 %r56, [getmagnetoelasticforce_param_14]; ld.param.u32 %r57, [getmagnetoelasticforce_param_15]; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; cvta.to.global.u64 %rd1, %rd11; cvta.to.global.u64 %rd2, %rd10; cvta.to.global.u64 %rd3, %rd9; mov.u32 %r58, %ntid.x; mov.u32 %r59, %ctaid.x; mov.u32 %r60, %tid.x; mad.lo.s32 %r1, %r58, %r59, %r60; mov.u32 %r61, %ntid.y; mov.u32 %r62, %ctaid.y; mov.u32 %r63, %tid.y; mad.lo.s32 %r2, %r61, %r62, %r63; mov.u32 %r64, %ntid.z; mov.u32 %r65, %ctaid.z; mov.u32 %r66, %tid.z; mad.lo.s32 %r3, %r64, %r65, %r66; setp.ge.s32 %p4, %r2, %r56; setp.ge.s32 %p5, %r1, %r55; or.pred %p6, %p4, %p5; setp.ge.s32 %p7, %r3, %r57; or.pred %p8, %p6, %p7; @%p8 bra BB0_108; mul.lo.s32 %r4, %r3, %r56; add.s32 %r67, %r4, %r2; mul.lo.s32 %r5, %r67, %r55; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd12, %r6, 4; add.s64 %rd13, %rd3, %rd12; ld.global.nc.f32 %f1, [%rd13]; add.s64 %rd14, %rd2, %rd12; ld.global.nc.f32 %f2, [%rd14]; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f3, [%rd15]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r7, %r1, -2; @%p9 bra BB0_3; rem.s32 %r68, %r7, %r55; add.s32 %r69, %r68, %r55; rem.s32 %r124, %r69, %r55; bra.uni BB0_4; BB0_3: mov.u32 %r70, 0; max.s32 %r124, %r7, %r70; BB0_4: setp.lt.s32 %p11, %r7, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p12 bra BB0_6; add.s32 %r71, %r124, %r5; mul.wide.s32 %rd16, %r71, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f7, [%rd17]; add.s64 %rd18, %rd2, %rd16; ld.global.nc.f32 %f8, [%rd18]; add.s64 %rd19, %rd1, 
%rd16; ld.global.nc.f32 %f9, [%rd19]; BB0_6: add.s32 %r11, %r1, -1; @%p9 bra BB0_8; rem.s32 %r72, %r11, %r55; add.s32 %r73, %r72, %r55; rem.s32 %r125, %r73, %r55; bra.uni BB0_9; BB0_8: mov.u32 %r74, 0; max.s32 %r125, %r11, %r74; BB0_9: setp.lt.s32 %p14, %r11, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_11; add.s32 %r75, %r125, %r5; mul.wide.s32 %rd20, %r75, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f13, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f14, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f15, [%rd23]; BB0_11: add.s32 %r15, %r1, 1; @%p9 bra BB0_13; rem.s32 %r76, %r15, %r55; add.s32 %r77, %r76, %r55; rem.s32 %r126, %r77, %r55; bra.uni BB0_14; BB0_13: add.s32 %r78, %r55, -1; min.s32 %r126, %r15, %r78; BB0_14: setp.ge.s32 %p18, %r15, %r55; mov.f32 %f19, 0f00000000; and.pred %p20, %p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_16; add.s32 %r79, %r126, %r5; mul.wide.s32 %rd24, %r79, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f19, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.nc.f32 %f20, [%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f21, [%rd27]; BB0_16: add.s32 %r19, %r1, 2; @%p9 bra BB0_18; rem.s32 %r80, %r19, %r55; add.s32 %r81, %r80, %r55; rem.s32 %r127, %r81, %r55; bra.uni BB0_19; BB0_18: add.s32 %r82, %r55, -1; min.s32 %r127, %r19, %r82; BB0_19: setp.ge.s32 %p22, %r19, %r55; mov.f32 %f25, 0f00000000; and.pred %p24, %p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_21; add.s32 %r83, %r127, %r5; mul.wide.s32 %rd28, %r83, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f25, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f26, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f27, [%rd31]; BB0_21: mul.f32 %f172, %f20, %f20; fma.rn.f32 %f173, %f19, %f19, %f172; fma.rn.f32 %f28, %f21, %f21, %f173; setp.neu.f32 %p25, %f28, 0f00000000; @%p25 bra BB0_23; mul.f32 %f177, %f14, %f14; fma.rn.f32 %f178, %f13, %f13, 
%f177; fma.rn.f32 %f179, %f15, %f15, %f178; setp.eq.f32 %p26, %f179, 0f00000000; mov.f32 %f379, 0f00000000; mov.f32 %f380, %f379; mov.f32 %f381, %f379; @%p26 bra BB0_35; BB0_23: mul.f32 %f180, %f8, %f8; fma.rn.f32 %f181, %f7, %f7, %f180; fma.rn.f32 %f29, %f9, %f9, %f181; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f182, %f26, %f26; fma.rn.f32 %f183, %f25, %f25, %f182; fma.rn.f32 %f30, %f27, %f27, %f183; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_25; mul.f32 %f184, %f14, %f14; fma.rn.f32 %f185, %f13, %f13, %f184; fma.rn.f32 %f186, %f15, %f15, %f185; setp.neu.f32 %p32, %f186, 0f00000000; @%p32 bra BB0_34; bra.uni BB0_25; BB0_34: sub.f32 %f215, %f19, %f13; mul.f32 %f379, %f215, 0f3F000000; sub.f32 %f216, %f20, %f14; mul.f32 %f380, %f216, 0f3F000000; sub.f32 %f217, %f21, %f15; mul.f32 %f381, %f217, 0f3F000000; bra.uni BB0_35; BB0_25: or.pred %p34, %p25, %p27; @%p34 bra BB0_27; bra.uni BB0_26; BB0_27: mul.f32 %f187, %f14, %f14; fma.rn.f32 %f188, %f13, %f13, %f187; fma.rn.f32 %f34, %f15, %f15, %f188; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, %p35, %p28; @%p37 bra BB0_29; bra.uni BB0_28; BB0_29: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_31; bra.uni BB0_30; BB0_31: setp.eq.f32 %p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_33; bra.uni BB0_32; BB0_33: sub.f32 %f206, %f19, %f13; sub.f32 %f207, %f20, %f14; sub.f32 %f208, %f21, %f15; sub.f32 %f209, %f7, %f25; mul.f32 %f210, %f209, 0f3DAAAAAB; sub.f32 %f211, %f8, %f26; mul.f32 %f212, %f211, 0f3DAAAAAB; sub.f32 %f213, %f9, %f27; mul.f32 %f214, %f213, 0f3DAAAAAB; fma.rn.f32 %f379, %f206, 0f3F2AAAAB, %f210; fma.rn.f32 %f380, %f207, 0f3F2AAAAB, %f212; fma.rn.f32 %f381, %f208, 0f3F2AAAAB, %f214; bra.uni BB0_35; BB0_26: sub.f32 %f379, %f1, %f13; sub.f32 %f380, %f2, %f14; sub.f32 %f381, %f3, %f15; bra.uni BB0_35; BB0_28: sub.f32 %f379, %f19, %f1; sub.f32 %f380, %f20, %f2; 
sub.f32 %f381, %f21, %f3; bra.uni BB0_35; BB0_30: mul.f32 %f189, %f13, 0fC0000000; fma.rn.f32 %f190, %f7, 0f3F000000, %f189; add.f32 %f191, %f14, %f14; mul.f32 %f192, %f8, 0f3F000000; sub.f32 %f193, %f192, %f191; add.f32 %f194, %f15, %f15; mul.f32 %f195, %f9, 0f3F000000; sub.f32 %f196, %f195, %f194; fma.rn.f32 %f379, %f1, 0f3FC00000, %f190; fma.rn.f32 %f380, %f2, 0f3FC00000, %f193; fma.rn.f32 %f381, %f3, 0f3FC00000, %f196; bra.uni BB0_35; BB0_32: mul.f32 %f197, %f25, 0fBF000000; fma.rn.f32 %f198, %f19, 0f40000000, %f197; mul.f32 %f199, %f26, 0fBF000000; fma.rn.f32 %f200, %f20, 0f40000000, %f199; mul.f32 %f201, %f27, 0fBF000000; fma.rn.f32 %f202, %f21, 0f40000000, %f201; mul.f32 %f203, %f1, 0f3FC00000; sub.f32 %f379, %f198, %f203; mul.f32 %f204, %f2, 0f3FC00000; sub.f32 %f380, %f200, %f204; mul.f32 %f205, %f3, 0f3FC00000; sub.f32 %f381, %f202, %f205; BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra BB0_37; rem.s32 %r84, %r23, %r56; add.s32 %r85, %r84, %r56; rem.s32 %r128, %r85, %r56; bra.uni BB0_38; BB0_37: mov.u32 %r86, 0; max.s32 %r128, %r23, %r86; BB0_38: setp.lt.s32 %p45, %r23, 0; mov.f32 %f56, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_40; add.s32 %r87, %r128, %r4; mad.lo.s32 %r88, %r87, %r55, %r1; mul.wide.s32 %rd32, %r88, 4; add.s64 %rd33, %rd3, %rd32; ld.global.nc.f32 %f56, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.nc.f32 %f57, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f58, [%rd35]; BB0_40: add.s32 %r27, %r2, -1; @%p43 bra BB0_42; rem.s32 %r89, %r27, %r56; add.s32 %r90, %r89, %r56; rem.s32 %r129, %r90, %r56; bra.uni BB0_43; BB0_42: mov.u32 %r91, 0; max.s32 %r129, %r27, %r91; BB0_43: setp.lt.s32 %p48, %r27, 0; mov.f32 %f62, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_45; add.s32 %r92, %r129, %r4; mad.lo.s32 %r93, %r92, %r55, %r1; mul.wide.s32 %rd36, %r93, 4; add.s64 %rd37, %rd3, %rd36; ld.global.nc.f32 
%f62, [%rd37]; add.s64 %rd38, %rd2, %rd36; ld.global.nc.f32 %f63, [%rd38]; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f64, [%rd39]; BB0_45: add.s32 %r31, %r2, 1; @%p43 bra BB0_47; rem.s32 %r94, %r31, %r56; add.s32 %r95, %r94, %r56; rem.s32 %r130, %r95, %r56; bra.uni BB0_48; BB0_47: add.s32 %r96, %r56, -1; min.s32 %r130, %r31, %r96; BB0_48: setp.ge.s32 %p52, %r31, %r56; mov.f32 %f68, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; @%p54 bra BB0_50; add.s32 %r97, %r130, %r4; mad.lo.s32 %r98, %r97, %r55, %r1; mul.wide.s32 %rd40, %r98, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f68, [%rd41]; add.s64 %rd42, %rd2, %rd40; ld.global.nc.f32 %f69, [%rd42]; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f70, [%rd43]; BB0_50: add.s32 %r35, %r2, 2; @%p43 bra BB0_52; rem.s32 %r99, %r35, %r56; add.s32 %r100, %r99, %r56; rem.s32 %r131, %r100, %r56; bra.uni BB0_53; BB0_52: add.s32 %r101, %r56, -1; min.s32 %r131, %r35, %r101; BB0_53: setp.ge.s32 %p56, %r35, %r56; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p58 bra BB0_55; add.s32 %r102, %r131, %r4; mad.lo.s32 %r103, %r102, %r55, %r1; mul.wide.s32 %rd44, %r103, 4; add.s64 %rd45, %rd3, %rd44; ld.global.nc.f32 %f74, [%rd45]; add.s64 %rd46, %rd2, %rd44; ld.global.nc.f32 %f75, [%rd46]; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f76, [%rd47]; BB0_55: mul.f32 %f230, %f69, %f69; fma.rn.f32 %f231, %f68, %f68, %f230; fma.rn.f32 %f77, %f70, %f70, %f231; setp.neu.f32 %p59, %f77, 0f00000000; @%p59 bra BB0_57; mul.f32 %f235, %f63, %f63; fma.rn.f32 %f236, %f62, %f62, %f235; fma.rn.f32 %f237, %f64, %f64, %f236; setp.eq.f32 %p60, %f237, 0f00000000; mov.f32 %f394, 0f00000000; mov.f32 %f395, %f394; mov.f32 %f396, %f394; @%p60 bra BB0_69; BB0_57: mul.f32 %f238, %f57, %f57; fma.rn.f32 %f239, %f56, %f56, %f238; fma.rn.f32 %f78, %f58, %f58, %f239; setp.neu.f32 %p61, %f78, 0f00000000; mul.f32 %f240, %f75, %f75; fma.rn.f32 %f241, %f74, %f74, %f240; fma.rn.f32 %f79, 
%f76, %f76, %f241; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, %f77, 0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_59; mul.f32 %f242, %f63, %f63; fma.rn.f32 %f243, %f62, %f62, %f242; fma.rn.f32 %f244, %f64, %f64, %f243; setp.neu.f32 %p66, %f244, 0f00000000; @%p66 bra BB0_68; bra.uni BB0_59; BB0_68: sub.f32 %f273, %f68, %f62; mul.f32 %f394, %f273, 0f3F000000; sub.f32 %f274, %f69, %f63; mul.f32 %f395, %f274, 0f3F000000; sub.f32 %f275, %f70, %f64; mul.f32 %f396, %f275, 0f3F000000; bra.uni BB0_69; BB0_59: or.pred %p68, %p59, %p61; @%p68 bra BB0_61; bra.uni BB0_60; BB0_61: mul.f32 %f245, %f63, %f63; fma.rn.f32 %f246, %f62, %f62, %f245; fma.rn.f32 %f83, %f64, %f64, %f246; setp.neu.f32 %p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_63; bra.uni BB0_62; BB0_63: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_65; bra.uni BB0_64; BB0_65: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_67; bra.uni BB0_66; BB0_67: sub.f32 %f264, %f68, %f62; sub.f32 %f265, %f69, %f63; sub.f32 %f266, %f70, %f64; sub.f32 %f267, %f56, %f74; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f57, %f75; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f58, %f76; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f394, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f395, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f396, %f266, 0f3F2AAAAB, %f272; bra.uni BB0_69; BB0_60: sub.f32 %f394, %f1, %f62; sub.f32 %f395, %f2, %f63; sub.f32 %f396, %f3, %f64; bra.uni BB0_69; BB0_62: sub.f32 %f394, %f68, %f1; sub.f32 %f395, %f69, %f2; sub.f32 %f396, %f70, %f3; bra.uni BB0_69; BB0_64: mul.f32 %f247, %f62, 0fC0000000; fma.rn.f32 %f248, %f56, 0f3F000000, %f247; add.f32 %f249, %f63, %f63; mul.f32 %f250, %f57, 0f3F000000; sub.f32 %f251, %f250, %f249; add.f32 %f252, %f64, %f64; mul.f32 %f253, %f58, 0f3F000000; sub.f32 %f254, %f253, %f252; fma.rn.f32 %f394, %f1, 0f3FC00000, %f248; fma.rn.f32 %f395, %f2, 0f3FC00000, %f251; 
fma.rn.f32 %f396, %f3, 0f3FC00000, %f254; bra.uni BB0_69; BB0_66: mul.f32 %f255, %f74, 0fBF000000; fma.rn.f32 %f256, %f68, 0f40000000, %f255; mul.f32 %f257, %f75, 0fBF000000; fma.rn.f32 %f258, %f69, 0f40000000, %f257; mul.f32 %f259, %f76, 0fBF000000; fma.rn.f32 %f260, %f70, 0f40000000, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f394, %f256, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f395, %f258, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f396, %f260, %f263; BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p77, %rs3, 0; add.s32 %r39, %r3, -2; @%p77 bra BB0_71; rem.s32 %r104, %r39, %r57; add.s32 %r105, %r104, %r57; rem.s32 %r132, %r105, %r57; bra.uni BB0_72; BB0_71: mov.u32 %r106, 0; max.s32 %r132, %r39, %r106; BB0_72: setp.lt.s32 %p79, %r39, 0; mov.f32 %f105, 0f00000000; and.pred %p80, %p79, %p77; mov.f32 %f106, %f105; mov.f32 %f107, %f105; @%p80 bra BB0_74; mad.lo.s32 %r107, %r132, %r56, %r2; mad.lo.s32 %r108, %r107, %r55, %r1; mul.wide.s32 %rd48, %r108, 4; add.s64 %rd49, %rd3, %rd48; ld.global.nc.f32 %f105, [%rd49]; add.s64 %rd50, %rd2, %rd48; ld.global.nc.f32 %f106, [%rd50]; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f107, [%rd51]; BB0_74: add.s32 %r43, %r3, -1; @%p77 bra BB0_76; rem.s32 %r109, %r43, %r57; add.s32 %r110, %r109, %r57; rem.s32 %r133, %r110, %r57; bra.uni BB0_77; BB0_76: mov.u32 %r111, 0; max.s32 %r133, %r43, %r111; BB0_77: setp.lt.s32 %p82, %r43, 0; mov.f32 %f111, 0f00000000; and.pred %p84, %p82, %p77; mov.f32 %f112, %f111; mov.f32 %f113, %f111; @%p84 bra BB0_79; mad.lo.s32 %r112, %r133, %r56, %r2; mad.lo.s32 %r113, %r112, %r55, %r1; mul.wide.s32 %rd52, %r113, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f111, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f112, [%rd54]; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f113, [%rd55]; BB0_79: add.s32 %r47, %r3, 1; @%p77 bra BB0_81; rem.s32 %r114, %r47, %r57; add.s32 %r115, %r114, %r57; rem.s32 %r134, %r115, %r57; bra.uni BB0_82; BB0_81: add.s32 %r116, %r57, -1; min.s32 %r134, 
%r47, %r116; BB0_82: setp.ge.s32 %p86, %r47, %r57; mov.f32 %f117, 0f00000000; and.pred %p88, %p86, %p77; mov.f32 %f118, %f117; mov.f32 %f119, %f117; @%p88 bra BB0_84; mad.lo.s32 %r117, %r134, %r56, %r2; mad.lo.s32 %r118, %r117, %r55, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f117, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 %f118, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f119, [%rd59]; BB0_84: add.s32 %r51, %r3, 2; @%p77 bra BB0_86; rem.s32 %r119, %r51, %r57; add.s32 %r120, %r119, %r57; rem.s32 %r135, %r120, %r57; bra.uni BB0_87; BB0_86: add.s32 %r121, %r57, -1; min.s32 %r135, %r51, %r121; BB0_87: setp.ge.s32 %p90, %r51, %r57; mov.f32 %f123, 0f00000000; and.pred %p92, %p90, %p77; mov.f32 %f124, %f123; mov.f32 %f125, %f123; @%p92 bra BB0_89; mad.lo.s32 %r122, %r135, %r56, %r2; mad.lo.s32 %r123, %r122, %r55, %r1; mul.wide.s32 %rd60, %r123, 4; add.s64 %rd61, %rd3, %rd60; ld.global.nc.f32 %f123, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f124, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f125, [%rd63]; BB0_89: mul.f32 %f288, %f118, %f118; fma.rn.f32 %f289, %f117, %f117, %f288; fma.rn.f32 %f126, %f119, %f119, %f289; setp.neu.f32 %p93, %f126, 0f00000000; @%p93 bra BB0_91; mul.f32 %f293, %f112, %f112; fma.rn.f32 %f294, %f111, %f111, %f293; fma.rn.f32 %f295, %f113, %f113, %f294; setp.eq.f32 %p94, %f295, 0f00000000; mov.f32 %f409, 0f00000000; mov.f32 %f410, %f409; mov.f32 %f411, %f409; @%p94 bra BB0_103; BB0_91: mul.f32 %f296, %f106, %f106; fma.rn.f32 %f297, %f105, %f105, %f296; fma.rn.f32 %f127, %f107, %f107, %f297; setp.neu.f32 %p95, %f127, 0f00000000; mul.f32 %f298, %f124, %f124; fma.rn.f32 %f299, %f123, %f123, %f298; fma.rn.f32 %f128, %f125, %f125, %f299; setp.neu.f32 %p96, %f128, 0f00000000; and.pred %p97, %p95, %p96; setp.eq.f32 %p98, %f126, 0f00000000; or.pred %p99, %p97, %p98; @%p99 bra BB0_93; mul.f32 %f300, %f112, %f112; fma.rn.f32 %f301, %f111, %f111, %f300; fma.rn.f32 %f302, 
%f113, %f113, %f301; setp.neu.f32 %p100, %f302, 0f00000000; @%p100 bra BB0_102; bra.uni BB0_93; BB0_102: sub.f32 %f331, %f117, %f111; mul.f32 %f409, %f331, 0f3F000000; sub.f32 %f332, %f118, %f112; mul.f32 %f410, %f332, 0f3F000000; sub.f32 %f333, %f119, %f113; mul.f32 %f411, %f333, 0f3F000000; bra.uni BB0_103; BB0_93: or.pred %p102, %p93, %p95; @%p102 bra BB0_95; bra.uni BB0_94; BB0_95: mul.f32 %f303, %f112, %f112; fma.rn.f32 %f304, %f111, %f111, %f303; fma.rn.f32 %f132, %f113, %f113, %f304; setp.neu.f32 %p103, %f132, 0f00000000; or.pred %p105, %p103, %p96; @%p105 bra BB0_97; bra.uni BB0_96; BB0_97: setp.eq.f32 %p106, %f127, 0f00000000; or.pred %p107, %p106, %p93; @%p107 bra BB0_99; bra.uni BB0_98; BB0_99: setp.eq.f32 %p109, %f128, 0f00000000; or.pred %p110, %p109, %p103; @%p110 bra BB0_101; bra.uni BB0_100; BB0_101: sub.f32 %f322, %f117, %f111; sub.f32 %f323, %f118, %f112; sub.f32 %f324, %f119, %f113; sub.f32 %f325, %f105, %f123; mul.f32 %f326, %f325, 0f3DAAAAAB; sub.f32 %f327, %f106, %f124; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f107, %f125; mul.f32 %f330, %f329, 0f3DAAAAAB; fma.rn.f32 %f409, %f322, 0f3F2AAAAB, %f326; fma.rn.f32 %f410, %f323, 0f3F2AAAAB, %f328; fma.rn.f32 %f411, %f324, 0f3F2AAAAB, %f330; bra.uni BB0_103; BB0_94: sub.f32 %f409, %f1, %f111; sub.f32 %f410, %f2, %f112; sub.f32 %f411, %f3, %f113; bra.uni BB0_103; BB0_96: sub.f32 %f409, %f117, %f1; sub.f32 %f410, %f118, %f2; sub.f32 %f411, %f119, %f3; bra.uni BB0_103; BB0_98: mul.f32 %f305, %f111, 0fC0000000; fma.rn.f32 %f306, %f105, 0f3F000000, %f305; add.f32 %f307, %f112, %f112; mul.f32 %f308, %f106, 0f3F000000; sub.f32 %f309, %f308, %f307; add.f32 %f310, %f113, %f113; mul.f32 %f311, %f107, 0f3F000000; sub.f32 %f312, %f311, %f310; fma.rn.f32 %f409, %f1, 0f3FC00000, %f306; fma.rn.f32 %f410, %f2, 0f3FC00000, %f309; fma.rn.f32 %f411, %f3, 0f3FC00000, %f312; bra.uni BB0_103; BB0_100: mul.f32 %f313, %f123, 0fBF000000; fma.rn.f32 %f314, %f117, 0f40000000, %f313; mul.f32 %f315, %f124, 0fBF000000; 
fma.rn.f32 %f316, %f118, 0f40000000, %f315; mul.f32 %f317, %f125, 0fBF000000; fma.rn.f32 %f318, %f119, 0f40000000, %f317; mul.f32 %f319, %f1, 0f3FC00000; sub.f32 %f409, %f314, %f319; mul.f32 %f320, %f2, 0f3FC00000; sub.f32 %f410, %f316, %f320; mul.f32 %f321, %f3, 0f3FC00000; sub.f32 %f411, %f318, %f321; BB0_103: setp.eq.s64 %p111, %rd7, 0; @%p111 bra BB0_105; cvta.to.global.u64 %rd64, %rd7; add.s64 %rd66, %rd64, %rd12; ld.global.nc.f32 %f334, [%rd66]; mul.f32 %f412, %f334, %f412; BB0_105: setp.eq.s64 %p112, %rd8, 0; @%p112 bra BB0_107; cvta.to.global.u64 %rd67, %rd8; add.s64 %rd69, %rd67, %rd12; ld.global.nc.f32 %f335, [%rd69]; mul.f32 %f413, %f335, %f413; BB0_107: cvta.to.global.u64 %rd70, %rd6; cvta.to.global.u64 %rd71, %rd5; cvta.to.global.u64 %rd72, %rd4; mul.f32 %f336, %f379, %f157; mul.f32 %f337, %f395, %f158; mul.f32 %f338, %f411, %f159; add.f32 %f339, %f412, %f412; mul.f32 %f340, %f1, %f339; add.f32 %f341, %f337, %f338; mul.f32 %f342, %f1, %f341; mul.f32 %f343, %f394, %f158; fma.rn.f32 %f344, %f2, %f343, %f342; mul.f32 %f345, %f409, %f159; fma.rn.f32 %f346, %f3, %f345, %f344; mul.f32 %f347, %f346, %f413; fma.rn.f32 %f348, %f336, %f340, %f347; add.s64 %rd74, %rd72, %rd12; st.global.f32 [%rd74], %f348; mul.f32 %f349, %f2, %f339; add.f32 %f350, %f336, %f338; mul.f32 %f351, %f2, %f350; mul.f32 %f352, %f380, %f157; fma.rn.f32 %f353, %f1, %f352, %f351; mul.f32 %f354, %f410, %f159; fma.rn.f32 %f355, %f3, %f354, %f353; mul.f32 %f356, %f355, %f413; fma.rn.f32 %f357, %f337, %f349, %f356; add.s64 %rd75, %rd71, %rd12; st.global.f32 [%rd75], %f357; mul.f32 %f358, %f3, %f339; mul.f32 %f359, %f396, %f158; mul.f32 %f360, %f2, %f359; mul.f32 %f361, %f381, %f157; fma.rn.f32 %f362, %f1, %f361, %f360; add.f32 %f363, %f336, %f337; fma.rn.f32 %f364, %f3, %f363, %f362; mul.f32 %f365, %f364, %f413; fma.rn.f32 %f366, %f338, %f358, %f365; add.s64 %rd76, %rd70, %rd12; st.global.f32 [%rd76], %f366; BB0_108: ret; } ` getmagnetoelasticforce_ptx_70 = ` .version 6.5 .target sm_70 
.address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<113>; .reg .b16 %rs<19>; .reg .f32 %f<414>; .reg .b32 %r<136>; .reg .b64 %rd<77>; ld.param.u64 %rd4, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_2]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_3]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd7, [getmagnetoelasticforce_param_6]; ld.param.f32 %f412, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd8, [getmagnetoelasticforce_param_8]; ld.param.f32 %f413, [getmagnetoelasticforce_param_9]; ld.param.f32 %f157, [getmagnetoelasticforce_param_10]; ld.param.f32 %f158, [getmagnetoelasticforce_param_11]; ld.param.f32 %f159, [getmagnetoelasticforce_param_12]; ld.param.u32 %r55, [getmagnetoelasticforce_param_13]; ld.param.u32 %r56, [getmagnetoelasticforce_param_14]; ld.param.u32 %r57, [getmagnetoelasticforce_param_15]; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; cvta.to.global.u64 %rd1, %rd11; cvta.to.global.u64 %rd2, %rd10; cvta.to.global.u64 %rd3, %rd9; mov.u32 %r58, %ntid.x; mov.u32 %r59, %ctaid.x; 
mov.u32 %r60, %tid.x; mad.lo.s32 %r1, %r58, %r59, %r60; mov.u32 %r61, %ntid.y; mov.u32 %r62, %ctaid.y; mov.u32 %r63, %tid.y; mad.lo.s32 %r2, %r61, %r62, %r63; mov.u32 %r64, %ntid.z; mov.u32 %r65, %ctaid.z; mov.u32 %r66, %tid.z; mad.lo.s32 %r3, %r64, %r65, %r66; setp.ge.s32 %p4, %r2, %r56; setp.ge.s32 %p5, %r1, %r55; or.pred %p6, %p4, %p5; setp.ge.s32 %p7, %r3, %r57; or.pred %p8, %p6, %p7; @%p8 bra BB0_108; mul.lo.s32 %r4, %r3, %r56; add.s32 %r67, %r4, %r2; mul.lo.s32 %r5, %r67, %r55; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd12, %r6, 4; add.s64 %rd13, %rd3, %rd12; ld.global.nc.f32 %f1, [%rd13]; add.s64 %rd14, %rd2, %rd12; ld.global.nc.f32 %f2, [%rd14]; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f3, [%rd15]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r7, %r1, -2; @%p9 bra BB0_3; rem.s32 %r68, %r7, %r55; add.s32 %r69, %r68, %r55; rem.s32 %r124, %r69, %r55; bra.uni BB0_4; BB0_3: mov.u32 %r70, 0; max.s32 %r124, %r7, %r70; BB0_4: setp.lt.s32 %p11, %r7, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p12 bra BB0_6; add.s32 %r71, %r124, %r5; mul.wide.s32 %rd16, %r71, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f7, [%rd17]; add.s64 %rd18, %rd2, %rd16; ld.global.nc.f32 %f8, [%rd18]; add.s64 %rd19, %rd1, %rd16; ld.global.nc.f32 %f9, [%rd19]; BB0_6: add.s32 %r11, %r1, -1; @%p9 bra BB0_8; rem.s32 %r72, %r11, %r55; add.s32 %r73, %r72, %r55; rem.s32 %r125, %r73, %r55; bra.uni BB0_9; BB0_8: mov.u32 %r74, 0; max.s32 %r125, %r11, %r74; BB0_9: setp.lt.s32 %p14, %r11, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_11; add.s32 %r75, %r125, %r5; mul.wide.s32 %rd20, %r75, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f13, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f14, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f15, [%rd23]; BB0_11: add.s32 %r15, %r1, 1; @%p9 bra BB0_13; rem.s32 %r76, %r15, %r55; add.s32 %r77, %r76, %r55; rem.s32 %r126, 
%r77, %r55; bra.uni BB0_14; BB0_13: add.s32 %r78, %r55, -1; min.s32 %r126, %r15, %r78; BB0_14: setp.ge.s32 %p18, %r15, %r55; mov.f32 %f19, 0f00000000; and.pred %p20, %p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_16; add.s32 %r79, %r126, %r5; mul.wide.s32 %rd24, %r79, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f19, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.nc.f32 %f20, [%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f21, [%rd27]; BB0_16: add.s32 %r19, %r1, 2; @%p9 bra BB0_18; rem.s32 %r80, %r19, %r55; add.s32 %r81, %r80, %r55; rem.s32 %r127, %r81, %r55; bra.uni BB0_19; BB0_18: add.s32 %r82, %r55, -1; min.s32 %r127, %r19, %r82; BB0_19: setp.ge.s32 %p22, %r19, %r55; mov.f32 %f25, 0f00000000; and.pred %p24, %p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_21; add.s32 %r83, %r127, %r5; mul.wide.s32 %rd28, %r83, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f25, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f26, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f27, [%rd31]; BB0_21: mul.f32 %f172, %f20, %f20; fma.rn.f32 %f173, %f19, %f19, %f172; fma.rn.f32 %f28, %f21, %f21, %f173; setp.neu.f32 %p25, %f28, 0f00000000; @%p25 bra BB0_23; mul.f32 %f177, %f14, %f14; fma.rn.f32 %f178, %f13, %f13, %f177; fma.rn.f32 %f179, %f15, %f15, %f178; setp.eq.f32 %p26, %f179, 0f00000000; mov.f32 %f379, 0f00000000; mov.f32 %f380, %f379; mov.f32 %f381, %f379; @%p26 bra BB0_35; BB0_23: mul.f32 %f180, %f8, %f8; fma.rn.f32 %f181, %f7, %f7, %f180; fma.rn.f32 %f29, %f9, %f9, %f181; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f182, %f26, %f26; fma.rn.f32 %f183, %f25, %f25, %f182; fma.rn.f32 %f30, %f27, %f27, %f183; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_25; mul.f32 %f184, %f14, %f14; fma.rn.f32 %f185, %f13, %f13, %f184; fma.rn.f32 %f186, %f15, %f15, %f185; setp.neu.f32 %p32, %f186, 0f00000000; @%p32 bra BB0_34; bra.uni BB0_25; 
BB0_34: sub.f32 %f215, %f19, %f13; mul.f32 %f379, %f215, 0f3F000000; sub.f32 %f216, %f20, %f14; mul.f32 %f380, %f216, 0f3F000000; sub.f32 %f217, %f21, %f15; mul.f32 %f381, %f217, 0f3F000000; bra.uni BB0_35; BB0_25: or.pred %p34, %p25, %p27; @%p34 bra BB0_27; bra.uni BB0_26; BB0_27: mul.f32 %f187, %f14, %f14; fma.rn.f32 %f188, %f13, %f13, %f187; fma.rn.f32 %f34, %f15, %f15, %f188; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, %p35, %p28; @%p37 bra BB0_29; bra.uni BB0_28; BB0_29: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_31; bra.uni BB0_30; BB0_31: setp.eq.f32 %p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_33; bra.uni BB0_32; BB0_33: sub.f32 %f206, %f19, %f13; sub.f32 %f207, %f20, %f14; sub.f32 %f208, %f21, %f15; sub.f32 %f209, %f7, %f25; mul.f32 %f210, %f209, 0f3DAAAAAB; sub.f32 %f211, %f8, %f26; mul.f32 %f212, %f211, 0f3DAAAAAB; sub.f32 %f213, %f9, %f27; mul.f32 %f214, %f213, 0f3DAAAAAB; fma.rn.f32 %f379, %f206, 0f3F2AAAAB, %f210; fma.rn.f32 %f380, %f207, 0f3F2AAAAB, %f212; fma.rn.f32 %f381, %f208, 0f3F2AAAAB, %f214; bra.uni BB0_35; BB0_26: sub.f32 %f379, %f1, %f13; sub.f32 %f380, %f2, %f14; sub.f32 %f381, %f3, %f15; bra.uni BB0_35; BB0_28: sub.f32 %f379, %f19, %f1; sub.f32 %f380, %f20, %f2; sub.f32 %f381, %f21, %f3; bra.uni BB0_35; BB0_30: mul.f32 %f189, %f13, 0fC0000000; fma.rn.f32 %f190, %f7, 0f3F000000, %f189; add.f32 %f191, %f14, %f14; mul.f32 %f192, %f8, 0f3F000000; sub.f32 %f193, %f192, %f191; add.f32 %f194, %f15, %f15; mul.f32 %f195, %f9, 0f3F000000; sub.f32 %f196, %f195, %f194; fma.rn.f32 %f379, %f1, 0f3FC00000, %f190; fma.rn.f32 %f380, %f2, 0f3FC00000, %f193; fma.rn.f32 %f381, %f3, 0f3FC00000, %f196; bra.uni BB0_35; BB0_32: mul.f32 %f197, %f25, 0fBF000000; fma.rn.f32 %f198, %f19, 0f40000000, %f197; mul.f32 %f199, %f26, 0fBF000000; fma.rn.f32 %f200, %f20, 0f40000000, %f199; mul.f32 %f201, %f27, 0fBF000000; fma.rn.f32 %f202, %f21, 0f40000000, %f201; mul.f32 %f203, %f1, 0f3FC00000; sub.f32 %f379, %f198, 
%f203; mul.f32 %f204, %f2, 0f3FC00000; sub.f32 %f380, %f200, %f204; mul.f32 %f205, %f3, 0f3FC00000; sub.f32 %f381, %f202, %f205; BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra BB0_37; rem.s32 %r84, %r23, %r56; add.s32 %r85, %r84, %r56; rem.s32 %r128, %r85, %r56; bra.uni BB0_38; BB0_37: mov.u32 %r86, 0; max.s32 %r128, %r23, %r86; BB0_38: setp.lt.s32 %p45, %r23, 0; mov.f32 %f56, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_40; add.s32 %r87, %r128, %r4; mad.lo.s32 %r88, %r87, %r55, %r1; mul.wide.s32 %rd32, %r88, 4; add.s64 %rd33, %rd3, %rd32; ld.global.nc.f32 %f56, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.nc.f32 %f57, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f58, [%rd35]; BB0_40: add.s32 %r27, %r2, -1; @%p43 bra BB0_42; rem.s32 %r89, %r27, %r56; add.s32 %r90, %r89, %r56; rem.s32 %r129, %r90, %r56; bra.uni BB0_43; BB0_42: mov.u32 %r91, 0; max.s32 %r129, %r27, %r91; BB0_43: setp.lt.s32 %p48, %r27, 0; mov.f32 %f62, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_45; add.s32 %r92, %r129, %r4; mad.lo.s32 %r93, %r92, %r55, %r1; mul.wide.s32 %rd36, %r93, 4; add.s64 %rd37, %rd3, %rd36; ld.global.nc.f32 %f62, [%rd37]; add.s64 %rd38, %rd2, %rd36; ld.global.nc.f32 %f63, [%rd38]; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f64, [%rd39]; BB0_45: add.s32 %r31, %r2, 1; @%p43 bra BB0_47; rem.s32 %r94, %r31, %r56; add.s32 %r95, %r94, %r56; rem.s32 %r130, %r95, %r56; bra.uni BB0_48; BB0_47: add.s32 %r96, %r56, -1; min.s32 %r130, %r31, %r96; BB0_48: setp.ge.s32 %p52, %r31, %r56; mov.f32 %f68, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; @%p54 bra BB0_50; add.s32 %r97, %r130, %r4; mad.lo.s32 %r98, %r97, %r55, %r1; mul.wide.s32 %rd40, %r98, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f68, [%rd41]; add.s64 %rd42, %rd2, %rd40; ld.global.nc.f32 %f69, [%rd42]; add.s64 %rd43, %rd1, %rd40; 
ld.global.nc.f32 %f70, [%rd43]; BB0_50: add.s32 %r35, %r2, 2; @%p43 bra BB0_52; rem.s32 %r99, %r35, %r56; add.s32 %r100, %r99, %r56; rem.s32 %r131, %r100, %r56; bra.uni BB0_53; BB0_52: add.s32 %r101, %r56, -1; min.s32 %r131, %r35, %r101; BB0_53: setp.ge.s32 %p56, %r35, %r56; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p58 bra BB0_55; add.s32 %r102, %r131, %r4; mad.lo.s32 %r103, %r102, %r55, %r1; mul.wide.s32 %rd44, %r103, 4; add.s64 %rd45, %rd3, %rd44; ld.global.nc.f32 %f74, [%rd45]; add.s64 %rd46, %rd2, %rd44; ld.global.nc.f32 %f75, [%rd46]; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f76, [%rd47]; BB0_55: mul.f32 %f230, %f69, %f69; fma.rn.f32 %f231, %f68, %f68, %f230; fma.rn.f32 %f77, %f70, %f70, %f231; setp.neu.f32 %p59, %f77, 0f00000000; @%p59 bra BB0_57; mul.f32 %f235, %f63, %f63; fma.rn.f32 %f236, %f62, %f62, %f235; fma.rn.f32 %f237, %f64, %f64, %f236; setp.eq.f32 %p60, %f237, 0f00000000; mov.f32 %f394, 0f00000000; mov.f32 %f395, %f394; mov.f32 %f396, %f394; @%p60 bra BB0_69; BB0_57: mul.f32 %f238, %f57, %f57; fma.rn.f32 %f239, %f56, %f56, %f238; fma.rn.f32 %f78, %f58, %f58, %f239; setp.neu.f32 %p61, %f78, 0f00000000; mul.f32 %f240, %f75, %f75; fma.rn.f32 %f241, %f74, %f74, %f240; fma.rn.f32 %f79, %f76, %f76, %f241; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, %f77, 0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_59; mul.f32 %f242, %f63, %f63; fma.rn.f32 %f243, %f62, %f62, %f242; fma.rn.f32 %f244, %f64, %f64, %f243; setp.neu.f32 %p66, %f244, 0f00000000; @%p66 bra BB0_68; bra.uni BB0_59; BB0_68: sub.f32 %f273, %f68, %f62; mul.f32 %f394, %f273, 0f3F000000; sub.f32 %f274, %f69, %f63; mul.f32 %f395, %f274, 0f3F000000; sub.f32 %f275, %f70, %f64; mul.f32 %f396, %f275, 0f3F000000; bra.uni BB0_69; BB0_59: or.pred %p68, %p59, %p61; @%p68 bra BB0_61; bra.uni BB0_60; BB0_61: mul.f32 %f245, %f63, %f63; fma.rn.f32 %f246, %f62, %f62, %f245; fma.rn.f32 %f83, %f64, %f64, %f246; 
setp.neu.f32 %p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_63; bra.uni BB0_62; BB0_63: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_65; bra.uni BB0_64; BB0_65: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_67; bra.uni BB0_66; BB0_67: sub.f32 %f264, %f68, %f62; sub.f32 %f265, %f69, %f63; sub.f32 %f266, %f70, %f64; sub.f32 %f267, %f56, %f74; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f57, %f75; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f58, %f76; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f394, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f395, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f396, %f266, 0f3F2AAAAB, %f272; bra.uni BB0_69; BB0_60: sub.f32 %f394, %f1, %f62; sub.f32 %f395, %f2, %f63; sub.f32 %f396, %f3, %f64; bra.uni BB0_69; BB0_62: sub.f32 %f394, %f68, %f1; sub.f32 %f395, %f69, %f2; sub.f32 %f396, %f70, %f3; bra.uni BB0_69; BB0_64: mul.f32 %f247, %f62, 0fC0000000; fma.rn.f32 %f248, %f56, 0f3F000000, %f247; add.f32 %f249, %f63, %f63; mul.f32 %f250, %f57, 0f3F000000; sub.f32 %f251, %f250, %f249; add.f32 %f252, %f64, %f64; mul.f32 %f253, %f58, 0f3F000000; sub.f32 %f254, %f253, %f252; fma.rn.f32 %f394, %f1, 0f3FC00000, %f248; fma.rn.f32 %f395, %f2, 0f3FC00000, %f251; fma.rn.f32 %f396, %f3, 0f3FC00000, %f254; bra.uni BB0_69; BB0_66: mul.f32 %f255, %f74, 0fBF000000; fma.rn.f32 %f256, %f68, 0f40000000, %f255; mul.f32 %f257, %f75, 0fBF000000; fma.rn.f32 %f258, %f69, 0f40000000, %f257; mul.f32 %f259, %f76, 0fBF000000; fma.rn.f32 %f260, %f70, 0f40000000, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f394, %f256, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f395, %f258, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f396, %f260, %f263; BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p77, %rs3, 0; add.s32 %r39, %r3, -2; @%p77 bra BB0_71; rem.s32 %r104, %r39, %r57; add.s32 %r105, %r104, %r57; rem.s32 %r132, %r105, %r57; bra.uni BB0_72; BB0_71: mov.u32 %r106, 0; max.s32 %r132, %r39, 
%r106; BB0_72: setp.lt.s32 %p79, %r39, 0; mov.f32 %f105, 0f00000000; and.pred %p80, %p79, %p77; mov.f32 %f106, %f105; mov.f32 %f107, %f105; @%p80 bra BB0_74; mad.lo.s32 %r107, %r132, %r56, %r2; mad.lo.s32 %r108, %r107, %r55, %r1; mul.wide.s32 %rd48, %r108, 4; add.s64 %rd49, %rd3, %rd48; ld.global.nc.f32 %f105, [%rd49]; add.s64 %rd50, %rd2, %rd48; ld.global.nc.f32 %f106, [%rd50]; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f107, [%rd51]; BB0_74: add.s32 %r43, %r3, -1; @%p77 bra BB0_76; rem.s32 %r109, %r43, %r57; add.s32 %r110, %r109, %r57; rem.s32 %r133, %r110, %r57; bra.uni BB0_77; BB0_76: mov.u32 %r111, 0; max.s32 %r133, %r43, %r111; BB0_77: setp.lt.s32 %p82, %r43, 0; mov.f32 %f111, 0f00000000; and.pred %p84, %p82, %p77; mov.f32 %f112, %f111; mov.f32 %f113, %f111; @%p84 bra BB0_79; mad.lo.s32 %r112, %r133, %r56, %r2; mad.lo.s32 %r113, %r112, %r55, %r1; mul.wide.s32 %rd52, %r113, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f111, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f112, [%rd54]; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f113, [%rd55]; BB0_79: add.s32 %r47, %r3, 1; @%p77 bra BB0_81; rem.s32 %r114, %r47, %r57; add.s32 %r115, %r114, %r57; rem.s32 %r134, %r115, %r57; bra.uni BB0_82; BB0_81: add.s32 %r116, %r57, -1; min.s32 %r134, %r47, %r116; BB0_82: setp.ge.s32 %p86, %r47, %r57; mov.f32 %f117, 0f00000000; and.pred %p88, %p86, %p77; mov.f32 %f118, %f117; mov.f32 %f119, %f117; @%p88 bra BB0_84; mad.lo.s32 %r117, %r134, %r56, %r2; mad.lo.s32 %r118, %r117, %r55, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f117, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 %f118, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f119, [%rd59]; BB0_84: add.s32 %r51, %r3, 2; @%p77 bra BB0_86; rem.s32 %r119, %r51, %r57; add.s32 %r120, %r119, %r57; rem.s32 %r135, %r120, %r57; bra.uni BB0_87; BB0_86: add.s32 %r121, %r57, -1; min.s32 %r135, %r51, %r121; BB0_87: setp.ge.s32 %p90, %r51, %r57; mov.f32 %f123, 
0f00000000; and.pred %p92, %p90, %p77; mov.f32 %f124, %f123; mov.f32 %f125, %f123; @%p92 bra BB0_89; mad.lo.s32 %r122, %r135, %r56, %r2; mad.lo.s32 %r123, %r122, %r55, %r1; mul.wide.s32 %rd60, %r123, 4; add.s64 %rd61, %rd3, %rd60; ld.global.nc.f32 %f123, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f124, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f125, [%rd63]; BB0_89: mul.f32 %f288, %f118, %f118; fma.rn.f32 %f289, %f117, %f117, %f288; fma.rn.f32 %f126, %f119, %f119, %f289; setp.neu.f32 %p93, %f126, 0f00000000; @%p93 bra BB0_91; mul.f32 %f293, %f112, %f112; fma.rn.f32 %f294, %f111, %f111, %f293; fma.rn.f32 %f295, %f113, %f113, %f294; setp.eq.f32 %p94, %f295, 0f00000000; mov.f32 %f409, 0f00000000; mov.f32 %f410, %f409; mov.f32 %f411, %f409; @%p94 bra BB0_103; BB0_91: mul.f32 %f296, %f106, %f106; fma.rn.f32 %f297, %f105, %f105, %f296; fma.rn.f32 %f127, %f107, %f107, %f297; setp.neu.f32 %p95, %f127, 0f00000000; mul.f32 %f298, %f124, %f124; fma.rn.f32 %f299, %f123, %f123, %f298; fma.rn.f32 %f128, %f125, %f125, %f299; setp.neu.f32 %p96, %f128, 0f00000000; and.pred %p97, %p95, %p96; setp.eq.f32 %p98, %f126, 0f00000000; or.pred %p99, %p97, %p98; @%p99 bra BB0_93; mul.f32 %f300, %f112, %f112; fma.rn.f32 %f301, %f111, %f111, %f300; fma.rn.f32 %f302, %f113, %f113, %f301; setp.neu.f32 %p100, %f302, 0f00000000; @%p100 bra BB0_102; bra.uni BB0_93; BB0_102: sub.f32 %f331, %f117, %f111; mul.f32 %f409, %f331, 0f3F000000; sub.f32 %f332, %f118, %f112; mul.f32 %f410, %f332, 0f3F000000; sub.f32 %f333, %f119, %f113; mul.f32 %f411, %f333, 0f3F000000; bra.uni BB0_103; BB0_93: or.pred %p102, %p93, %p95; @%p102 bra BB0_95; bra.uni BB0_94; BB0_95: mul.f32 %f303, %f112, %f112; fma.rn.f32 %f304, %f111, %f111, %f303; fma.rn.f32 %f132, %f113, %f113, %f304; setp.neu.f32 %p103, %f132, 0f00000000; or.pred %p105, %p103, %p96; @%p105 bra BB0_97; bra.uni BB0_96; BB0_97: setp.eq.f32 %p106, %f127, 0f00000000; or.pred %p107, %p106, %p93; @%p107 bra BB0_99; bra.uni BB0_98; BB0_99: 
setp.eq.f32 %p109, %f128, 0f00000000; or.pred %p110, %p109, %p103; @%p110 bra BB0_101; bra.uni BB0_100; BB0_101: sub.f32 %f322, %f117, %f111; sub.f32 %f323, %f118, %f112; sub.f32 %f324, %f119, %f113; sub.f32 %f325, %f105, %f123; mul.f32 %f326, %f325, 0f3DAAAAAB; sub.f32 %f327, %f106, %f124; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f107, %f125; mul.f32 %f330, %f329, 0f3DAAAAAB; fma.rn.f32 %f409, %f322, 0f3F2AAAAB, %f326; fma.rn.f32 %f410, %f323, 0f3F2AAAAB, %f328; fma.rn.f32 %f411, %f324, 0f3F2AAAAB, %f330; bra.uni BB0_103; BB0_94: sub.f32 %f409, %f1, %f111; sub.f32 %f410, %f2, %f112; sub.f32 %f411, %f3, %f113; bra.uni BB0_103; BB0_96: sub.f32 %f409, %f117, %f1; sub.f32 %f410, %f118, %f2; sub.f32 %f411, %f119, %f3; bra.uni BB0_103; BB0_98: mul.f32 %f305, %f111, 0fC0000000; fma.rn.f32 %f306, %f105, 0f3F000000, %f305; add.f32 %f307, %f112, %f112; mul.f32 %f308, %f106, 0f3F000000; sub.f32 %f309, %f308, %f307; add.f32 %f310, %f113, %f113; mul.f32 %f311, %f107, 0f3F000000; sub.f32 %f312, %f311, %f310; fma.rn.f32 %f409, %f1, 0f3FC00000, %f306; fma.rn.f32 %f410, %f2, 0f3FC00000, %f309; fma.rn.f32 %f411, %f3, 0f3FC00000, %f312; bra.uni BB0_103; BB0_100: mul.f32 %f313, %f123, 0fBF000000; fma.rn.f32 %f314, %f117, 0f40000000, %f313; mul.f32 %f315, %f124, 0fBF000000; fma.rn.f32 %f316, %f118, 0f40000000, %f315; mul.f32 %f317, %f125, 0fBF000000; fma.rn.f32 %f318, %f119, 0f40000000, %f317; mul.f32 %f319, %f1, 0f3FC00000; sub.f32 %f409, %f314, %f319; mul.f32 %f320, %f2, 0f3FC00000; sub.f32 %f410, %f316, %f320; mul.f32 %f321, %f3, 0f3FC00000; sub.f32 %f411, %f318, %f321; BB0_103: setp.eq.s64 %p111, %rd7, 0; @%p111 bra BB0_105; cvta.to.global.u64 %rd64, %rd7; add.s64 %rd66, %rd64, %rd12; ld.global.nc.f32 %f334, [%rd66]; mul.f32 %f412, %f334, %f412; BB0_105: setp.eq.s64 %p112, %rd8, 0; @%p112 bra BB0_107; cvta.to.global.u64 %rd67, %rd8; add.s64 %rd69, %rd67, %rd12; ld.global.nc.f32 %f335, [%rd69]; mul.f32 %f413, %f335, %f413; BB0_107: cvta.to.global.u64 %rd70, %rd6; 
cvta.to.global.u64 %rd71, %rd5; cvta.to.global.u64 %rd72, %rd4; mul.f32 %f336, %f379, %f157; mul.f32 %f337, %f395, %f158; mul.f32 %f338, %f411, %f159; add.f32 %f339, %f412, %f412; mul.f32 %f340, %f1, %f339; add.f32 %f341, %f337, %f338; mul.f32 %f342, %f1, %f341; mul.f32 %f343, %f394, %f158; fma.rn.f32 %f344, %f2, %f343, %f342; mul.f32 %f345, %f409, %f159; fma.rn.f32 %f346, %f3, %f345, %f344; mul.f32 %f347, %f346, %f413; fma.rn.f32 %f348, %f336, %f340, %f347; add.s64 %rd74, %rd72, %rd12; st.global.f32 [%rd74], %f348; mul.f32 %f349, %f2, %f339; add.f32 %f350, %f336, %f338; mul.f32 %f351, %f2, %f350; mul.f32 %f352, %f380, %f157; fma.rn.f32 %f353, %f1, %f352, %f351; mul.f32 %f354, %f410, %f159; fma.rn.f32 %f355, %f3, %f354, %f353; mul.f32 %f356, %f355, %f413; fma.rn.f32 %f357, %f337, %f349, %f356; add.s64 %rd75, %rd71, %rd12; st.global.f32 [%rd75], %f357; mul.f32 %f358, %f3, %f339; mul.f32 %f359, %f396, %f158; mul.f32 %f360, %f2, %f359; mul.f32 %f361, %f381, %f157; fma.rn.f32 %f362, %f1, %f361, %f360; add.f32 %f363, %f336, %f337; fma.rn.f32 %f364, %f3, %f363, %f362; mul.f32 %f365, %f364, %f413; fma.rn.f32 %f366, %f338, %f358, %f365; add.s64 %rd76, %rd70, %rd12; st.global.f32 [%rd76], %f366; BB0_108: ret; } ` getmagnetoelasticforce_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, 
.param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<113>; .reg .b16 %rs<19>; .reg .f32 %f<414>; .reg .b32 %r<136>; .reg .b64 %rd<77>; ld.param.u64 %rd4, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_2]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_3]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd7, [getmagnetoelasticforce_param_6]; ld.param.f32 %f412, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd8, [getmagnetoelasticforce_param_8]; ld.param.f32 %f413, [getmagnetoelasticforce_param_9]; ld.param.f32 %f157, [getmagnetoelasticforce_param_10]; ld.param.f32 %f158, [getmagnetoelasticforce_param_11]; ld.param.f32 %f159, [getmagnetoelasticforce_param_12]; ld.param.u32 %r55, [getmagnetoelasticforce_param_13]; ld.param.u32 %r56, [getmagnetoelasticforce_param_14]; ld.param.u32 %r57, [getmagnetoelasticforce_param_15]; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; cvta.to.global.u64 %rd1, %rd11; cvta.to.global.u64 %rd2, %rd10; cvta.to.global.u64 %rd3, %rd9; mov.u32 %r58, %ntid.x; mov.u32 %r59, %ctaid.x; mov.u32 %r60, %tid.x; mad.lo.s32 %r1, %r58, %r59, %r60; mov.u32 %r61, %ntid.y; mov.u32 %r62, %ctaid.y; mov.u32 %r63, %tid.y; mad.lo.s32 %r2, %r61, %r62, %r63; mov.u32 %r64, %ntid.z; mov.u32 %r65, %ctaid.z; mov.u32 %r66, %tid.z; mad.lo.s32 %r3, %r64, %r65, %r66; setp.ge.s32 %p4, %r2, %r56; setp.ge.s32 %p5, %r1, %r55; or.pred %p6, %p4, %p5; setp.ge.s32 %p7, %r3, %r57; or.pred %p8, %p6, %p7; @%p8 bra BB0_108; mul.lo.s32 %r4, %r3, %r56; add.s32 %r67, %r4, %r2; mul.lo.s32 %r5, %r67, %r55; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd12, %r6, 4; add.s64 %rd13, %rd3, %rd12; ld.global.nc.f32 %f1, [%rd13]; add.s64 %rd14, %rd2, %rd12; ld.global.nc.f32 %f2, [%rd14]; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f3, 
[%rd15]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r7, %r1, -2; @%p9 bra BB0_3; rem.s32 %r68, %r7, %r55; add.s32 %r69, %r68, %r55; rem.s32 %r124, %r69, %r55; bra.uni BB0_4; BB0_3: mov.u32 %r70, 0; max.s32 %r124, %r7, %r70; BB0_4: setp.lt.s32 %p11, %r7, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p12 bra BB0_6; add.s32 %r71, %r124, %r5; mul.wide.s32 %rd16, %r71, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f7, [%rd17]; add.s64 %rd18, %rd2, %rd16; ld.global.nc.f32 %f8, [%rd18]; add.s64 %rd19, %rd1, %rd16; ld.global.nc.f32 %f9, [%rd19]; BB0_6: add.s32 %r11, %r1, -1; @%p9 bra BB0_8; rem.s32 %r72, %r11, %r55; add.s32 %r73, %r72, %r55; rem.s32 %r125, %r73, %r55; bra.uni BB0_9; BB0_8: mov.u32 %r74, 0; max.s32 %r125, %r11, %r74; BB0_9: setp.lt.s32 %p14, %r11, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_11; add.s32 %r75, %r125, %r5; mul.wide.s32 %rd20, %r75, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f13, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f14, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f15, [%rd23]; BB0_11: add.s32 %r15, %r1, 1; @%p9 bra BB0_13; rem.s32 %r76, %r15, %r55; add.s32 %r77, %r76, %r55; rem.s32 %r126, %r77, %r55; bra.uni BB0_14; BB0_13: add.s32 %r78, %r55, -1; min.s32 %r126, %r15, %r78; BB0_14: setp.ge.s32 %p18, %r15, %r55; mov.f32 %f19, 0f00000000; and.pred %p20, %p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_16; add.s32 %r79, %r126, %r5; mul.wide.s32 %rd24, %r79, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f19, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.nc.f32 %f20, [%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f21, [%rd27]; BB0_16: add.s32 %r19, %r1, 2; @%p9 bra BB0_18; rem.s32 %r80, %r19, %r55; add.s32 %r81, %r80, %r55; rem.s32 %r127, %r81, %r55; bra.uni BB0_19; BB0_18: add.s32 %r82, %r55, -1; min.s32 %r127, %r19, %r82; BB0_19: setp.ge.s32 %p22, %r19, 
%r55; mov.f32 %f25, 0f00000000; and.pred %p24, %p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_21; add.s32 %r83, %r127, %r5; mul.wide.s32 %rd28, %r83, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f25, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f26, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f27, [%rd31]; BB0_21: mul.f32 %f172, %f20, %f20; fma.rn.f32 %f173, %f19, %f19, %f172; fma.rn.f32 %f28, %f21, %f21, %f173; setp.neu.f32 %p25, %f28, 0f00000000; @%p25 bra BB0_23; mul.f32 %f177, %f14, %f14; fma.rn.f32 %f178, %f13, %f13, %f177; fma.rn.f32 %f179, %f15, %f15, %f178; setp.eq.f32 %p26, %f179, 0f00000000; mov.f32 %f379, 0f00000000; mov.f32 %f380, %f379; mov.f32 %f381, %f379; @%p26 bra BB0_35; BB0_23: mul.f32 %f180, %f8, %f8; fma.rn.f32 %f181, %f7, %f7, %f180; fma.rn.f32 %f29, %f9, %f9, %f181; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f182, %f26, %f26; fma.rn.f32 %f183, %f25, %f25, %f182; fma.rn.f32 %f30, %f27, %f27, %f183; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_25; mul.f32 %f184, %f14, %f14; fma.rn.f32 %f185, %f13, %f13, %f184; fma.rn.f32 %f186, %f15, %f15, %f185; setp.neu.f32 %p32, %f186, 0f00000000; @%p32 bra BB0_34; bra.uni BB0_25; BB0_34: sub.f32 %f215, %f19, %f13; mul.f32 %f379, %f215, 0f3F000000; sub.f32 %f216, %f20, %f14; mul.f32 %f380, %f216, 0f3F000000; sub.f32 %f217, %f21, %f15; mul.f32 %f381, %f217, 0f3F000000; bra.uni BB0_35; BB0_25: or.pred %p34, %p25, %p27; @%p34 bra BB0_27; bra.uni BB0_26; BB0_27: mul.f32 %f187, %f14, %f14; fma.rn.f32 %f188, %f13, %f13, %f187; fma.rn.f32 %f34, %f15, %f15, %f188; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, %p35, %p28; @%p37 bra BB0_29; bra.uni BB0_28; BB0_29: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_31; bra.uni BB0_30; BB0_31: setp.eq.f32 %p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_33; bra.uni BB0_32; BB0_33: 
sub.f32 %f206, %f19, %f13; sub.f32 %f207, %f20, %f14; sub.f32 %f208, %f21, %f15; sub.f32 %f209, %f7, %f25; mul.f32 %f210, %f209, 0f3DAAAAAB; sub.f32 %f211, %f8, %f26; mul.f32 %f212, %f211, 0f3DAAAAAB; sub.f32 %f213, %f9, %f27; mul.f32 %f214, %f213, 0f3DAAAAAB; fma.rn.f32 %f379, %f206, 0f3F2AAAAB, %f210; fma.rn.f32 %f380, %f207, 0f3F2AAAAB, %f212; fma.rn.f32 %f381, %f208, 0f3F2AAAAB, %f214; bra.uni BB0_35; BB0_26: sub.f32 %f379, %f1, %f13; sub.f32 %f380, %f2, %f14; sub.f32 %f381, %f3, %f15; bra.uni BB0_35; BB0_28: sub.f32 %f379, %f19, %f1; sub.f32 %f380, %f20, %f2; sub.f32 %f381, %f21, %f3; bra.uni BB0_35; BB0_30: mul.f32 %f189, %f13, 0fC0000000; fma.rn.f32 %f190, %f7, 0f3F000000, %f189; add.f32 %f191, %f14, %f14; mul.f32 %f192, %f8, 0f3F000000; sub.f32 %f193, %f192, %f191; add.f32 %f194, %f15, %f15; mul.f32 %f195, %f9, 0f3F000000; sub.f32 %f196, %f195, %f194; fma.rn.f32 %f379, %f1, 0f3FC00000, %f190; fma.rn.f32 %f380, %f2, 0f3FC00000, %f193; fma.rn.f32 %f381, %f3, 0f3FC00000, %f196; bra.uni BB0_35; BB0_32: mul.f32 %f197, %f25, 0fBF000000; fma.rn.f32 %f198, %f19, 0f40000000, %f197; mul.f32 %f199, %f26, 0fBF000000; fma.rn.f32 %f200, %f20, 0f40000000, %f199; mul.f32 %f201, %f27, 0fBF000000; fma.rn.f32 %f202, %f21, 0f40000000, %f201; mul.f32 %f203, %f1, 0f3FC00000; sub.f32 %f379, %f198, %f203; mul.f32 %f204, %f2, 0f3FC00000; sub.f32 %f380, %f200, %f204; mul.f32 %f205, %f3, 0f3FC00000; sub.f32 %f381, %f202, %f205; BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra BB0_37; rem.s32 %r84, %r23, %r56; add.s32 %r85, %r84, %r56; rem.s32 %r128, %r85, %r56; bra.uni BB0_38; BB0_37: mov.u32 %r86, 0; max.s32 %r128, %r23, %r86; BB0_38: setp.lt.s32 %p45, %r23, 0; mov.f32 %f56, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_40; add.s32 %r87, %r128, %r4; mad.lo.s32 %r88, %r87, %r55, %r1; mul.wide.s32 %rd32, %r88, 4; add.s64 %rd33, %rd3, %rd32; ld.global.nc.f32 %f56, [%rd33]; add.s64 %rd34, %rd2, 
%rd32; ld.global.nc.f32 %f57, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f58, [%rd35]; BB0_40: add.s32 %r27, %r2, -1; @%p43 bra BB0_42; rem.s32 %r89, %r27, %r56; add.s32 %r90, %r89, %r56; rem.s32 %r129, %r90, %r56; bra.uni BB0_43; BB0_42: mov.u32 %r91, 0; max.s32 %r129, %r27, %r91; BB0_43: setp.lt.s32 %p48, %r27, 0; mov.f32 %f62, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_45; add.s32 %r92, %r129, %r4; mad.lo.s32 %r93, %r92, %r55, %r1; mul.wide.s32 %rd36, %r93, 4; add.s64 %rd37, %rd3, %rd36; ld.global.nc.f32 %f62, [%rd37]; add.s64 %rd38, %rd2, %rd36; ld.global.nc.f32 %f63, [%rd38]; add.s64 %rd39, %rd1, %rd36; ld.global.nc.f32 %f64, [%rd39]; BB0_45: add.s32 %r31, %r2, 1; @%p43 bra BB0_47; rem.s32 %r94, %r31, %r56; add.s32 %r95, %r94, %r56; rem.s32 %r130, %r95, %r56; bra.uni BB0_48; BB0_47: add.s32 %r96, %r56, -1; min.s32 %r130, %r31, %r96; BB0_48: setp.ge.s32 %p52, %r31, %r56; mov.f32 %f68, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; @%p54 bra BB0_50; add.s32 %r97, %r130, %r4; mad.lo.s32 %r98, %r97, %r55, %r1; mul.wide.s32 %rd40, %r98, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f68, [%rd41]; add.s64 %rd42, %rd2, %rd40; ld.global.nc.f32 %f69, [%rd42]; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f70, [%rd43]; BB0_50: add.s32 %r35, %r2, 2; @%p43 bra BB0_52; rem.s32 %r99, %r35, %r56; add.s32 %r100, %r99, %r56; rem.s32 %r131, %r100, %r56; bra.uni BB0_53; BB0_52: add.s32 %r101, %r56, -1; min.s32 %r131, %r35, %r101; BB0_53: setp.ge.s32 %p56, %r35, %r56; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p58 bra BB0_55; add.s32 %r102, %r131, %r4; mad.lo.s32 %r103, %r102, %r55, %r1; mul.wide.s32 %rd44, %r103, 4; add.s64 %rd45, %rd3, %rd44; ld.global.nc.f32 %f74, [%rd45]; add.s64 %rd46, %rd2, %rd44; ld.global.nc.f32 %f75, [%rd46]; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f76, [%rd47]; BB0_55: mul.f32 %f230, %f69, %f69; 
fma.rn.f32 %f231, %f68, %f68, %f230; fma.rn.f32 %f77, %f70, %f70, %f231; setp.neu.f32 %p59, %f77, 0f00000000; @%p59 bra BB0_57; mul.f32 %f235, %f63, %f63; fma.rn.f32 %f236, %f62, %f62, %f235; fma.rn.f32 %f237, %f64, %f64, %f236; setp.eq.f32 %p60, %f237, 0f00000000; mov.f32 %f394, 0f00000000; mov.f32 %f395, %f394; mov.f32 %f396, %f394; @%p60 bra BB0_69; BB0_57: mul.f32 %f238, %f57, %f57; fma.rn.f32 %f239, %f56, %f56, %f238; fma.rn.f32 %f78, %f58, %f58, %f239; setp.neu.f32 %p61, %f78, 0f00000000; mul.f32 %f240, %f75, %f75; fma.rn.f32 %f241, %f74, %f74, %f240; fma.rn.f32 %f79, %f76, %f76, %f241; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, %f77, 0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_59; mul.f32 %f242, %f63, %f63; fma.rn.f32 %f243, %f62, %f62, %f242; fma.rn.f32 %f244, %f64, %f64, %f243; setp.neu.f32 %p66, %f244, 0f00000000; @%p66 bra BB0_68; bra.uni BB0_59; BB0_68: sub.f32 %f273, %f68, %f62; mul.f32 %f394, %f273, 0f3F000000; sub.f32 %f274, %f69, %f63; mul.f32 %f395, %f274, 0f3F000000; sub.f32 %f275, %f70, %f64; mul.f32 %f396, %f275, 0f3F000000; bra.uni BB0_69; BB0_59: or.pred %p68, %p59, %p61; @%p68 bra BB0_61; bra.uni BB0_60; BB0_61: mul.f32 %f245, %f63, %f63; fma.rn.f32 %f246, %f62, %f62, %f245; fma.rn.f32 %f83, %f64, %f64, %f246; setp.neu.f32 %p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_63; bra.uni BB0_62; BB0_63: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_65; bra.uni BB0_64; BB0_65: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_67; bra.uni BB0_66; BB0_67: sub.f32 %f264, %f68, %f62; sub.f32 %f265, %f69, %f63; sub.f32 %f266, %f70, %f64; sub.f32 %f267, %f56, %f74; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f57, %f75; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f58, %f76; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f394, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f395, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f396, %f266, 
0f3F2AAAAB, %f272; bra.uni BB0_69; BB0_60: sub.f32 %f394, %f1, %f62; sub.f32 %f395, %f2, %f63; sub.f32 %f396, %f3, %f64; bra.uni BB0_69; BB0_62: sub.f32 %f394, %f68, %f1; sub.f32 %f395, %f69, %f2; sub.f32 %f396, %f70, %f3; bra.uni BB0_69; BB0_64: mul.f32 %f247, %f62, 0fC0000000; fma.rn.f32 %f248, %f56, 0f3F000000, %f247; add.f32 %f249, %f63, %f63; mul.f32 %f250, %f57, 0f3F000000; sub.f32 %f251, %f250, %f249; add.f32 %f252, %f64, %f64; mul.f32 %f253, %f58, 0f3F000000; sub.f32 %f254, %f253, %f252; fma.rn.f32 %f394, %f1, 0f3FC00000, %f248; fma.rn.f32 %f395, %f2, 0f3FC00000, %f251; fma.rn.f32 %f396, %f3, 0f3FC00000, %f254; bra.uni BB0_69; BB0_66: mul.f32 %f255, %f74, 0fBF000000; fma.rn.f32 %f256, %f68, 0f40000000, %f255; mul.f32 %f257, %f75, 0fBF000000; fma.rn.f32 %f258, %f69, 0f40000000, %f257; mul.f32 %f259, %f76, 0fBF000000; fma.rn.f32 %f260, %f70, 0f40000000, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f394, %f256, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f395, %f258, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f396, %f260, %f263; BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p77, %rs3, 0; add.s32 %r39, %r3, -2; @%p77 bra BB0_71; rem.s32 %r104, %r39, %r57; add.s32 %r105, %r104, %r57; rem.s32 %r132, %r105, %r57; bra.uni BB0_72; BB0_71: mov.u32 %r106, 0; max.s32 %r132, %r39, %r106; BB0_72: setp.lt.s32 %p79, %r39, 0; mov.f32 %f105, 0f00000000; and.pred %p80, %p79, %p77; mov.f32 %f106, %f105; mov.f32 %f107, %f105; @%p80 bra BB0_74; mad.lo.s32 %r107, %r132, %r56, %r2; mad.lo.s32 %r108, %r107, %r55, %r1; mul.wide.s32 %rd48, %r108, 4; add.s64 %rd49, %rd3, %rd48; ld.global.nc.f32 %f105, [%rd49]; add.s64 %rd50, %rd2, %rd48; ld.global.nc.f32 %f106, [%rd50]; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f107, [%rd51]; BB0_74: add.s32 %r43, %r3, -1; @%p77 bra BB0_76; rem.s32 %r109, %r43, %r57; add.s32 %r110, %r109, %r57; rem.s32 %r133, %r110, %r57; bra.uni BB0_77; BB0_76: mov.u32 %r111, 0; max.s32 %r133, %r43, %r111; BB0_77: setp.lt.s32 %p82, %r43, 0; mov.f32 
%f111, 0f00000000; and.pred %p84, %p82, %p77; mov.f32 %f112, %f111; mov.f32 %f113, %f111; @%p84 bra BB0_79; mad.lo.s32 %r112, %r133, %r56, %r2; mad.lo.s32 %r113, %r112, %r55, %r1; mul.wide.s32 %rd52, %r113, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f111, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f112, [%rd54]; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f113, [%rd55]; BB0_79: add.s32 %r47, %r3, 1; @%p77 bra BB0_81; rem.s32 %r114, %r47, %r57; add.s32 %r115, %r114, %r57; rem.s32 %r134, %r115, %r57; bra.uni BB0_82; BB0_81: add.s32 %r116, %r57, -1; min.s32 %r134, %r47, %r116; BB0_82: setp.ge.s32 %p86, %r47, %r57; mov.f32 %f117, 0f00000000; and.pred %p88, %p86, %p77; mov.f32 %f118, %f117; mov.f32 %f119, %f117; @%p88 bra BB0_84; mad.lo.s32 %r117, %r134, %r56, %r2; mad.lo.s32 %r118, %r117, %r55, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f117, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 %f118, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f119, [%rd59]; BB0_84: add.s32 %r51, %r3, 2; @%p77 bra BB0_86; rem.s32 %r119, %r51, %r57; add.s32 %r120, %r119, %r57; rem.s32 %r135, %r120, %r57; bra.uni BB0_87; BB0_86: add.s32 %r121, %r57, -1; min.s32 %r135, %r51, %r121; BB0_87: setp.ge.s32 %p90, %r51, %r57; mov.f32 %f123, 0f00000000; and.pred %p92, %p90, %p77; mov.f32 %f124, %f123; mov.f32 %f125, %f123; @%p92 bra BB0_89; mad.lo.s32 %r122, %r135, %r56, %r2; mad.lo.s32 %r123, %r122, %r55, %r1; mul.wide.s32 %rd60, %r123, 4; add.s64 %rd61, %rd3, %rd60; ld.global.nc.f32 %f123, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f124, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f125, [%rd63]; BB0_89: mul.f32 %f288, %f118, %f118; fma.rn.f32 %f289, %f117, %f117, %f288; fma.rn.f32 %f126, %f119, %f119, %f289; setp.neu.f32 %p93, %f126, 0f00000000; @%p93 bra BB0_91; mul.f32 %f293, %f112, %f112; fma.rn.f32 %f294, %f111, %f111, %f293; fma.rn.f32 %f295, %f113, %f113, %f294; setp.eq.f32 %p94, %f295, 
0f00000000; mov.f32 %f409, 0f00000000; mov.f32 %f410, %f409; mov.f32 %f411, %f409; @%p94 bra BB0_103; BB0_91: mul.f32 %f296, %f106, %f106; fma.rn.f32 %f297, %f105, %f105, %f296; fma.rn.f32 %f127, %f107, %f107, %f297; setp.neu.f32 %p95, %f127, 0f00000000; mul.f32 %f298, %f124, %f124; fma.rn.f32 %f299, %f123, %f123, %f298; fma.rn.f32 %f128, %f125, %f125, %f299; setp.neu.f32 %p96, %f128, 0f00000000; and.pred %p97, %p95, %p96; setp.eq.f32 %p98, %f126, 0f00000000; or.pred %p99, %p97, %p98; @%p99 bra BB0_93; mul.f32 %f300, %f112, %f112; fma.rn.f32 %f301, %f111, %f111, %f300; fma.rn.f32 %f302, %f113, %f113, %f301; setp.neu.f32 %p100, %f302, 0f00000000; @%p100 bra BB0_102; bra.uni BB0_93; BB0_102: sub.f32 %f331, %f117, %f111; mul.f32 %f409, %f331, 0f3F000000; sub.f32 %f332, %f118, %f112; mul.f32 %f410, %f332, 0f3F000000; sub.f32 %f333, %f119, %f113; mul.f32 %f411, %f333, 0f3F000000; bra.uni BB0_103; BB0_93: or.pred %p102, %p93, %p95; @%p102 bra BB0_95; bra.uni BB0_94; BB0_95: mul.f32 %f303, %f112, %f112; fma.rn.f32 %f304, %f111, %f111, %f303; fma.rn.f32 %f132, %f113, %f113, %f304; setp.neu.f32 %p103, %f132, 0f00000000; or.pred %p105, %p103, %p96; @%p105 bra BB0_97; bra.uni BB0_96; BB0_97: setp.eq.f32 %p106, %f127, 0f00000000; or.pred %p107, %p106, %p93; @%p107 bra BB0_99; bra.uni BB0_98; BB0_99: setp.eq.f32 %p109, %f128, 0f00000000; or.pred %p110, %p109, %p103; @%p110 bra BB0_101; bra.uni BB0_100; BB0_101: sub.f32 %f322, %f117, %f111; sub.f32 %f323, %f118, %f112; sub.f32 %f324, %f119, %f113; sub.f32 %f325, %f105, %f123; mul.f32 %f326, %f325, 0f3DAAAAAB; sub.f32 %f327, %f106, %f124; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f107, %f125; mul.f32 %f330, %f329, 0f3DAAAAAB; fma.rn.f32 %f409, %f322, 0f3F2AAAAB, %f326; fma.rn.f32 %f410, %f323, 0f3F2AAAAB, %f328; fma.rn.f32 %f411, %f324, 0f3F2AAAAB, %f330; bra.uni BB0_103; BB0_94: sub.f32 %f409, %f1, %f111; sub.f32 %f410, %f2, %f112; sub.f32 %f411, %f3, %f113; bra.uni BB0_103; BB0_96: sub.f32 %f409, %f117, %f1; sub.f32 
%f410, %f118, %f2; sub.f32 %f411, %f119, %f3; bra.uni BB0_103; BB0_98: mul.f32 %f305, %f111, 0fC0000000; fma.rn.f32 %f306, %f105, 0f3F000000, %f305; add.f32 %f307, %f112, %f112; mul.f32 %f308, %f106, 0f3F000000; sub.f32 %f309, %f308, %f307; add.f32 %f310, %f113, %f113; mul.f32 %f311, %f107, 0f3F000000; sub.f32 %f312, %f311, %f310; fma.rn.f32 %f409, %f1, 0f3FC00000, %f306; fma.rn.f32 %f410, %f2, 0f3FC00000, %f309; fma.rn.f32 %f411, %f3, 0f3FC00000, %f312; bra.uni BB0_103; BB0_100: mul.f32 %f313, %f123, 0fBF000000; fma.rn.f32 %f314, %f117, 0f40000000, %f313; mul.f32 %f315, %f124, 0fBF000000; fma.rn.f32 %f316, %f118, 0f40000000, %f315; mul.f32 %f317, %f125, 0fBF000000; fma.rn.f32 %f318, %f119, 0f40000000, %f317; mul.f32 %f319, %f1, 0f3FC00000; sub.f32 %f409, %f314, %f319; mul.f32 %f320, %f2, 0f3FC00000; sub.f32 %f410, %f316, %f320; mul.f32 %f321, %f3, 0f3FC00000; sub.f32 %f411, %f318, %f321; BB0_103: setp.eq.s64 %p111, %rd7, 0; @%p111 bra BB0_105; cvta.to.global.u64 %rd64, %rd7; add.s64 %rd66, %rd64, %rd12; ld.global.nc.f32 %f334, [%rd66]; mul.f32 %f412, %f334, %f412; BB0_105: setp.eq.s64 %p112, %rd8, 0; @%p112 bra BB0_107; cvta.to.global.u64 %rd67, %rd8; add.s64 %rd69, %rd67, %rd12; ld.global.nc.f32 %f335, [%rd69]; mul.f32 %f413, %f335, %f413; BB0_107: cvta.to.global.u64 %rd70, %rd6; cvta.to.global.u64 %rd71, %rd5; cvta.to.global.u64 %rd72, %rd4; mul.f32 %f336, %f379, %f157; mul.f32 %f337, %f395, %f158; mul.f32 %f338, %f411, %f159; add.f32 %f339, %f412, %f412; mul.f32 %f340, %f1, %f339; add.f32 %f341, %f337, %f338; mul.f32 %f342, %f1, %f341; mul.f32 %f343, %f394, %f158; fma.rn.f32 %f344, %f2, %f343, %f342; mul.f32 %f345, %f409, %f159; fma.rn.f32 %f346, %f3, %f345, %f344; mul.f32 %f347, %f346, %f413; fma.rn.f32 %f348, %f336, %f340, %f347; add.s64 %rd74, %rd72, %rd12; st.global.f32 [%rd74], %f348; mul.f32 %f349, %f2, %f339; add.f32 %f350, %f336, %f338; mul.f32 %f351, %f2, %f350; mul.f32 %f352, %f380, %f157; fma.rn.f32 %f353, %f1, %f352, %f351; mul.f32 %f354, %f410, 
%f159; fma.rn.f32 %f355, %f3, %f354, %f353; mul.f32 %f356, %f355, %f413; fma.rn.f32 %f357, %f337, %f349, %f356; add.s64 %rd75, %rd71, %rd12; st.global.f32 [%rd75], %f357; mul.f32 %f358, %f3, %f339; mul.f32 %f359, %f396, %f158; mul.f32 %f360, %f2, %f359; mul.f32 %f361, %f381, %f157; fma.rn.f32 %f362, %f1, %f361, %f360; add.f32 %f363, %f336, %f337; fma.rn.f32 %f364, %f3, %f363, %f362; mul.f32 %f365, %f364, %f413; fma.rn.f32 %f366, %f338, %f358, %f365; add.s64 %rd76, %rd70, %rd12; st.global.f32 [%rd76], %f366; BB0_108: ret; } ` getmagnetoelasticforce_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl getmagnetoelasticforce .visible .entry getmagnetoelasticforce( .param .u64 getmagnetoelasticforce_param_0, .param .u64 getmagnetoelasticforce_param_1, .param .u64 getmagnetoelasticforce_param_2, .param .u64 getmagnetoelasticforce_param_3, .param .u64 getmagnetoelasticforce_param_4, .param .u64 getmagnetoelasticforce_param_5, .param .u64 getmagnetoelasticforce_param_6, .param .f32 getmagnetoelasticforce_param_7, .param .u64 getmagnetoelasticforce_param_8, .param .f32 getmagnetoelasticforce_param_9, .param .f32 getmagnetoelasticforce_param_10, .param .f32 getmagnetoelasticforce_param_11, .param .f32 getmagnetoelasticforce_param_12, .param .u32 getmagnetoelasticforce_param_13, .param .u32 getmagnetoelasticforce_param_14, .param .u32 getmagnetoelasticforce_param_15, .param .u8 getmagnetoelasticforce_param_16 ) { .reg .pred %p<113>; .reg .b16 %rs<19>; .reg .f32 %f<414>; .reg .b32 %r<136>; .reg .b64 %rd<77>; ld.param.u64 %rd4, [getmagnetoelasticforce_param_0]; ld.param.u64 %rd5, [getmagnetoelasticforce_param_1]; ld.param.u64 %rd6, [getmagnetoelasticforce_param_2]; ld.param.u64 %rd9, [getmagnetoelasticforce_param_3]; ld.param.u64 %rd10, [getmagnetoelasticforce_param_4]; ld.param.u64 %rd11, [getmagnetoelasticforce_param_5]; ld.param.u64 %rd7, [getmagnetoelasticforce_param_6]; ld.param.f32 %f412, [getmagnetoelasticforce_param_7]; ld.param.u64 %rd8, 
[getmagnetoelasticforce_param_8]; ld.param.f32 %f413, [getmagnetoelasticforce_param_9]; ld.param.f32 %f157, [getmagnetoelasticforce_param_10]; ld.param.f32 %f158, [getmagnetoelasticforce_param_11]; ld.param.f32 %f159, [getmagnetoelasticforce_param_12]; ld.param.u32 %r55, [getmagnetoelasticforce_param_13]; ld.param.u32 %r56, [getmagnetoelasticforce_param_14]; ld.param.u32 %r57, [getmagnetoelasticforce_param_15]; ld.param.u8 %rs4, [getmagnetoelasticforce_param_16]; cvta.to.global.u64 %rd1, %rd11; cvta.to.global.u64 %rd2, %rd10; cvta.to.global.u64 %rd3, %rd9; mov.u32 %r58, %ntid.x; mov.u32 %r59, %ctaid.x; mov.u32 %r60, %tid.x; mad.lo.s32 %r1, %r58, %r59, %r60; mov.u32 %r61, %ntid.y; mov.u32 %r62, %ctaid.y; mov.u32 %r63, %tid.y; mad.lo.s32 %r2, %r61, %r62, %r63; mov.u32 %r64, %ntid.z; mov.u32 %r65, %ctaid.z; mov.u32 %r66, %tid.z; mad.lo.s32 %r3, %r64, %r65, %r66; setp.ge.s32 %p4, %r2, %r56; setp.ge.s32 %p5, %r1, %r55; or.pred %p6, %p4, %p5; setp.ge.s32 %p7, %r3, %r57; or.pred %p8, %p6, %p7; @%p8 bra BB0_108; mul.lo.s32 %r4, %r3, %r56; add.s32 %r67, %r4, %r2; mul.lo.s32 %r5, %r67, %r55; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd12, %r6, 4; add.s64 %rd13, %rd3, %rd12; ld.global.nc.f32 %f1, [%rd13]; add.s64 %rd14, %rd2, %rd12; ld.global.nc.f32 %f2, [%rd14]; add.s64 %rd15, %rd1, %rd12; ld.global.nc.f32 %f3, [%rd15]; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r7, %r1, -2; @%p9 bra BB0_3; rem.s32 %r68, %r7, %r55; add.s32 %r69, %r68, %r55; rem.s32 %r124, %r69, %r55; bra.uni BB0_4; BB0_3: mov.u32 %r70, 0; max.s32 %r124, %r7, %r70; BB0_4: setp.lt.s32 %p11, %r7, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p12 bra BB0_6; add.s32 %r71, %r124, %r5; mul.wide.s32 %rd16, %r71, 4; add.s64 %rd17, %rd3, %rd16; ld.global.nc.f32 %f7, [%rd17]; add.s64 %rd18, %rd2, %rd16; ld.global.nc.f32 %f8, [%rd18]; add.s64 %rd19, %rd1, %rd16; ld.global.nc.f32 %f9, [%rd19]; BB0_6: add.s32 %r11, %r1, -1; @%p9 bra BB0_8; rem.s32 %r72, %r11, %r55; 
add.s32 %r73, %r72, %r55; rem.s32 %r125, %r73, %r55; bra.uni BB0_9; BB0_8: mov.u32 %r74, 0; max.s32 %r125, %r11, %r74; BB0_9: setp.lt.s32 %p14, %r11, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_11; add.s32 %r75, %r125, %r5; mul.wide.s32 %rd20, %r75, 4; add.s64 %rd21, %rd3, %rd20; ld.global.nc.f32 %f13, [%rd21]; add.s64 %rd22, %rd2, %rd20; ld.global.nc.f32 %f14, [%rd22]; add.s64 %rd23, %rd1, %rd20; ld.global.nc.f32 %f15, [%rd23]; BB0_11: add.s32 %r15, %r1, 1; @%p9 bra BB0_13; rem.s32 %r76, %r15, %r55; add.s32 %r77, %r76, %r55; rem.s32 %r126, %r77, %r55; bra.uni BB0_14; BB0_13: add.s32 %r78, %r55, -1; min.s32 %r126, %r15, %r78; BB0_14: setp.ge.s32 %p18, %r15, %r55; mov.f32 %f19, 0f00000000; and.pred %p20, %p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_16; add.s32 %r79, %r126, %r5; mul.wide.s32 %rd24, %r79, 4; add.s64 %rd25, %rd3, %rd24; ld.global.nc.f32 %f19, [%rd25]; add.s64 %rd26, %rd2, %rd24; ld.global.nc.f32 %f20, [%rd26]; add.s64 %rd27, %rd1, %rd24; ld.global.nc.f32 %f21, [%rd27]; BB0_16: add.s32 %r19, %r1, 2; @%p9 bra BB0_18; rem.s32 %r80, %r19, %r55; add.s32 %r81, %r80, %r55; rem.s32 %r127, %r81, %r55; bra.uni BB0_19; BB0_18: add.s32 %r82, %r55, -1; min.s32 %r127, %r19, %r82; BB0_19: setp.ge.s32 %p22, %r19, %r55; mov.f32 %f25, 0f00000000; and.pred %p24, %p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_21; add.s32 %r83, %r127, %r5; mul.wide.s32 %rd28, %r83, 4; add.s64 %rd29, %rd3, %rd28; ld.global.nc.f32 %f25, [%rd29]; add.s64 %rd30, %rd2, %rd28; ld.global.nc.f32 %f26, [%rd30]; add.s64 %rd31, %rd1, %rd28; ld.global.nc.f32 %f27, [%rd31]; BB0_21: mul.f32 %f172, %f20, %f20; fma.rn.f32 %f173, %f19, %f19, %f172; fma.rn.f32 %f28, %f21, %f21, %f173; setp.neu.f32 %p25, %f28, 0f00000000; @%p25 bra BB0_23; mul.f32 %f177, %f14, %f14; fma.rn.f32 %f178, %f13, %f13, %f177; fma.rn.f32 %f179, %f15, %f15, %f178; setp.eq.f32 %p26, %f179, 0f00000000; mov.f32 %f379, 0f00000000; 
mov.f32 %f380, %f379; mov.f32 %f381, %f379; @%p26 bra BB0_35; BB0_23: mul.f32 %f180, %f8, %f8; fma.rn.f32 %f181, %f7, %f7, %f180; fma.rn.f32 %f29, %f9, %f9, %f181; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f182, %f26, %f26; fma.rn.f32 %f183, %f25, %f25, %f182; fma.rn.f32 %f30, %f27, %f27, %f183; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_25; mul.f32 %f184, %f14, %f14; fma.rn.f32 %f185, %f13, %f13, %f184; fma.rn.f32 %f186, %f15, %f15, %f185; setp.neu.f32 %p32, %f186, 0f00000000; @%p32 bra BB0_34; bra.uni BB0_25; BB0_34: sub.f32 %f215, %f19, %f13; mul.f32 %f379, %f215, 0f3F000000; sub.f32 %f216, %f20, %f14; mul.f32 %f380, %f216, 0f3F000000; sub.f32 %f217, %f21, %f15; mul.f32 %f381, %f217, 0f3F000000; bra.uni BB0_35; BB0_25: or.pred %p34, %p25, %p27; @%p34 bra BB0_27; bra.uni BB0_26; BB0_27: mul.f32 %f187, %f14, %f14; fma.rn.f32 %f188, %f13, %f13, %f187; fma.rn.f32 %f34, %f15, %f15, %f188; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, %p35, %p28; @%p37 bra BB0_29; bra.uni BB0_28; BB0_29: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_31; bra.uni BB0_30; BB0_31: setp.eq.f32 %p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_33; bra.uni BB0_32; BB0_33: sub.f32 %f206, %f19, %f13; sub.f32 %f207, %f20, %f14; sub.f32 %f208, %f21, %f15; sub.f32 %f209, %f7, %f25; mul.f32 %f210, %f209, 0f3DAAAAAB; sub.f32 %f211, %f8, %f26; mul.f32 %f212, %f211, 0f3DAAAAAB; sub.f32 %f213, %f9, %f27; mul.f32 %f214, %f213, 0f3DAAAAAB; fma.rn.f32 %f379, %f206, 0f3F2AAAAB, %f210; fma.rn.f32 %f380, %f207, 0f3F2AAAAB, %f212; fma.rn.f32 %f381, %f208, 0f3F2AAAAB, %f214; bra.uni BB0_35; BB0_26: sub.f32 %f379, %f1, %f13; sub.f32 %f380, %f2, %f14; sub.f32 %f381, %f3, %f15; bra.uni BB0_35; BB0_28: sub.f32 %f379, %f19, %f1; sub.f32 %f380, %f20, %f2; sub.f32 %f381, %f21, %f3; bra.uni BB0_35; BB0_30: mul.f32 %f189, %f13, 0fC0000000; fma.rn.f32 %f190, %f7, 
0f3F000000, %f189; add.f32 %f191, %f14, %f14; mul.f32 %f192, %f8, 0f3F000000; sub.f32 %f193, %f192, %f191; add.f32 %f194, %f15, %f15; mul.f32 %f195, %f9, 0f3F000000; sub.f32 %f196, %f195, %f194; fma.rn.f32 %f379, %f1, 0f3FC00000, %f190; fma.rn.f32 %f380, %f2, 0f3FC00000, %f193; fma.rn.f32 %f381, %f3, 0f3FC00000, %f196; bra.uni BB0_35; BB0_32: mul.f32 %f197, %f25, 0fBF000000; fma.rn.f32 %f198, %f19, 0f40000000, %f197; mul.f32 %f199, %f26, 0fBF000000; fma.rn.f32 %f200, %f20, 0f40000000, %f199; mul.f32 %f201, %f27, 0fBF000000; fma.rn.f32 %f202, %f21, 0f40000000, %f201; mul.f32 %f203, %f1, 0f3FC00000; sub.f32 %f379, %f198, %f203; mul.f32 %f204, %f2, 0f3FC00000; sub.f32 %f380, %f200, %f204; mul.f32 %f205, %f3, 0f3FC00000; sub.f32 %f381, %f202, %f205; BB0_35: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra BB0_37; rem.s32 %r84, %r23, %r56; add.s32 %r85, %r84, %r56; rem.s32 %r128, %r85, %r56; bra.uni BB0_38; BB0_37: mov.u32 %r86, 0; max.s32 %r128, %r23, %r86; BB0_38: setp.lt.s32 %p45, %r23, 0; mov.f32 %f56, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_40; add.s32 %r87, %r128, %r4; mad.lo.s32 %r88, %r87, %r55, %r1; mul.wide.s32 %rd32, %r88, 4; add.s64 %rd33, %rd3, %rd32; ld.global.nc.f32 %f56, [%rd33]; add.s64 %rd34, %rd2, %rd32; ld.global.nc.f32 %f57, [%rd34]; add.s64 %rd35, %rd1, %rd32; ld.global.nc.f32 %f58, [%rd35]; BB0_40: add.s32 %r27, %r2, -1; @%p43 bra BB0_42; rem.s32 %r89, %r27, %r56; add.s32 %r90, %r89, %r56; rem.s32 %r129, %r90, %r56; bra.uni BB0_43; BB0_42: mov.u32 %r91, 0; max.s32 %r129, %r27, %r91; BB0_43: setp.lt.s32 %p48, %r27, 0; mov.f32 %f62, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_45; add.s32 %r92, %r129, %r4; mad.lo.s32 %r93, %r92, %r55, %r1; mul.wide.s32 %rd36, %r93, 4; add.s64 %rd37, %rd3, %rd36; ld.global.nc.f32 %f62, [%rd37]; add.s64 %rd38, %rd2, %rd36; ld.global.nc.f32 %f63, [%rd38]; add.s64 %rd39, %rd1, %rd36; 
ld.global.nc.f32 %f64, [%rd39]; BB0_45: add.s32 %r31, %r2, 1; @%p43 bra BB0_47; rem.s32 %r94, %r31, %r56; add.s32 %r95, %r94, %r56; rem.s32 %r130, %r95, %r56; bra.uni BB0_48; BB0_47: add.s32 %r96, %r56, -1; min.s32 %r130, %r31, %r96; BB0_48: setp.ge.s32 %p52, %r31, %r56; mov.f32 %f68, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; @%p54 bra BB0_50; add.s32 %r97, %r130, %r4; mad.lo.s32 %r98, %r97, %r55, %r1; mul.wide.s32 %rd40, %r98, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f68, [%rd41]; add.s64 %rd42, %rd2, %rd40; ld.global.nc.f32 %f69, [%rd42]; add.s64 %rd43, %rd1, %rd40; ld.global.nc.f32 %f70, [%rd43]; BB0_50: add.s32 %r35, %r2, 2; @%p43 bra BB0_52; rem.s32 %r99, %r35, %r56; add.s32 %r100, %r99, %r56; rem.s32 %r131, %r100, %r56; bra.uni BB0_53; BB0_52: add.s32 %r101, %r56, -1; min.s32 %r131, %r35, %r101; BB0_53: setp.ge.s32 %p56, %r35, %r56; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p58 bra BB0_55; add.s32 %r102, %r131, %r4; mad.lo.s32 %r103, %r102, %r55, %r1; mul.wide.s32 %rd44, %r103, 4; add.s64 %rd45, %rd3, %rd44; ld.global.nc.f32 %f74, [%rd45]; add.s64 %rd46, %rd2, %rd44; ld.global.nc.f32 %f75, [%rd46]; add.s64 %rd47, %rd1, %rd44; ld.global.nc.f32 %f76, [%rd47]; BB0_55: mul.f32 %f230, %f69, %f69; fma.rn.f32 %f231, %f68, %f68, %f230; fma.rn.f32 %f77, %f70, %f70, %f231; setp.neu.f32 %p59, %f77, 0f00000000; @%p59 bra BB0_57; mul.f32 %f235, %f63, %f63; fma.rn.f32 %f236, %f62, %f62, %f235; fma.rn.f32 %f237, %f64, %f64, %f236; setp.eq.f32 %p60, %f237, 0f00000000; mov.f32 %f394, 0f00000000; mov.f32 %f395, %f394; mov.f32 %f396, %f394; @%p60 bra BB0_69; BB0_57: mul.f32 %f238, %f57, %f57; fma.rn.f32 %f239, %f56, %f56, %f238; fma.rn.f32 %f78, %f58, %f58, %f239; setp.neu.f32 %p61, %f78, 0f00000000; mul.f32 %f240, %f75, %f75; fma.rn.f32 %f241, %f74, %f74, %f240; fma.rn.f32 %f79, %f76, %f76, %f241; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, 
%f77, 0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_59; mul.f32 %f242, %f63, %f63; fma.rn.f32 %f243, %f62, %f62, %f242; fma.rn.f32 %f244, %f64, %f64, %f243; setp.neu.f32 %p66, %f244, 0f00000000; @%p66 bra BB0_68; bra.uni BB0_59; BB0_68: sub.f32 %f273, %f68, %f62; mul.f32 %f394, %f273, 0f3F000000; sub.f32 %f274, %f69, %f63; mul.f32 %f395, %f274, 0f3F000000; sub.f32 %f275, %f70, %f64; mul.f32 %f396, %f275, 0f3F000000; bra.uni BB0_69; BB0_59: or.pred %p68, %p59, %p61; @%p68 bra BB0_61; bra.uni BB0_60; BB0_61: mul.f32 %f245, %f63, %f63; fma.rn.f32 %f246, %f62, %f62, %f245; fma.rn.f32 %f83, %f64, %f64, %f246; setp.neu.f32 %p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_63; bra.uni BB0_62; BB0_63: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_65; bra.uni BB0_64; BB0_65: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_67; bra.uni BB0_66; BB0_67: sub.f32 %f264, %f68, %f62; sub.f32 %f265, %f69, %f63; sub.f32 %f266, %f70, %f64; sub.f32 %f267, %f56, %f74; mul.f32 %f268, %f267, 0f3DAAAAAB; sub.f32 %f269, %f57, %f75; mul.f32 %f270, %f269, 0f3DAAAAAB; sub.f32 %f271, %f58, %f76; mul.f32 %f272, %f271, 0f3DAAAAAB; fma.rn.f32 %f394, %f264, 0f3F2AAAAB, %f268; fma.rn.f32 %f395, %f265, 0f3F2AAAAB, %f270; fma.rn.f32 %f396, %f266, 0f3F2AAAAB, %f272; bra.uni BB0_69; BB0_60: sub.f32 %f394, %f1, %f62; sub.f32 %f395, %f2, %f63; sub.f32 %f396, %f3, %f64; bra.uni BB0_69; BB0_62: sub.f32 %f394, %f68, %f1; sub.f32 %f395, %f69, %f2; sub.f32 %f396, %f70, %f3; bra.uni BB0_69; BB0_64: mul.f32 %f247, %f62, 0fC0000000; fma.rn.f32 %f248, %f56, 0f3F000000, %f247; add.f32 %f249, %f63, %f63; mul.f32 %f250, %f57, 0f3F000000; sub.f32 %f251, %f250, %f249; add.f32 %f252, %f64, %f64; mul.f32 %f253, %f58, 0f3F000000; sub.f32 %f254, %f253, %f252; fma.rn.f32 %f394, %f1, 0f3FC00000, %f248; fma.rn.f32 %f395, %f2, 0f3FC00000, %f251; fma.rn.f32 %f396, %f3, 0f3FC00000, %f254; bra.uni BB0_69; BB0_66: mul.f32 %f255, %f74, 0fBF000000; 
fma.rn.f32 %f256, %f68, 0f40000000, %f255; mul.f32 %f257, %f75, 0fBF000000; fma.rn.f32 %f258, %f69, 0f40000000, %f257; mul.f32 %f259, %f76, 0fBF000000; fma.rn.f32 %f260, %f70, 0f40000000, %f259; mul.f32 %f261, %f1, 0f3FC00000; sub.f32 %f394, %f256, %f261; mul.f32 %f262, %f2, 0f3FC00000; sub.f32 %f395, %f258, %f262; mul.f32 %f263, %f3, 0f3FC00000; sub.f32 %f396, %f260, %f263; BB0_69: and.b16 %rs3, %rs4, 4; setp.eq.s16 %p77, %rs3, 0; add.s32 %r39, %r3, -2; @%p77 bra BB0_71; rem.s32 %r104, %r39, %r57; add.s32 %r105, %r104, %r57; rem.s32 %r132, %r105, %r57; bra.uni BB0_72; BB0_71: mov.u32 %r106, 0; max.s32 %r132, %r39, %r106; BB0_72: setp.lt.s32 %p79, %r39, 0; mov.f32 %f105, 0f00000000; and.pred %p80, %p79, %p77; mov.f32 %f106, %f105; mov.f32 %f107, %f105; @%p80 bra BB0_74; mad.lo.s32 %r107, %r132, %r56, %r2; mad.lo.s32 %r108, %r107, %r55, %r1; mul.wide.s32 %rd48, %r108, 4; add.s64 %rd49, %rd3, %rd48; ld.global.nc.f32 %f105, [%rd49]; add.s64 %rd50, %rd2, %rd48; ld.global.nc.f32 %f106, [%rd50]; add.s64 %rd51, %rd1, %rd48; ld.global.nc.f32 %f107, [%rd51]; BB0_74: add.s32 %r43, %r3, -1; @%p77 bra BB0_76; rem.s32 %r109, %r43, %r57; add.s32 %r110, %r109, %r57; rem.s32 %r133, %r110, %r57; bra.uni BB0_77; BB0_76: mov.u32 %r111, 0; max.s32 %r133, %r43, %r111; BB0_77: setp.lt.s32 %p82, %r43, 0; mov.f32 %f111, 0f00000000; and.pred %p84, %p82, %p77; mov.f32 %f112, %f111; mov.f32 %f113, %f111; @%p84 bra BB0_79; mad.lo.s32 %r112, %r133, %r56, %r2; mad.lo.s32 %r113, %r112, %r55, %r1; mul.wide.s32 %rd52, %r113, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f111, [%rd53]; add.s64 %rd54, %rd2, %rd52; ld.global.nc.f32 %f112, [%rd54]; add.s64 %rd55, %rd1, %rd52; ld.global.nc.f32 %f113, [%rd55]; BB0_79: add.s32 %r47, %r3, 1; @%p77 bra BB0_81; rem.s32 %r114, %r47, %r57; add.s32 %r115, %r114, %r57; rem.s32 %r134, %r115, %r57; bra.uni BB0_82; BB0_81: add.s32 %r116, %r57, -1; min.s32 %r134, %r47, %r116; BB0_82: setp.ge.s32 %p86, %r47, %r57; mov.f32 %f117, 0f00000000; and.pred %p88, %p86, 
%p77; mov.f32 %f118, %f117; mov.f32 %f119, %f117; @%p88 bra BB0_84; mad.lo.s32 %r117, %r134, %r56, %r2; mad.lo.s32 %r118, %r117, %r55, %r1; mul.wide.s32 %rd56, %r118, 4; add.s64 %rd57, %rd3, %rd56; ld.global.nc.f32 %f117, [%rd57]; add.s64 %rd58, %rd2, %rd56; ld.global.nc.f32 %f118, [%rd58]; add.s64 %rd59, %rd1, %rd56; ld.global.nc.f32 %f119, [%rd59]; BB0_84: add.s32 %r51, %r3, 2; @%p77 bra BB0_86; rem.s32 %r119, %r51, %r57; add.s32 %r120, %r119, %r57; rem.s32 %r135, %r120, %r57; bra.uni BB0_87; BB0_86: add.s32 %r121, %r57, -1; min.s32 %r135, %r51, %r121; BB0_87: setp.ge.s32 %p90, %r51, %r57; mov.f32 %f123, 0f00000000; and.pred %p92, %p90, %p77; mov.f32 %f124, %f123; mov.f32 %f125, %f123; @%p92 bra BB0_89; mad.lo.s32 %r122, %r135, %r56, %r2; mad.lo.s32 %r123, %r122, %r55, %r1; mul.wide.s32 %rd60, %r123, 4; add.s64 %rd61, %rd3, %rd60; ld.global.nc.f32 %f123, [%rd61]; add.s64 %rd62, %rd2, %rd60; ld.global.nc.f32 %f124, [%rd62]; add.s64 %rd63, %rd1, %rd60; ld.global.nc.f32 %f125, [%rd63]; BB0_89: mul.f32 %f288, %f118, %f118; fma.rn.f32 %f289, %f117, %f117, %f288; fma.rn.f32 %f126, %f119, %f119, %f289; setp.neu.f32 %p93, %f126, 0f00000000; @%p93 bra BB0_91; mul.f32 %f293, %f112, %f112; fma.rn.f32 %f294, %f111, %f111, %f293; fma.rn.f32 %f295, %f113, %f113, %f294; setp.eq.f32 %p94, %f295, 0f00000000; mov.f32 %f409, 0f00000000; mov.f32 %f410, %f409; mov.f32 %f411, %f409; @%p94 bra BB0_103; BB0_91: mul.f32 %f296, %f106, %f106; fma.rn.f32 %f297, %f105, %f105, %f296; fma.rn.f32 %f127, %f107, %f107, %f297; setp.neu.f32 %p95, %f127, 0f00000000; mul.f32 %f298, %f124, %f124; fma.rn.f32 %f299, %f123, %f123, %f298; fma.rn.f32 %f128, %f125, %f125, %f299; setp.neu.f32 %p96, %f128, 0f00000000; and.pred %p97, %p95, %p96; setp.eq.f32 %p98, %f126, 0f00000000; or.pred %p99, %p97, %p98; @%p99 bra BB0_93; mul.f32 %f300, %f112, %f112; fma.rn.f32 %f301, %f111, %f111, %f300; fma.rn.f32 %f302, %f113, %f113, %f301; setp.neu.f32 %p100, %f302, 0f00000000; @%p100 bra BB0_102; bra.uni BB0_93; 
BB0_102: sub.f32 %f331, %f117, %f111; mul.f32 %f409, %f331, 0f3F000000; sub.f32 %f332, %f118, %f112; mul.f32 %f410, %f332, 0f3F000000; sub.f32 %f333, %f119, %f113; mul.f32 %f411, %f333, 0f3F000000; bra.uni BB0_103; BB0_93: or.pred %p102, %p93, %p95; @%p102 bra BB0_95; bra.uni BB0_94; BB0_95: mul.f32 %f303, %f112, %f112; fma.rn.f32 %f304, %f111, %f111, %f303; fma.rn.f32 %f132, %f113, %f113, %f304; setp.neu.f32 %p103, %f132, 0f00000000; or.pred %p105, %p103, %p96; @%p105 bra BB0_97; bra.uni BB0_96; BB0_97: setp.eq.f32 %p106, %f127, 0f00000000; or.pred %p107, %p106, %p93; @%p107 bra BB0_99; bra.uni BB0_98; BB0_99: setp.eq.f32 %p109, %f128, 0f00000000; or.pred %p110, %p109, %p103; @%p110 bra BB0_101; bra.uni BB0_100; BB0_101: sub.f32 %f322, %f117, %f111; sub.f32 %f323, %f118, %f112; sub.f32 %f324, %f119, %f113; sub.f32 %f325, %f105, %f123; mul.f32 %f326, %f325, 0f3DAAAAAB; sub.f32 %f327, %f106, %f124; mul.f32 %f328, %f327, 0f3DAAAAAB; sub.f32 %f329, %f107, %f125; mul.f32 %f330, %f329, 0f3DAAAAAB; fma.rn.f32 %f409, %f322, 0f3F2AAAAB, %f326; fma.rn.f32 %f410, %f323, 0f3F2AAAAB, %f328; fma.rn.f32 %f411, %f324, 0f3F2AAAAB, %f330; bra.uni BB0_103; BB0_94: sub.f32 %f409, %f1, %f111; sub.f32 %f410, %f2, %f112; sub.f32 %f411, %f3, %f113; bra.uni BB0_103; BB0_96: sub.f32 %f409, %f117, %f1; sub.f32 %f410, %f118, %f2; sub.f32 %f411, %f119, %f3; bra.uni BB0_103; BB0_98: mul.f32 %f305, %f111, 0fC0000000; fma.rn.f32 %f306, %f105, 0f3F000000, %f305; add.f32 %f307, %f112, %f112; mul.f32 %f308, %f106, 0f3F000000; sub.f32 %f309, %f308, %f307; add.f32 %f310, %f113, %f113; mul.f32 %f311, %f107, 0f3F000000; sub.f32 %f312, %f311, %f310; fma.rn.f32 %f409, %f1, 0f3FC00000, %f306; fma.rn.f32 %f410, %f2, 0f3FC00000, %f309; fma.rn.f32 %f411, %f3, 0f3FC00000, %f312; bra.uni BB0_103; BB0_100: mul.f32 %f313, %f123, 0fBF000000; fma.rn.f32 %f314, %f117, 0f40000000, %f313; mul.f32 %f315, %f124, 0fBF000000; fma.rn.f32 %f316, %f118, 0f40000000, %f315; mul.f32 %f317, %f125, 0fBF000000; fma.rn.f32 %f318, 
%f119, 0f40000000, %f317; mul.f32 %f319, %f1, 0f3FC00000; sub.f32 %f409, %f314, %f319; mul.f32 %f320, %f2, 0f3FC00000; sub.f32 %f410, %f316, %f320; mul.f32 %f321, %f3, 0f3FC00000; sub.f32 %f411, %f318, %f321; BB0_103: setp.eq.s64 %p111, %rd7, 0; @%p111 bra BB0_105; cvta.to.global.u64 %rd64, %rd7; add.s64 %rd66, %rd64, %rd12; ld.global.nc.f32 %f334, [%rd66]; mul.f32 %f412, %f334, %f412; BB0_105: setp.eq.s64 %p112, %rd8, 0; @%p112 bra BB0_107; cvta.to.global.u64 %rd67, %rd8; add.s64 %rd69, %rd67, %rd12; ld.global.nc.f32 %f335, [%rd69]; mul.f32 %f413, %f335, %f413; BB0_107: cvta.to.global.u64 %rd70, %rd6; cvta.to.global.u64 %rd71, %rd5; cvta.to.global.u64 %rd72, %rd4; mul.f32 %f336, %f379, %f157; mul.f32 %f337, %f395, %f158; mul.f32 %f338, %f411, %f159; add.f32 %f339, %f412, %f412; mul.f32 %f340, %f1, %f339; add.f32 %f341, %f337, %f338; mul.f32 %f342, %f1, %f341; mul.f32 %f343, %f394, %f158; fma.rn.f32 %f344, %f2, %f343, %f342; mul.f32 %f345, %f409, %f159; fma.rn.f32 %f346, %f3, %f345, %f344; mul.f32 %f347, %f346, %f413; fma.rn.f32 %f348, %f336, %f340, %f347; add.s64 %rd74, %rd72, %rd12; st.global.f32 [%rd74], %f348; mul.f32 %f349, %f2, %f339; add.f32 %f350, %f336, %f338; mul.f32 %f351, %f2, %f350; mul.f32 %f352, %f380, %f157; fma.rn.f32 %f353, %f1, %f352, %f351; mul.f32 %f354, %f410, %f159; fma.rn.f32 %f355, %f3, %f354, %f353; mul.f32 %f356, %f355, %f413; fma.rn.f32 %f357, %f337, %f349, %f356; add.s64 %rd75, %rd71, %rd12; st.global.f32 [%rd75], %f357; mul.f32 %f358, %f3, %f339; mul.f32 %f359, %f396, %f158; mul.f32 %f360, %f2, %f359; mul.f32 %f361, %f381, %f157; fma.rn.f32 %f362, %f1, %f361, %f360; add.f32 %f363, %f336, %f337; fma.rn.f32 %f364, %f3, %f363, %f362; mul.f32 %f365, %f364, %f413; fma.rn.f32 %f366, %f338, %f358, %f365; add.s64 %rd76, %rd70, %rd12; st.global.f32 [%rd76], %f366; BB0_108: ret; } ` ) mumax3-3.10/cuda/maxangle.cu000066400000000000000000000052561371432437400157300ustar00rootroot00000000000000#include #include "exchange.h" #include "float3.h" 
#include "stencil.h"

// See maxangle.go for more details.

// Folds the angle between the central magnetization m0 and the neighbor cell
// at linear index i_ into the running maximum `angle`, and returns the new maximum.
//
//   angle   running maximum so far (radians)
//   i_      linear index of the neighbor cell (already clamped/wrapped by the caller)
//   m0, r0  magnetization and region of the central cell
//   mx..mz  magnetization components
//   aLUT2d  symmetric look-up table of inter-region exchange stiffness
//   regions region index per cell
//
// A neighbor with zero magnetization is replaced by m0 (contributing angle 0),
// and a neighbor is skipped entirely when the exchange stiffness between the
// two regions is zero.
static __device__ inline float
foldangle(float angle, int i_, float3 m0, uint8_t r0,
          const float* __restrict__ mx, const float* __restrict__ my, const float* __restrict__ mz,
          const float* __restrict__ aLUT2d, const uint8_t* __restrict__ regions) {

    float3 m_  = make_float3(mx[i_], my[i_], mz[i_]);  // load neighbor m
    m_         = ( is0(m_)? m0: m_ );                  // replace missing non-boundary neighbor
    float a__  = aLUT2d[symidx(r0, regions[i_])];      // inter-cell exchange stiffness

    if (a__ != 0) {
        // Rounding can push the dot product of two (nearly) unit vectors slightly
        // outside [-1, 1], where acosf returns NaN. max() would then silently drop
        // the NaN, losing e.g. the angle pi of an antiparallel pair. Clamp with
        // comparisons (not fminf/fmaxf) so that a NaN input still stays NaN and is
        // ignored by max() exactly as before.
        float c = dot(m_, m0);
        c = (c > 1.0f)? 1.0f: ((c < -1.0f)? -1.0f: c);
        angle = max(angle, acosf(c));
    }
    return angle;
}

// Sets dst to the maximum angle (radians) between each cell's magnetization and
// that of its nearest neighbors, considering only neighbors whose exchange
// stiffness with the cell (looked up in aLUT2d by region pair) is nonzero.
// The vertical (z) neighbors are only considered when Nz != 1.
// Cells with zero magnetization get angle 0.
extern "C" __global__ void
setmaxangle(float* __restrict__ dst,
            float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz,
            float* __restrict__ aLUT2d, uint8_t* __restrict__ regions,
            int Nx, int Ny, int Nz, uint8_t PBC) {

    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    int iz = blockIdx.z * blockDim.z + threadIdx.z;

    if (ix >= Nx || iy >= Ny || iz >= Nz) {
        return;
    }

    // central cell
    int I = idx(ix, iy, iz);
    float3 m0 = make_float3(mx[I], my[I], mz[I]);

    if (is0(m0)) {
        // Always write the output so the result does not depend on whatever
        // dst happened to contain before the launch.
        dst[I] = 0.0f;
        return;
    }

    uint8_t r0  = regions[I];
    float angle = 0.0f;

    // left and right neighbors; l/hclamp* clamp or wrap the index according to PBC
    angle = foldangle(angle, idx(lclampx(ix-1), iy, iz), m0, r0, mx, my, mz, aLUT2d, regions);
    angle = foldangle(angle, idx(hclampx(ix+1), iy, iz), m0, r0, mx, my, mz, aLUT2d, regions);

    // back and front neighbors
    angle = foldangle(angle, idx(ix, lclampy(iy-1), iz), m0, r0, mx, my, mz, aLUT2d, regions);
    angle = foldangle(angle, idx(ix, hclampy(iy+1), iz), m0, r0, mx, my, mz, aLUT2d, regions);

    // only take vertical derivative for 3D sim
    if (Nz != 1) {
        // bottom and top neighbors
        angle = foldangle(angle, idx(ix, iy, lclampz(iz-1)), m0, r0, mx, my, mz, aLUT2d, regions);
        angle = foldangle(angle, idx(ix, iy, hclampz(iz+1)), m0, r0, mx, my, mz, aLUT2d, regions);
    }

    dst[I] = angle;
}
setmaxangle_args.argptr[0] = unsafe.Pointer(&setmaxangle_args.arg_dst) setmaxangle_args.argptr[1] = unsafe.Pointer(&setmaxangle_args.arg_mx) setmaxangle_args.argptr[2] = unsafe.Pointer(&setmaxangle_args.arg_my) setmaxangle_args.argptr[3] = unsafe.Pointer(&setmaxangle_args.arg_mz) setmaxangle_args.argptr[4] = unsafe.Pointer(&setmaxangle_args.arg_aLUT2d) setmaxangle_args.argptr[5] = unsafe.Pointer(&setmaxangle_args.arg_regions) setmaxangle_args.argptr[6] = unsafe.Pointer(&setmaxangle_args.arg_Nx) setmaxangle_args.argptr[7] = unsafe.Pointer(&setmaxangle_args.arg_Ny) setmaxangle_args.argptr[8] = unsafe.Pointer(&setmaxangle_args.arg_Nz) setmaxangle_args.argptr[9] = unsafe.Pointer(&setmaxangle_args.arg_PBC) } // Wrapper for setmaxangle CUDA kernel, asynchronous. func k_setmaxangle_async(dst unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, aLUT2d unsafe.Pointer, regions unsafe.Pointer, Nx int, Ny int, Nz int, PBC byte, cfg *config) { if Synchronous { // debug Sync() timer.Start("setmaxangle") } setmaxangle_args.Lock() defer setmaxangle_args.Unlock() if setmaxangle_code == 0 { setmaxangle_code = fatbinLoad(setmaxangle_map, "setmaxangle") } setmaxangle_args.arg_dst = dst setmaxangle_args.arg_mx = mx setmaxangle_args.arg_my = my setmaxangle_args.arg_mz = mz setmaxangle_args.arg_aLUT2d = aLUT2d setmaxangle_args.arg_regions = regions setmaxangle_args.arg_Nx = Nx setmaxangle_args.arg_Ny = Ny setmaxangle_args.arg_Nz = Nz setmaxangle_args.arg_PBC = PBC args := setmaxangle_args.argptr[:] cu.LaunchKernel(setmaxangle_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("setmaxangle") } } // maps compute capability on PTX code for setmaxangle kernel. 
// setmaxangle_map associates a compute capability with the PTX code of the
// setmaxangle kernel built for it. Key 0 maps to the empty string.
var setmaxangle_map = map[int]string{
	0:  "",
	30: setmaxangle_ptx_30,
	32: setmaxangle_ptx_32,
	35: setmaxangle_ptx_35,
	37: setmaxangle_ptx_37,
	50: setmaxangle_ptx_50,
	52: setmaxangle_ptx_52,
	53: setmaxangle_ptx_53,
	60: setmaxangle_ptx_60,
	61: setmaxangle_ptx_61,
	62: setmaxangle_ptx_62,
	70: setmaxangle_ptx_70,
	72: setmaxangle_ptx_72,
	75: setmaxangle_ptx_75,
}
%r1; mul.wide.s32 %rd10, %r40, 4; add.s64 %rd11, %rd9, %rd10; add.s64 %rd12, %rd8, %rd10; add.s64 %rd13, %rd7, %rd10; ld.global.f32 %f1, [%rd11]; ld.global.f32 %f2, [%rd12]; mul.f32 %f34, %f2, %f2; fma.rn.f32 %f35, %f1, %f1, %f34; ld.global.f32 %f3, [%rd13]; fma.rn.f32 %f36, %f3, %f3, %f35; setp.eq.f32 %p6, %f36, 0f00000000; @%p6 bra BB0_34; cvta.to.global.u64 %rd14, %rd6; cvt.s64.s32 %rd15, %r40; add.s64 %rd16, %rd14, %rd15; ld.global.u8 %rs1, [%rd16]; and.b16 %rs5, %rs4, 1; setp.eq.b16 %p7, %rs5, 1; @!%p7 bra BB0_4; bra.uni BB0_3; BB0_3: add.s32 %r59, %r1, -1; rem.s32 %r60, %r59, %r27; add.s32 %r61, %r60, %r27; rem.s32 %r231, %r61, %r27; bra.uni BB0_5; BB0_4: add.s32 %r66, %r1, -1; mov.u32 %r67, 0; max.s32 %r231, %r66, %r67; BB0_5: mad.lo.s32 %r77, %r39, %r27, %r231; cvt.s64.s32 %rd18, %r77; mul.wide.s32 %rd19, %r77, 4; add.s64 %rd20, %rd9, %rd19; add.s64 %rd22, %rd8, %rd19; add.s64 %rd24, %rd7, %rd19; ld.global.f32 %f38, [%rd20]; ld.global.f32 %f39, [%rd22]; mul.f32 %f40, %f39, %f39; fma.rn.f32 %f41, %f38, %f38, %f40; ld.global.f32 %f42, [%rd24]; fma.rn.f32 %f43, %f42, %f42, %f41; setp.eq.f32 %p8, %f43, 0f00000000; selp.f32 %f4, %f1, %f38, %p8; selp.f32 %f5, %f2, %f39, %p8; selp.f32 %f6, %f3, %f42, %p8; add.s64 %rd26, %rd14, %rd18; ld.global.u8 %rs6, [%rd26]; setp.gt.u16 %p9, %rs6, %rs1; cvt.u32.u16 %r78, %rs6; cvt.u32.u16 %r79, %rs1; and.b32 %r80, %r79, 255; selp.b32 %r81, %r80, %r78, %p9; selp.b32 %r82, %r78, %r80, %p9; add.s32 %r83, %r82, 1; mul.lo.s32 %r84, %r83, %r82; shr.u32 %r85, %r84, 1; add.s32 %r86, %r85, %r81; cvta.to.global.u64 %rd27, %rd5; mul.wide.s32 %rd28, %r86, 4; add.s64 %rd29, %rd27, %rd28; ld.global.f32 %f44, [%rd29]; mov.f32 %f250, 0f00000000; setp.eq.f32 %p10, %f44, 0f00000000; @%p10 bra BB0_7; mul.f32 %f45, %f2, %f5; fma.rn.f32 %f46, %f1, %f4, %f45; fma.rn.f32 %f47, %f3, %f6, %f46; abs.f32 %f48, %f47; mov.f32 %f49, 0f3F800000; sub.f32 %f50, %f49, %f48; mul.f32 %f51, %f50, 0f3F000000; sqrt.rn.f32 %f52, %f51; setp.gt.f32 %p11, %f48, 
0f3F11EB85; selp.f32 %f53, %f52, %f48, %p11; mul.f32 %f54, %f53, %f53; mov.f32 %f55, 0f3C94D2E9; mov.f32 %f56, 0f3D53F941; fma.rn.f32 %f57, %f56, %f54, %f55; mov.f32 %f58, 0f3D3F841F; fma.rn.f32 %f59, %f57, %f54, %f58; mov.f32 %f60, 0f3D994929; fma.rn.f32 %f61, %f59, %f54, %f60; mov.f32 %f62, 0f3E2AAB94; fma.rn.f32 %f63, %f61, %f54, %f62; mul.f32 %f64, %f54, %f63; fma.rn.f32 %f65, %f64, %f53, %f53; add.f32 %f66, %f65, %f65; mov.f32 %f67, 0f3FC90FDB; sub.f32 %f68, %f67, %f65; selp.f32 %f69, %f66, %f68, %p11; setp.lt.f32 %p12, %f47, 0f00000000; mov.f32 %f70, 0f40490FDB; sub.f32 %f71, %f70, %f69; selp.f32 %f72, %f71, %f69, %p12; mov.f32 %f73, 0f00000000; max.f32 %f250, %f73, %f72; BB0_7: setp.eq.b16 %p13, %rs5, 1; add.s32 %r7, %r1, 1; @!%p13 bra BB0_9; bra.uni BB0_8; BB0_8: rem.s32 %r91, %r7, %r27; add.s32 %r92, %r91, %r27; rem.s32 %r232, %r92, %r27; bra.uni BB0_10; BB0_9: add.s32 %r93, %r27, -1; min.s32 %r232, %r7, %r93; BB0_10: mad.lo.s32 %r103, %r39, %r27, %r232; cvt.s64.s32 %rd31, %r103; mul.wide.s32 %rd32, %r103, 4; add.s64 %rd33, %rd9, %rd32; add.s64 %rd35, %rd8, %rd32; add.s64 %rd37, %rd7, %rd32; ld.global.f32 %f74, [%rd33]; ld.global.f32 %f75, [%rd35]; mul.f32 %f76, %f75, %f75; fma.rn.f32 %f77, %f74, %f74, %f76; ld.global.f32 %f78, [%rd37]; fma.rn.f32 %f79, %f78, %f78, %f77; setp.eq.f32 %p14, %f79, 0f00000000; selp.f32 %f9, %f1, %f74, %p14; selp.f32 %f10, %f2, %f75, %p14; selp.f32 %f11, %f3, %f78, %p14; add.s64 %rd39, %rd14, %rd31; ld.global.u8 %rs9, [%rd39]; setp.gt.u16 %p15, %rs9, %rs1; cvt.u32.u16 %r104, %rs9; selp.b32 %r107, %r80, %r104, %p15; selp.b32 %r108, %r104, %r80, %p15; add.s32 %r109, %r108, 1; mul.lo.s32 %r110, %r109, %r108; shr.u32 %r111, %r110, 1; add.s32 %r112, %r111, %r107; mul.wide.s32 %rd41, %r112, 4; add.s64 %rd42, %rd27, %rd41; ld.global.f32 %f80, [%rd42]; setp.eq.f32 %p16, %f80, 0f00000000; @%p16 bra BB0_12; mul.f32 %f81, %f2, %f10; fma.rn.f32 %f82, %f1, %f9, %f81; fma.rn.f32 %f83, %f3, %f11, %f82; abs.f32 %f84, %f83; mov.f32 %f85, 
0f3F800000; sub.f32 %f86, %f85, %f84; mul.f32 %f87, %f86, 0f3F000000; sqrt.rn.f32 %f88, %f87; setp.gt.f32 %p17, %f84, 0f3F11EB85; selp.f32 %f89, %f88, %f84, %p17; mul.f32 %f90, %f89, %f89; mov.f32 %f91, 0f3C94D2E9; mov.f32 %f92, 0f3D53F941; fma.rn.f32 %f93, %f92, %f90, %f91; mov.f32 %f94, 0f3D3F841F; fma.rn.f32 %f95, %f93, %f90, %f94; mov.f32 %f96, 0f3D994929; fma.rn.f32 %f97, %f95, %f90, %f96; mov.f32 %f98, 0f3E2AAB94; fma.rn.f32 %f99, %f97, %f90, %f98; mul.f32 %f100, %f90, %f99; fma.rn.f32 %f101, %f100, %f89, %f89; add.f32 %f102, %f101, %f101; mov.f32 %f103, 0f3FC90FDB; sub.f32 %f104, %f103, %f101; selp.f32 %f105, %f102, %f104, %p17; setp.lt.f32 %p18, %f83, 0f00000000; mov.f32 %f106, 0f40490FDB; sub.f32 %f107, %f106, %f105; selp.f32 %f108, %f107, %f105, %p18; max.f32 %f250, %f250, %f108; BB0_12: and.b16 %rs2, %rs4, 2; setp.eq.s16 %p19, %rs2, 0; add.s32 %r11, %r2, -1; @%p19 bra BB0_14; rem.s32 %r117, %r11, %r28; add.s32 %r118, %r117, %r28; rem.s32 %r233, %r118, %r28; bra.uni BB0_15; BB0_14: mov.u32 %r119, 0; max.s32 %r233, %r11, %r119; BB0_15: mad.lo.s32 %r124, %r3, %r28, %r233; mad.lo.s32 %r129, %r124, %r27, %r1; cvt.s64.s32 %rd44, %r129; mul.wide.s32 %rd45, %r129, 4; add.s64 %rd46, %rd9, %rd45; add.s64 %rd48, %rd8, %rd45; add.s64 %rd50, %rd7, %rd45; ld.global.f32 %f109, [%rd46]; ld.global.f32 %f110, [%rd48]; mul.f32 %f111, %f110, %f110; fma.rn.f32 %f112, %f109, %f109, %f111; ld.global.f32 %f113, [%rd50]; fma.rn.f32 %f114, %f113, %f113, %f112; setp.eq.f32 %p20, %f114, 0f00000000; selp.f32 %f14, %f1, %f109, %p20; selp.f32 %f15, %f2, %f110, %p20; selp.f32 %f16, %f3, %f113, %p20; add.s64 %rd52, %rd14, %rd44; ld.global.u8 %rs11, [%rd52]; setp.gt.u16 %p21, %rs11, %rs1; cvt.u32.u16 %r130, %rs11; selp.b32 %r133, %r80, %r130, %p21; selp.b32 %r134, %r130, %r80, %p21; add.s32 %r135, %r134, 1; mul.lo.s32 %r136, %r135, %r134; shr.u32 %r137, %r136, 1; add.s32 %r138, %r137, %r133; mul.wide.s32 %rd54, %r138, 4; add.s64 %rd55, %rd27, %rd54; ld.global.f32 %f115, [%rd55]; 
setp.eq.f32 %p22, %f115, 0f00000000; @%p22 bra BB0_17; mul.f32 %f116, %f2, %f15; fma.rn.f32 %f117, %f1, %f14, %f116; fma.rn.f32 %f118, %f3, %f16, %f117; abs.f32 %f119, %f118; mov.f32 %f120, 0f3F800000; sub.f32 %f121, %f120, %f119; mul.f32 %f122, %f121, 0f3F000000; sqrt.rn.f32 %f123, %f122; setp.gt.f32 %p23, %f119, 0f3F11EB85; selp.f32 %f124, %f123, %f119, %p23; mul.f32 %f125, %f124, %f124; mov.f32 %f126, 0f3C94D2E9; mov.f32 %f127, 0f3D53F941; fma.rn.f32 %f128, %f127, %f125, %f126; mov.f32 %f129, 0f3D3F841F; fma.rn.f32 %f130, %f128, %f125, %f129; mov.f32 %f131, 0f3D994929; fma.rn.f32 %f132, %f130, %f125, %f131; mov.f32 %f133, 0f3E2AAB94; fma.rn.f32 %f134, %f132, %f125, %f133; mul.f32 %f135, %f125, %f134; fma.rn.f32 %f136, %f135, %f124, %f124; add.f32 %f137, %f136, %f136; mov.f32 %f138, 0f3FC90FDB; sub.f32 %f139, %f138, %f136; selp.f32 %f140, %f137, %f139, %p23; setp.lt.f32 %p24, %f118, 0f00000000; mov.f32 %f141, 0f40490FDB; sub.f32 %f142, %f141, %f140; selp.f32 %f143, %f142, %f140, %p24; max.f32 %f250, %f250, %f143; BB0_17: add.s32 %r15, %r2, 1; @%p19 bra BB0_19; rem.s32 %r143, %r15, %r28; add.s32 %r144, %r143, %r28; rem.s32 %r234, %r144, %r28; bra.uni BB0_20; BB0_19: add.s32 %r145, %r28, -1; min.s32 %r234, %r15, %r145; BB0_20: mad.lo.s32 %r150, %r3, %r28, %r234; mad.lo.s32 %r155, %r150, %r27, %r1; cvt.s64.s32 %rd57, %r155; mul.wide.s32 %rd58, %r155, 4; add.s64 %rd59, %rd9, %rd58; add.s64 %rd61, %rd8, %rd58; add.s64 %rd63, %rd7, %rd58; ld.global.f32 %f144, [%rd59]; ld.global.f32 %f145, [%rd61]; mul.f32 %f146, %f145, %f145; fma.rn.f32 %f147, %f144, %f144, %f146; ld.global.f32 %f148, [%rd63]; fma.rn.f32 %f149, %f148, %f148, %f147; setp.eq.f32 %p26, %f149, 0f00000000; selp.f32 %f19, %f1, %f144, %p26; selp.f32 %f20, %f2, %f145, %p26; selp.f32 %f21, %f3, %f148, %p26; add.s64 %rd65, %rd14, %rd57; ld.global.u8 %rs14, [%rd65]; setp.gt.u16 %p27, %rs14, %rs1; cvt.u32.u16 %r156, %rs14; selp.b32 %r159, %r80, %r156, %p27; selp.b32 %r160, %r156, %r80, %p27; add.s32 %r161, %r160, 
1; mul.lo.s32 %r162, %r161, %r160; shr.u32 %r163, %r162, 1; add.s32 %r164, %r163, %r159; mul.wide.s32 %rd67, %r164, 4; add.s64 %rd68, %rd27, %rd67; ld.global.f32 %f150, [%rd68]; setp.eq.f32 %p28, %f150, 0f00000000; @%p28 bra BB0_22; mul.f32 %f151, %f2, %f20; fma.rn.f32 %f152, %f1, %f19, %f151; fma.rn.f32 %f153, %f3, %f21, %f152; abs.f32 %f154, %f153; mov.f32 %f155, 0f3F800000; sub.f32 %f156, %f155, %f154; mul.f32 %f157, %f156, 0f3F000000; sqrt.rn.f32 %f158, %f157; setp.gt.f32 %p29, %f154, 0f3F11EB85; selp.f32 %f159, %f158, %f154, %p29; mul.f32 %f160, %f159, %f159; mov.f32 %f161, 0f3C94D2E9; mov.f32 %f162, 0f3D53F941; fma.rn.f32 %f163, %f162, %f160, %f161; mov.f32 %f164, 0f3D3F841F; fma.rn.f32 %f165, %f163, %f160, %f164; mov.f32 %f166, 0f3D994929; fma.rn.f32 %f167, %f165, %f160, %f166; mov.f32 %f168, 0f3E2AAB94; fma.rn.f32 %f169, %f167, %f160, %f168; mul.f32 %f170, %f160, %f169; fma.rn.f32 %f171, %f170, %f159, %f159; add.f32 %f172, %f171, %f171; mov.f32 %f173, 0f3FC90FDB; sub.f32 %f174, %f173, %f171; selp.f32 %f175, %f172, %f174, %p29; setp.lt.f32 %p30, %f153, 0f00000000; mov.f32 %f176, 0f40490FDB; sub.f32 %f177, %f176, %f175; selp.f32 %f178, %f177, %f175, %p30; max.f32 %f250, %f250, %f178; BB0_22: setp.eq.s32 %p31, %r29, 1; @%p31 bra BB0_33; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p32, %rs3, 0; add.s32 %r19, %r3, -1; @%p32 bra BB0_25; rem.s32 %r169, %r19, %r29; add.s32 %r170, %r169, %r29; rem.s32 %r235, %r170, %r29; bra.uni BB0_26; BB0_25: mov.u32 %r171, 0; max.s32 %r235, %r19, %r171; BB0_26: mad.lo.s32 %r176, %r235, %r28, %r2; mad.lo.s32 %r181, %r176, %r27, %r1; cvt.s64.s32 %rd70, %r181; mul.wide.s32 %rd71, %r181, 4; add.s64 %rd72, %rd9, %rd71; add.s64 %rd74, %rd8, %rd71; add.s64 %rd76, %rd7, %rd71; ld.global.f32 %f179, [%rd72]; ld.global.f32 %f180, [%rd74]; mul.f32 %f181, %f180, %f180; fma.rn.f32 %f182, %f179, %f179, %f181; ld.global.f32 %f183, [%rd76]; fma.rn.f32 %f184, %f183, %f183, %f182; setp.eq.f32 %p33, %f184, 0f00000000; selp.f32 %f24, %f1, %f179, %p33; 
selp.f32 %f25, %f2, %f180, %p33; selp.f32 %f26, %f3, %f183, %p33; add.s64 %rd78, %rd14, %rd70; ld.global.u8 %rs16, [%rd78]; setp.gt.u16 %p34, %rs16, %rs1; cvt.u32.u16 %r182, %rs16; selp.b32 %r185, %r80, %r182, %p34; selp.b32 %r186, %r182, %r80, %p34; add.s32 %r187, %r186, 1; mul.lo.s32 %r188, %r187, %r186; shr.u32 %r189, %r188, 1; add.s32 %r190, %r189, %r185; mul.wide.s32 %rd80, %r190, 4; add.s64 %rd81, %rd27, %rd80; ld.global.f32 %f185, [%rd81]; setp.eq.f32 %p35, %f185, 0f00000000; @%p35 bra BB0_28; mul.f32 %f186, %f2, %f25; fma.rn.f32 %f187, %f1, %f24, %f186; fma.rn.f32 %f188, %f3, %f26, %f187; abs.f32 %f189, %f188; mov.f32 %f190, 0f3F800000; sub.f32 %f191, %f190, %f189; mul.f32 %f192, %f191, 0f3F000000; sqrt.rn.f32 %f193, %f192; setp.gt.f32 %p36, %f189, 0f3F11EB85; selp.f32 %f194, %f193, %f189, %p36; mul.f32 %f195, %f194, %f194; mov.f32 %f196, 0f3C94D2E9; mov.f32 %f197, 0f3D53F941; fma.rn.f32 %f198, %f197, %f195, %f196; mov.f32 %f199, 0f3D3F841F; fma.rn.f32 %f200, %f198, %f195, %f199; mov.f32 %f201, 0f3D994929; fma.rn.f32 %f202, %f200, %f195, %f201; mov.f32 %f203, 0f3E2AAB94; fma.rn.f32 %f204, %f202, %f195, %f203; mul.f32 %f205, %f195, %f204; fma.rn.f32 %f206, %f205, %f194, %f194; add.f32 %f207, %f206, %f206; mov.f32 %f208, 0f3FC90FDB; sub.f32 %f209, %f208, %f206; selp.f32 %f210, %f207, %f209, %p36; setp.lt.f32 %p37, %f188, 0f00000000; mov.f32 %f211, 0f40490FDB; sub.f32 %f212, %f211, %f210; selp.f32 %f213, %f212, %f210, %p37; max.f32 %f250, %f250, %f213; BB0_28: add.s32 %r23, %r3, 1; @%p32 bra BB0_30; rem.s32 %r195, %r23, %r29; add.s32 %r196, %r195, %r29; rem.s32 %r236, %r196, %r29; bra.uni BB0_31; BB0_30: add.s32 %r197, %r29, -1; min.s32 %r236, %r23, %r197; BB0_31: mad.lo.s32 %r202, %r236, %r28, %r2; mad.lo.s32 %r207, %r202, %r27, %r1; cvt.s64.s32 %rd83, %r207; mul.wide.s32 %rd84, %r207, 4; add.s64 %rd85, %rd9, %rd84; add.s64 %rd87, %rd8, %rd84; add.s64 %rd89, %rd7, %rd84; ld.global.f32 %f214, [%rd85]; ld.global.f32 %f215, [%rd87]; mul.f32 %f216, %f215, %f215; 
fma.rn.f32 %f217, %f214, %f214, %f216; ld.global.f32 %f218, [%rd89]; fma.rn.f32 %f219, %f218, %f218, %f217; setp.eq.f32 %p39, %f219, 0f00000000; selp.f32 %f29, %f1, %f214, %p39; selp.f32 %f30, %f2, %f215, %p39; selp.f32 %f31, %f3, %f218, %p39; add.s64 %rd91, %rd14, %rd83; ld.global.u8 %rs19, [%rd91]; setp.gt.u16 %p40, %rs19, %rs1; cvt.u32.u16 %r208, %rs19; selp.b32 %r211, %r80, %r208, %p40; selp.b32 %r212, %r208, %r80, %p40; add.s32 %r213, %r212, 1; mul.lo.s32 %r214, %r213, %r212; shr.u32 %r215, %r214, 1; add.s32 %r216, %r215, %r211; mul.wide.s32 %rd93, %r216, 4; add.s64 %rd94, %rd27, %rd93; ld.global.f32 %f220, [%rd94]; setp.eq.f32 %p41, %f220, 0f00000000; @%p41 bra BB0_33; mul.f32 %f221, %f2, %f30; fma.rn.f32 %f222, %f1, %f29, %f221; fma.rn.f32 %f223, %f3, %f31, %f222; abs.f32 %f224, %f223; mov.f32 %f225, 0f3F800000; sub.f32 %f226, %f225, %f224; mul.f32 %f227, %f226, 0f3F000000; sqrt.rn.f32 %f228, %f227; setp.gt.f32 %p42, %f224, 0f3F11EB85; selp.f32 %f229, %f228, %f224, %p42; mul.f32 %f230, %f229, %f229; mov.f32 %f231, 0f3C94D2E9; mov.f32 %f232, 0f3D53F941; fma.rn.f32 %f233, %f232, %f230, %f231; mov.f32 %f234, 0f3D3F841F; fma.rn.f32 %f235, %f233, %f230, %f234; mov.f32 %f236, 0f3D994929; fma.rn.f32 %f237, %f235, %f230, %f236; mov.f32 %f238, 0f3E2AAB94; fma.rn.f32 %f239, %f237, %f230, %f238; mul.f32 %f240, %f230, %f239; fma.rn.f32 %f241, %f240, %f229, %f229; add.f32 %f242, %f241, %f241; mov.f32 %f243, 0f3FC90FDB; sub.f32 %f244, %f243, %f241; selp.f32 %f245, %f242, %f244, %p42; setp.lt.f32 %p43, %f223, 0f00000000; mov.f32 %f246, 0f40490FDB; sub.f32 %f247, %f246, %f245; selp.f32 %f248, %f247, %f245, %p43; max.f32 %f250, %f250, %f248; BB0_33: cvta.to.global.u64 %rd95, %rd1; add.s64 %rd97, %rd95, %rd10; st.global.f32 [%rd97], %f250; BB0_34: ret; } ` setmaxangle_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl setmaxangle .visible .entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param 
.u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, .param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, .param .u32 setmaxangle_param_8, .param .u8 setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<26>; .reg .f32 %f<255>; .reg .b32 %r<128>; .reg .b64 %rd<69>; ld.param.u64 %rd6, [setmaxangle_param_0]; ld.param.u64 %rd7, [setmaxangle_param_1]; ld.param.u64 %rd8, [setmaxangle_param_2]; ld.param.u64 %rd9, [setmaxangle_param_3]; ld.param.u64 %rd10, [setmaxangle_param_4]; ld.param.u64 %rd11, [setmaxangle_param_5]; ld.param.u32 %r32, [setmaxangle_param_6]; ld.param.u32 %r33, [setmaxangle_param_7]; ld.param.u32 %r34, [setmaxangle_param_8]; ld.param.u8 %rs5, [setmaxangle_param_9]; cvta.to.global.u64 %rd1, %rd10; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd9; cvta.to.global.u64 %rd4, %rd8; cvta.to.global.u64 %rd5, %rd7; mov.u32 %r35, %ntid.x; mov.u32 %r36, %ctaid.x; mov.u32 %r37, %tid.x; mad.lo.s32 %r1, %r35, %r36, %r37; mov.u32 %r38, %ntid.y; mov.u32 %r39, %ctaid.y; mov.u32 %r40, %tid.y; mad.lo.s32 %r2, %r38, %r39, %r40; mov.u32 %r41, %ntid.z; mov.u32 %r42, %ctaid.z; mov.u32 %r43, %tid.z; mad.lo.s32 %r3, %r41, %r42, %r43; setp.ge.s32 %p1, %r2, %r33; setp.ge.s32 %p2, %r1, %r32; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r34; or.pred %p5, %p3, %p4; @%p5 bra BB0_34; mul.lo.s32 %r4, %r3, %r33; add.s32 %r44, %r4, %r2; mul.lo.s32 %r5, %r44, %r32; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd12, %r6, 4; add.s64 %rd13, %rd5, %rd12; add.s64 %rd14, %rd4, %rd12; add.s64 %rd15, %rd3, %rd12; ld.global.nc.f32 %f1, [%rd13]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f38, %f2, %f2; fma.rn.f32 %f39, %f1, %f1, %f38; ld.global.nc.f32 %f3, [%rd15]; fma.rn.f32 %f40, %f3, %f3, %f39; setp.eq.f32 %p6, %f40, 0f00000000; @%p6 bra BB0_34; cvt.s64.s32 %rd16, %r6; add.s64 %rd17, %rd2, %rd16; ld.global.nc.u8 %rs1, [%rd17]; cvt.u32.u16 %r45, %rs1; and.b32 %r7, %r45, 255; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, 
%r1, -1; @%p7 bra BB0_4; rem.s32 %r46, %r8, %r32; add.s32 %r47, %r46, %r32; rem.s32 %r122, %r47, %r32; bra.uni BB0_5; BB0_4: mov.u32 %r48, 0; max.s32 %r122, %r8, %r48; BB0_5: add.s32 %r49, %r122, %r5; cvt.s64.s32 %rd18, %r49; mul.wide.s32 %rd19, %r49, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd21]; mul.f32 %f42, %f5, %f5; fma.rn.f32 %f43, %f4, %f4, %f42; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f43; add.s64 %rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; setp.gt.u16 %p8, %rs6, %rs1; cvt.u32.u16 %r50, %rs6; and.b32 %r51, %r50, 255; selp.b32 %r52, %r7, %r51, %p8; selp.b32 %r53, %r51, %r7, %p8; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd24, %r57, 4; add.s64 %rd25, %rd1, %rd24; ld.global.nc.f32 %f44, [%rd25]; mov.f32 %f250, 0f00000000; setp.eq.f32 %p9, %f44, 0f00000000; @%p9 bra BB0_7; setp.eq.f32 %p10, %f7, 0f00000000; selp.f32 %f45, %f1, %f4, %p10; selp.f32 %f46, %f2, %f5, %p10; mul.f32 %f47, %f2, %f46; fma.rn.f32 %f48, %f1, %f45, %f47; selp.f32 %f49, %f3, %f6, %p10; fma.rn.f32 %f50, %f3, %f49, %f48; abs.f32 %f51, %f50; mov.f32 %f52, 0f3F800000; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, 0f3F000000; sqrt.rn.f32 %f55, %f54; setp.gt.f32 %p11, %f51, 0f3F11EB85; selp.f32 %f56, %f55, %f51, %p11; mul.f32 %f57, %f56, %f56; mov.f32 %f58, 0f3C94D2E9; mov.f32 %f59, 0f3D53F941; fma.rn.f32 %f60, %f59, %f57, %f58; mov.f32 %f61, 0f3D3F841F; fma.rn.f32 %f62, %f60, %f57, %f61; mov.f32 %f63, 0f3D994929; fma.rn.f32 %f64, %f62, %f57, %f63; mov.f32 %f65, 0f3E2AAB94; fma.rn.f32 %f66, %f64, %f57, %f65; mul.f32 %f67, %f57, %f66; fma.rn.f32 %f68, %f67, %f56, %f56; add.f32 %f69, %f68, %f68; mov.f32 %f70, 0f3FC90FDB; sub.f32 %f71, %f70, %f68; selp.f32 %f72, %f69, %f71, %p11; setp.lt.f32 %p12, %f50, 0f00000000; mov.f32 %f73, 0f40490FDB; sub.f32 %f74, %f73, %f72; selp.f32 %f75, %f74, %f72, %p12; 
mov.f32 %f76, 0f00000000; max.f32 %f250, %f76, %f75; BB0_7: add.s32 %r12, %r1, 1; @%p7 bra BB0_9; rem.s32 %r58, %r12, %r32; add.s32 %r59, %r58, %r32; rem.s32 %r123, %r59, %r32; bra.uni BB0_10; BB0_9: add.s32 %r60, %r32, -1; min.s32 %r123, %r12, %r60; BB0_10: add.s32 %r61, %r123, %r5; cvt.s64.s32 %rd26, %r61; mul.wide.s32 %rd27, %r61, 4; add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f10, [%rd28]; ld.global.nc.f32 %f11, [%rd29]; mul.f32 %f77, %f11, %f11; fma.rn.f32 %f78, %f10, %f10, %f77; ld.global.nc.f32 %f12, [%rd30]; fma.rn.f32 %f13, %f12, %f12, %f78; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs9, [%rd31]; setp.gt.u16 %p14, %rs9, %rs1; cvt.u32.u16 %r62, %rs9; and.b32 %r63, %r62, 255; selp.b32 %r64, %r7, %r63, %p14; selp.b32 %r65, %r63, %r7, %p14; add.s32 %r66, %r65, 1; mul.lo.s32 %r67, %r66, %r65; shr.u32 %r68, %r67, 1; add.s32 %r69, %r68, %r64; mul.wide.s32 %rd32, %r69, 4; add.s64 %rd33, %rd1, %rd32; ld.global.nc.f32 %f79, [%rd33]; setp.eq.f32 %p15, %f79, 0f00000000; @%p15 bra BB0_12; setp.eq.f32 %p16, %f13, 0f00000000; selp.f32 %f80, %f1, %f10, %p16; selp.f32 %f81, %f2, %f11, %p16; mul.f32 %f82, %f2, %f81; fma.rn.f32 %f83, %f1, %f80, %f82; selp.f32 %f84, %f3, %f12, %p16; fma.rn.f32 %f85, %f3, %f84, %f83; abs.f32 %f86, %f85; mov.f32 %f87, 0f3F800000; sub.f32 %f88, %f87, %f86; mul.f32 %f89, %f88, 0f3F000000; sqrt.rn.f32 %f90, %f89; setp.gt.f32 %p17, %f86, 0f3F11EB85; selp.f32 %f91, %f90, %f86, %p17; mul.f32 %f92, %f91, %f91; mov.f32 %f93, 0f3C94D2E9; mov.f32 %f94, 0f3D53F941; fma.rn.f32 %f95, %f94, %f92, %f93; mov.f32 %f96, 0f3D3F841F; fma.rn.f32 %f97, %f95, %f92, %f96; mov.f32 %f98, 0f3D994929; fma.rn.f32 %f99, %f97, %f92, %f98; mov.f32 %f100, 0f3E2AAB94; fma.rn.f32 %f101, %f99, %f92, %f100; mul.f32 %f102, %f92, %f101; fma.rn.f32 %f103, %f102, %f91, %f91; add.f32 %f104, %f103, %f103; mov.f32 %f105, 0f3FC90FDB; sub.f32 %f106, %f105, %f103; selp.f32 %f107, %f104, %f106, %p17; setp.lt.f32 %p18, %f85, 
0f00000000; mov.f32 %f108, 0f40490FDB; sub.f32 %f109, %f108, %f107; selp.f32 %f110, %f109, %f107, %p18; max.f32 %f250, %f250, %f110; BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r16, %r2, -1; @%p19 bra BB0_14; rem.s32 %r70, %r16, %r33; add.s32 %r71, %r70, %r33; rem.s32 %r124, %r71, %r33; bra.uni BB0_15; BB0_14: mov.u32 %r72, 0; max.s32 %r124, %r16, %r72; BB0_15: add.s32 %r73, %r124, %r4; mad.lo.s32 %r74, %r73, %r32, %r1; cvt.s64.s32 %rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, %rd3, %rd35; ld.global.nc.f32 %f16, [%rd36]; ld.global.nc.f32 %f17, [%rd37]; mul.f32 %f111, %f17, %f17; fma.rn.f32 %f112, %f16, %f16, %f111; ld.global.nc.f32 %f18, [%rd38]; fma.rn.f32 %f19, %f18, %f18, %f112; add.s64 %rd39, %rd2, %rd34; ld.global.nc.u8 %rs12, [%rd39]; setp.gt.u16 %p20, %rs12, %rs1; cvt.u32.u16 %r75, %rs12; and.b32 %r76, %r75, 255; selp.b32 %r77, %r7, %r76, %p20; selp.b32 %r78, %r76, %r7, %p20; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; add.s32 %r82, %r81, %r77; mul.wide.s32 %rd40, %r82, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f113, [%rd41]; setp.eq.f32 %p21, %f113, 0f00000000; @%p21 bra BB0_17; setp.eq.f32 %p22, %f19, 0f00000000; selp.f32 %f114, %f1, %f16, %p22; selp.f32 %f115, %f2, %f17, %p22; mul.f32 %f116, %f2, %f115; fma.rn.f32 %f117, %f1, %f114, %f116; selp.f32 %f118, %f3, %f18, %p22; fma.rn.f32 %f119, %f3, %f118, %f117; abs.f32 %f120, %f119; mov.f32 %f121, 0f3F800000; sub.f32 %f122, %f121, %f120; mul.f32 %f123, %f122, 0f3F000000; sqrt.rn.f32 %f124, %f123; setp.gt.f32 %p23, %f120, 0f3F11EB85; selp.f32 %f125, %f124, %f120, %p23; mul.f32 %f126, %f125, %f125; mov.f32 %f127, 0f3C94D2E9; mov.f32 %f128, 0f3D53F941; fma.rn.f32 %f129, %f128, %f126, %f127; mov.f32 %f130, 0f3D3F841F; fma.rn.f32 %f131, %f129, %f126, %f130; mov.f32 %f132, 0f3D994929; fma.rn.f32 %f133, %f131, %f126, %f132; mov.f32 %f134, 0f3E2AAB94; fma.rn.f32 %f135, %f133, %f126, %f134; 
mul.f32 %f136, %f126, %f135; fma.rn.f32 %f137, %f136, %f125, %f125; add.f32 %f138, %f137, %f137; mov.f32 %f139, 0f3FC90FDB; sub.f32 %f140, %f139, %f137; selp.f32 %f141, %f138, %f140, %p23; setp.lt.f32 %p24, %f119, 0f00000000; mov.f32 %f142, 0f40490FDB; sub.f32 %f143, %f142, %f141; selp.f32 %f144, %f143, %f141, %p24; max.f32 %f250, %f250, %f144; BB0_17: add.s32 %r20, %r2, 1; @%p19 bra BB0_19; rem.s32 %r83, %r20, %r33; add.s32 %r84, %r83, %r33; rem.s32 %r125, %r84, %r33; bra.uni BB0_20; BB0_19: add.s32 %r85, %r33, -1; min.s32 %r125, %r20, %r85; BB0_20: add.s32 %r86, %r125, %r4; mad.lo.s32 %r87, %r86, %r32, %r1; cvt.s64.s32 %rd42, %r87; mul.wide.s32 %rd43, %r87, 4; add.s64 %rd44, %rd5, %rd43; add.s64 %rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f22, [%rd44]; ld.global.nc.f32 %f23, [%rd45]; mul.f32 %f145, %f23, %f23; fma.rn.f32 %f146, %f22, %f22, %f145; ld.global.nc.f32 %f24, [%rd46]; fma.rn.f32 %f25, %f24, %f24, %f146; add.s64 %rd47, %rd2, %rd42; ld.global.nc.u8 %rs16, [%rd47]; setp.gt.u16 %p26, %rs16, %rs1; cvt.u32.u16 %r88, %rs16; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p26; selp.b32 %r91, %r89, %r7, %p26; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f147, [%rd49]; setp.eq.f32 %p27, %f147, 0f00000000; @%p27 bra BB0_22; setp.eq.f32 %p28, %f25, 0f00000000; selp.f32 %f148, %f1, %f22, %p28; selp.f32 %f149, %f2, %f23, %p28; mul.f32 %f150, %f2, %f149; fma.rn.f32 %f151, %f1, %f148, %f150; selp.f32 %f152, %f3, %f24, %p28; fma.rn.f32 %f153, %f3, %f152, %f151; abs.f32 %f154, %f153; mov.f32 %f155, 0f3F800000; sub.f32 %f156, %f155, %f154; mul.f32 %f157, %f156, 0f3F000000; sqrt.rn.f32 %f158, %f157; setp.gt.f32 %p29, %f154, 0f3F11EB85; selp.f32 %f159, %f158, %f154, %p29; mul.f32 %f160, %f159, %f159; mov.f32 %f161, 0f3C94D2E9; mov.f32 %f162, 0f3D53F941; fma.rn.f32 %f163, %f162, %f160, %f161; mov.f32 %f164, 0f3D3F841F; 
fma.rn.f32 %f165, %f163, %f160, %f164; mov.f32 %f166, 0f3D994929; fma.rn.f32 %f167, %f165, %f160, %f166; mov.f32 %f168, 0f3E2AAB94; fma.rn.f32 %f169, %f167, %f160, %f168; mul.f32 %f170, %f160, %f169; fma.rn.f32 %f171, %f170, %f159, %f159; add.f32 %f172, %f171, %f171; mov.f32 %f173, 0f3FC90FDB; sub.f32 %f174, %f173, %f171; selp.f32 %f175, %f172, %f174, %p29; setp.lt.f32 %p30, %f153, 0f00000000; mov.f32 %f176, 0f40490FDB; sub.f32 %f177, %f176, %f175; selp.f32 %f178, %f177, %f175, %p30; max.f32 %f250, %f250, %f178; BB0_22: setp.eq.s32 %p31, %r34, 1; @%p31 bra BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 0; add.s32 %r24, %r3, -1; @%p32 bra BB0_25; rem.s32 %r96, %r24, %r34; add.s32 %r97, %r96, %r34; rem.s32 %r126, %r97, %r34; bra.uni BB0_26; BB0_25: mov.u32 %r98, 0; max.s32 %r126, %r24, %r98; BB0_26: mad.lo.s32 %r99, %r126, %r33, %r2; mad.lo.s32 %r100, %r99, %r32, %r1; cvt.s64.s32 %rd50, %r100; mul.wide.s32 %rd51, %r100, 4; add.s64 %rd52, %rd5, %rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; ld.global.nc.f32 %f179, [%rd52]; ld.global.nc.f32 %f180, [%rd53]; mul.f32 %f181, %f180, %f180; fma.rn.f32 %f182, %f179, %f179, %f181; ld.global.nc.f32 %f183, [%rd54]; fma.rn.f32 %f184, %f183, %f183, %f182; setp.eq.f32 %p33, %f184, 0f00000000; selp.f32 %f28, %f1, %f179, %p33; selp.f32 %f29, %f2, %f180, %p33; selp.f32 %f30, %f3, %f183, %p33; add.s64 %rd55, %rd2, %rd50; ld.global.nc.u8 %rs19, [%rd55]; setp.gt.u16 %p34, %rs19, %rs1; cvt.u32.u16 %r101, %rs19; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p34; selp.b32 %r104, %r102, %r7, %p34; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd56, %r108, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f185, [%rd57]; setp.eq.f32 %p35, %f185, 0f00000000; @%p35 bra BB0_28; mul.f32 %f186, %f2, %f29; fma.rn.f32 %f187, %f1, %f28, %f186; fma.rn.f32 %f188, %f3, %f30, %f187; abs.f32 %f189, %f188; mov.f32 %f190, 0f3F800000; sub.f32 %f191, 
%f190, %f189; mul.f32 %f192, %f191, 0f3F000000; sqrt.rn.f32 %f193, %f192; setp.gt.f32 %p36, %f189, 0f3F11EB85; selp.f32 %f194, %f193, %f189, %p36; mul.f32 %f195, %f194, %f194; mov.f32 %f196, 0f3C94D2E9; mov.f32 %f197, 0f3D53F941; fma.rn.f32 %f198, %f197, %f195, %f196; mov.f32 %f199, 0f3D3F841F; fma.rn.f32 %f200, %f198, %f195, %f199; mov.f32 %f201, 0f3D994929; fma.rn.f32 %f202, %f200, %f195, %f201; mov.f32 %f203, 0f3E2AAB94; fma.rn.f32 %f204, %f202, %f195, %f203; mul.f32 %f205, %f195, %f204; fma.rn.f32 %f206, %f205, %f194, %f194; add.f32 %f207, %f206, %f206; mov.f32 %f208, 0f3FC90FDB; sub.f32 %f209, %f208, %f206; selp.f32 %f210, %f207, %f209, %p36; setp.lt.f32 %p37, %f188, 0f00000000; mov.f32 %f211, 0f40490FDB; sub.f32 %f212, %f211, %f210; selp.f32 %f213, %f212, %f210, %p37; max.f32 %f250, %f250, %f213; BB0_28: add.s32 %r28, %r3, 1; @%p32 bra BB0_30; rem.s32 %r109, %r28, %r34; add.s32 %r110, %r109, %r34; rem.s32 %r127, %r110, %r34; bra.uni BB0_31; BB0_30: add.s32 %r111, %r34, -1; min.s32 %r127, %r28, %r111; BB0_31: mad.lo.s32 %r112, %r127, %r33, %r2; mad.lo.s32 %r113, %r112, %r32, %r1; cvt.s64.s32 %rd58, %r113; mul.wide.s32 %rd59, %r113, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; ld.global.nc.f32 %f214, [%rd60]; ld.global.nc.f32 %f215, [%rd61]; mul.f32 %f216, %f215, %f215; fma.rn.f32 %f217, %f214, %f214, %f216; ld.global.nc.f32 %f218, [%rd62]; fma.rn.f32 %f219, %f218, %f218, %f217; setp.eq.f32 %p39, %f219, 0f00000000; selp.f32 %f33, %f1, %f214, %p39; selp.f32 %f34, %f2, %f215, %p39; selp.f32 %f35, %f3, %f218, %p39; add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs23, [%rd63]; setp.gt.u16 %p40, %rs23, %rs1; cvt.u32.u16 %r114, %rs23; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p40; selp.b32 %r117, %r115, %r7, %p40; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd64, %r121, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 %f220, [%rd65]; 
setp.eq.f32 %p41, %f220, 0f00000000; @%p41 bra BB0_33; mul.f32 %f221, %f2, %f34; fma.rn.f32 %f222, %f1, %f33, %f221; fma.rn.f32 %f223, %f3, %f35, %f222; abs.f32 %f224, %f223; mov.f32 %f225, 0f3F800000; sub.f32 %f226, %f225, %f224; mul.f32 %f227, %f226, 0f3F000000; sqrt.rn.f32 %f228, %f227; setp.gt.f32 %p42, %f224, 0f3F11EB85; selp.f32 %f229, %f228, %f224, %p42; mul.f32 %f230, %f229, %f229; mov.f32 %f231, 0f3C94D2E9; mov.f32 %f232, 0f3D53F941; fma.rn.f32 %f233, %f232, %f230, %f231; mov.f32 %f234, 0f3D3F841F; fma.rn.f32 %f235, %f233, %f230, %f234; mov.f32 %f236, 0f3D994929; fma.rn.f32 %f237, %f235, %f230, %f236; mov.f32 %f238, 0f3E2AAB94; fma.rn.f32 %f239, %f237, %f230, %f238; mul.f32 %f240, %f230, %f239; fma.rn.f32 %f241, %f240, %f229, %f229; add.f32 %f242, %f241, %f241; mov.f32 %f243, 0f3FC90FDB; sub.f32 %f244, %f243, %f241; selp.f32 %f245, %f242, %f244, %p42; setp.lt.f32 %p43, %f223, 0f00000000; mov.f32 %f246, 0f40490FDB; sub.f32 %f247, %f246, %f245; selp.f32 %f248, %f247, %f245, %p43; max.f32 %f250, %f250, %f248; BB0_33: cvta.to.global.u64 %rd66, %rd6; add.s64 %rd68, %rd66, %rd12; st.global.f32 [%rd68], %f250; BB0_34: ret; } ` setmaxangle_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl setmaxangle .visible .entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param .u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, .param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, .param .u32 setmaxangle_param_8, .param .u8 setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<26>; .reg .f32 %f<255>; .reg .b32 %r<128>; .reg .b64 %rd<69>; ld.param.u64 %rd6, [setmaxangle_param_0]; ld.param.u64 %rd7, [setmaxangle_param_1]; ld.param.u64 %rd8, [setmaxangle_param_2]; ld.param.u64 %rd9, [setmaxangle_param_3]; ld.param.u64 %rd10, [setmaxangle_param_4]; ld.param.u64 %rd11, [setmaxangle_param_5]; ld.param.u32 %r32, [setmaxangle_param_6]; 
ld.param.u32 %r33, [setmaxangle_param_7]; ld.param.u32 %r34, [setmaxangle_param_8]; ld.param.u8 %rs5, [setmaxangle_param_9]; cvta.to.global.u64 %rd1, %rd10; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd9; cvta.to.global.u64 %rd4, %rd8; cvta.to.global.u64 %rd5, %rd7; mov.u32 %r35, %ntid.x; mov.u32 %r36, %ctaid.x; mov.u32 %r37, %tid.x; mad.lo.s32 %r1, %r35, %r36, %r37; mov.u32 %r38, %ntid.y; mov.u32 %r39, %ctaid.y; mov.u32 %r40, %tid.y; mad.lo.s32 %r2, %r38, %r39, %r40; mov.u32 %r41, %ntid.z; mov.u32 %r42, %ctaid.z; mov.u32 %r43, %tid.z; mad.lo.s32 %r3, %r41, %r42, %r43; setp.ge.s32 %p1, %r2, %r33; setp.ge.s32 %p2, %r1, %r32; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r34; or.pred %p5, %p3, %p4; @%p5 bra BB0_34; mul.lo.s32 %r4, %r3, %r33; add.s32 %r44, %r4, %r2; mul.lo.s32 %r5, %r44, %r32; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd12, %r6, 4; add.s64 %rd13, %rd5, %rd12; add.s64 %rd14, %rd4, %rd12; add.s64 %rd15, %rd3, %rd12; ld.global.nc.f32 %f1, [%rd13]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f38, %f2, %f2; fma.rn.f32 %f39, %f1, %f1, %f38; ld.global.nc.f32 %f3, [%rd15]; fma.rn.f32 %f40, %f3, %f3, %f39; setp.eq.f32 %p6, %f40, 0f00000000; @%p6 bra BB0_34; cvt.s64.s32 %rd16, %r6; add.s64 %rd17, %rd2, %rd16; ld.global.nc.u8 %rs1, [%rd17]; cvt.u32.u16 %r45, %rs1; and.b32 %r7, %r45, 255; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r46, %r8, %r32; add.s32 %r47, %r46, %r32; rem.s32 %r122, %r47, %r32; bra.uni BB0_5; BB0_4: mov.u32 %r48, 0; max.s32 %r122, %r8, %r48; BB0_5: add.s32 %r49, %r122, %r5; cvt.s64.s32 %rd18, %r49; mul.wide.s32 %rd19, %r49, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd21]; mul.f32 %f42, %f5, %f5; fma.rn.f32 %f43, %f4, %f4, %f42; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f43; add.s64 %rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; setp.gt.u16 %p8, %rs6, %rs1; 
cvt.u32.u16 %r50, %rs6; and.b32 %r51, %r50, 255; selp.b32 %r52, %r7, %r51, %p8; selp.b32 %r53, %r51, %r7, %p8; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd24, %r57, 4; add.s64 %rd25, %rd1, %rd24; ld.global.nc.f32 %f44, [%rd25]; mov.f32 %f250, 0f00000000; setp.eq.f32 %p9, %f44, 0f00000000; @%p9 bra BB0_7; setp.eq.f32 %p10, %f7, 0f00000000; selp.f32 %f45, %f1, %f4, %p10; selp.f32 %f46, %f2, %f5, %p10; mul.f32 %f47, %f2, %f46; fma.rn.f32 %f48, %f1, %f45, %f47; selp.f32 %f49, %f3, %f6, %p10; fma.rn.f32 %f50, %f3, %f49, %f48; abs.f32 %f51, %f50; mov.f32 %f52, 0f3F800000; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, 0f3F000000; sqrt.rn.f32 %f55, %f54; setp.gt.f32 %p11, %f51, 0f3F11EB85; selp.f32 %f56, %f55, %f51, %p11; mul.f32 %f57, %f56, %f56; mov.f32 %f58, 0f3C94D2E9; mov.f32 %f59, 0f3D53F941; fma.rn.f32 %f60, %f59, %f57, %f58; mov.f32 %f61, 0f3D3F841F; fma.rn.f32 %f62, %f60, %f57, %f61; mov.f32 %f63, 0f3D994929; fma.rn.f32 %f64, %f62, %f57, %f63; mov.f32 %f65, 0f3E2AAB94; fma.rn.f32 %f66, %f64, %f57, %f65; mul.f32 %f67, %f57, %f66; fma.rn.f32 %f68, %f67, %f56, %f56; add.f32 %f69, %f68, %f68; mov.f32 %f70, 0f3FC90FDB; sub.f32 %f71, %f70, %f68; selp.f32 %f72, %f69, %f71, %p11; setp.lt.f32 %p12, %f50, 0f00000000; mov.f32 %f73, 0f40490FDB; sub.f32 %f74, %f73, %f72; selp.f32 %f75, %f74, %f72, %p12; mov.f32 %f76, 0f00000000; max.f32 %f250, %f76, %f75; BB0_7: add.s32 %r12, %r1, 1; @%p7 bra BB0_9; rem.s32 %r58, %r12, %r32; add.s32 %r59, %r58, %r32; rem.s32 %r123, %r59, %r32; bra.uni BB0_10; BB0_9: add.s32 %r60, %r32, -1; min.s32 %r123, %r12, %r60; BB0_10: add.s32 %r61, %r123, %r5; cvt.s64.s32 %rd26, %r61; mul.wide.s32 %rd27, %r61, 4; add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f10, [%rd28]; ld.global.nc.f32 %f11, [%rd29]; mul.f32 %f77, %f11, %f11; fma.rn.f32 %f78, %f10, %f10, %f77; ld.global.nc.f32 %f12, [%rd30]; fma.rn.f32 %f13, %f12, %f12, 
%f78; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs9, [%rd31]; setp.gt.u16 %p14, %rs9, %rs1; cvt.u32.u16 %r62, %rs9; and.b32 %r63, %r62, 255; selp.b32 %r64, %r7, %r63, %p14; selp.b32 %r65, %r63, %r7, %p14; add.s32 %r66, %r65, 1; mul.lo.s32 %r67, %r66, %r65; shr.u32 %r68, %r67, 1; add.s32 %r69, %r68, %r64; mul.wide.s32 %rd32, %r69, 4; add.s64 %rd33, %rd1, %rd32; ld.global.nc.f32 %f79, [%rd33]; setp.eq.f32 %p15, %f79, 0f00000000; @%p15 bra BB0_12; setp.eq.f32 %p16, %f13, 0f00000000; selp.f32 %f80, %f1, %f10, %p16; selp.f32 %f81, %f2, %f11, %p16; mul.f32 %f82, %f2, %f81; fma.rn.f32 %f83, %f1, %f80, %f82; selp.f32 %f84, %f3, %f12, %p16; fma.rn.f32 %f85, %f3, %f84, %f83; abs.f32 %f86, %f85; mov.f32 %f87, 0f3F800000; sub.f32 %f88, %f87, %f86; mul.f32 %f89, %f88, 0f3F000000; sqrt.rn.f32 %f90, %f89; setp.gt.f32 %p17, %f86, 0f3F11EB85; selp.f32 %f91, %f90, %f86, %p17; mul.f32 %f92, %f91, %f91; mov.f32 %f93, 0f3C94D2E9; mov.f32 %f94, 0f3D53F941; fma.rn.f32 %f95, %f94, %f92, %f93; mov.f32 %f96, 0f3D3F841F; fma.rn.f32 %f97, %f95, %f92, %f96; mov.f32 %f98, 0f3D994929; fma.rn.f32 %f99, %f97, %f92, %f98; mov.f32 %f100, 0f3E2AAB94; fma.rn.f32 %f101, %f99, %f92, %f100; mul.f32 %f102, %f92, %f101; fma.rn.f32 %f103, %f102, %f91, %f91; add.f32 %f104, %f103, %f103; mov.f32 %f105, 0f3FC90FDB; sub.f32 %f106, %f105, %f103; selp.f32 %f107, %f104, %f106, %p17; setp.lt.f32 %p18, %f85, 0f00000000; mov.f32 %f108, 0f40490FDB; sub.f32 %f109, %f108, %f107; selp.f32 %f110, %f109, %f107, %p18; max.f32 %f250, %f250, %f110; BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r16, %r2, -1; @%p19 bra BB0_14; rem.s32 %r70, %r16, %r33; add.s32 %r71, %r70, %r33; rem.s32 %r124, %r71, %r33; bra.uni BB0_15; BB0_14: mov.u32 %r72, 0; max.s32 %r124, %r16, %r72; BB0_15: add.s32 %r73, %r124, %r4; mad.lo.s32 %r74, %r73, %r32, %r1; cvt.s64.s32 %rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, %rd3, %rd35; ld.global.nc.f32 %f16, [%rd36]; 
ld.global.nc.f32 %f17, [%rd37]; mul.f32 %f111, %f17, %f17; fma.rn.f32 %f112, %f16, %f16, %f111; ld.global.nc.f32 %f18, [%rd38]; fma.rn.f32 %f19, %f18, %f18, %f112; add.s64 %rd39, %rd2, %rd34; ld.global.nc.u8 %rs12, [%rd39]; setp.gt.u16 %p20, %rs12, %rs1; cvt.u32.u16 %r75, %rs12; and.b32 %r76, %r75, 255; selp.b32 %r77, %r7, %r76, %p20; selp.b32 %r78, %r76, %r7, %p20; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; add.s32 %r82, %r81, %r77; mul.wide.s32 %rd40, %r82, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f113, [%rd41]; setp.eq.f32 %p21, %f113, 0f00000000; @%p21 bra BB0_17; setp.eq.f32 %p22, %f19, 0f00000000; selp.f32 %f114, %f1, %f16, %p22; selp.f32 %f115, %f2, %f17, %p22; mul.f32 %f116, %f2, %f115; fma.rn.f32 %f117, %f1, %f114, %f116; selp.f32 %f118, %f3, %f18, %p22; fma.rn.f32 %f119, %f3, %f118, %f117; abs.f32 %f120, %f119; mov.f32 %f121, 0f3F800000; sub.f32 %f122, %f121, %f120; mul.f32 %f123, %f122, 0f3F000000; sqrt.rn.f32 %f124, %f123; setp.gt.f32 %p23, %f120, 0f3F11EB85; selp.f32 %f125, %f124, %f120, %p23; mul.f32 %f126, %f125, %f125; mov.f32 %f127, 0f3C94D2E9; mov.f32 %f128, 0f3D53F941; fma.rn.f32 %f129, %f128, %f126, %f127; mov.f32 %f130, 0f3D3F841F; fma.rn.f32 %f131, %f129, %f126, %f130; mov.f32 %f132, 0f3D994929; fma.rn.f32 %f133, %f131, %f126, %f132; mov.f32 %f134, 0f3E2AAB94; fma.rn.f32 %f135, %f133, %f126, %f134; mul.f32 %f136, %f126, %f135; fma.rn.f32 %f137, %f136, %f125, %f125; add.f32 %f138, %f137, %f137; mov.f32 %f139, 0f3FC90FDB; sub.f32 %f140, %f139, %f137; selp.f32 %f141, %f138, %f140, %p23; setp.lt.f32 %p24, %f119, 0f00000000; mov.f32 %f142, 0f40490FDB; sub.f32 %f143, %f142, %f141; selp.f32 %f144, %f143, %f141, %p24; max.f32 %f250, %f250, %f144; BB0_17: add.s32 %r20, %r2, 1; @%p19 bra BB0_19; rem.s32 %r83, %r20, %r33; add.s32 %r84, %r83, %r33; rem.s32 %r125, %r84, %r33; bra.uni BB0_20; BB0_19: add.s32 %r85, %r33, -1; min.s32 %r125, %r20, %r85; BB0_20: add.s32 %r86, %r125, %r4; mad.lo.s32 %r87, %r86, %r32, %r1; 
cvt.s64.s32 %rd42, %r87; mul.wide.s32 %rd43, %r87, 4; add.s64 %rd44, %rd5, %rd43; add.s64 %rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f22, [%rd44]; ld.global.nc.f32 %f23, [%rd45]; mul.f32 %f145, %f23, %f23; fma.rn.f32 %f146, %f22, %f22, %f145; ld.global.nc.f32 %f24, [%rd46]; fma.rn.f32 %f25, %f24, %f24, %f146; add.s64 %rd47, %rd2, %rd42; ld.global.nc.u8 %rs16, [%rd47]; setp.gt.u16 %p26, %rs16, %rs1; cvt.u32.u16 %r88, %rs16; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p26; selp.b32 %r91, %r89, %r7, %p26; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f147, [%rd49]; setp.eq.f32 %p27, %f147, 0f00000000; @%p27 bra BB0_22; setp.eq.f32 %p28, %f25, 0f00000000; selp.f32 %f148, %f1, %f22, %p28; selp.f32 %f149, %f2, %f23, %p28; mul.f32 %f150, %f2, %f149; fma.rn.f32 %f151, %f1, %f148, %f150; selp.f32 %f152, %f3, %f24, %p28; fma.rn.f32 %f153, %f3, %f152, %f151; abs.f32 %f154, %f153; mov.f32 %f155, 0f3F800000; sub.f32 %f156, %f155, %f154; mul.f32 %f157, %f156, 0f3F000000; sqrt.rn.f32 %f158, %f157; setp.gt.f32 %p29, %f154, 0f3F11EB85; selp.f32 %f159, %f158, %f154, %p29; mul.f32 %f160, %f159, %f159; mov.f32 %f161, 0f3C94D2E9; mov.f32 %f162, 0f3D53F941; fma.rn.f32 %f163, %f162, %f160, %f161; mov.f32 %f164, 0f3D3F841F; fma.rn.f32 %f165, %f163, %f160, %f164; mov.f32 %f166, 0f3D994929; fma.rn.f32 %f167, %f165, %f160, %f166; mov.f32 %f168, 0f3E2AAB94; fma.rn.f32 %f169, %f167, %f160, %f168; mul.f32 %f170, %f160, %f169; fma.rn.f32 %f171, %f170, %f159, %f159; add.f32 %f172, %f171, %f171; mov.f32 %f173, 0f3FC90FDB; sub.f32 %f174, %f173, %f171; selp.f32 %f175, %f172, %f174, %p29; setp.lt.f32 %p30, %f153, 0f00000000; mov.f32 %f176, 0f40490FDB; sub.f32 %f177, %f176, %f175; selp.f32 %f178, %f177, %f175, %p30; max.f32 %f250, %f250, %f178; BB0_22: setp.eq.s32 %p31, %r34, 1; @%p31 bra BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 
0; add.s32 %r24, %r3, -1; @%p32 bra BB0_25; rem.s32 %r96, %r24, %r34; add.s32 %r97, %r96, %r34; rem.s32 %r126, %r97, %r34; bra.uni BB0_26; BB0_25: mov.u32 %r98, 0; max.s32 %r126, %r24, %r98; BB0_26: mad.lo.s32 %r99, %r126, %r33, %r2; mad.lo.s32 %r100, %r99, %r32, %r1; cvt.s64.s32 %rd50, %r100; mul.wide.s32 %rd51, %r100, 4; add.s64 %rd52, %rd5, %rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; ld.global.nc.f32 %f179, [%rd52]; ld.global.nc.f32 %f180, [%rd53]; mul.f32 %f181, %f180, %f180; fma.rn.f32 %f182, %f179, %f179, %f181; ld.global.nc.f32 %f183, [%rd54]; fma.rn.f32 %f184, %f183, %f183, %f182; setp.eq.f32 %p33, %f184, 0f00000000; selp.f32 %f28, %f1, %f179, %p33; selp.f32 %f29, %f2, %f180, %p33; selp.f32 %f30, %f3, %f183, %p33; add.s64 %rd55, %rd2, %rd50; ld.global.nc.u8 %rs19, [%rd55]; setp.gt.u16 %p34, %rs19, %rs1; cvt.u32.u16 %r101, %rs19; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p34; selp.b32 %r104, %r102, %r7, %p34; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd56, %r108, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f185, [%rd57]; setp.eq.f32 %p35, %f185, 0f00000000; @%p35 bra BB0_28; mul.f32 %f186, %f2, %f29; fma.rn.f32 %f187, %f1, %f28, %f186; fma.rn.f32 %f188, %f3, %f30, %f187; abs.f32 %f189, %f188; mov.f32 %f190, 0f3F800000; sub.f32 %f191, %f190, %f189; mul.f32 %f192, %f191, 0f3F000000; sqrt.rn.f32 %f193, %f192; setp.gt.f32 %p36, %f189, 0f3F11EB85; selp.f32 %f194, %f193, %f189, %p36; mul.f32 %f195, %f194, %f194; mov.f32 %f196, 0f3C94D2E9; mov.f32 %f197, 0f3D53F941; fma.rn.f32 %f198, %f197, %f195, %f196; mov.f32 %f199, 0f3D3F841F; fma.rn.f32 %f200, %f198, %f195, %f199; mov.f32 %f201, 0f3D994929; fma.rn.f32 %f202, %f200, %f195, %f201; mov.f32 %f203, 0f3E2AAB94; fma.rn.f32 %f204, %f202, %f195, %f203; mul.f32 %f205, %f195, %f204; fma.rn.f32 %f206, %f205, %f194, %f194; add.f32 %f207, %f206, %f206; mov.f32 %f208, 0f3FC90FDB; sub.f32 %f209, %f208, 
%f206; selp.f32 %f210, %f207, %f209, %p36; setp.lt.f32 %p37, %f188, 0f00000000; mov.f32 %f211, 0f40490FDB; sub.f32 %f212, %f211, %f210; selp.f32 %f213, %f212, %f210, %p37; max.f32 %f250, %f250, %f213; BB0_28: add.s32 %r28, %r3, 1; @%p32 bra BB0_30; rem.s32 %r109, %r28, %r34; add.s32 %r110, %r109, %r34; rem.s32 %r127, %r110, %r34; bra.uni BB0_31; BB0_30: add.s32 %r111, %r34, -1; min.s32 %r127, %r28, %r111; BB0_31: mad.lo.s32 %r112, %r127, %r33, %r2; mad.lo.s32 %r113, %r112, %r32, %r1; cvt.s64.s32 %rd58, %r113; mul.wide.s32 %rd59, %r113, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; ld.global.nc.f32 %f214, [%rd60]; ld.global.nc.f32 %f215, [%rd61]; mul.f32 %f216, %f215, %f215; fma.rn.f32 %f217, %f214, %f214, %f216; ld.global.nc.f32 %f218, [%rd62]; fma.rn.f32 %f219, %f218, %f218, %f217; setp.eq.f32 %p39, %f219, 0f00000000; selp.f32 %f33, %f1, %f214, %p39; selp.f32 %f34, %f2, %f215, %p39; selp.f32 %f35, %f3, %f218, %p39; add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs23, [%rd63]; setp.gt.u16 %p40, %rs23, %rs1; cvt.u32.u16 %r114, %rs23; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p40; selp.b32 %r117, %r115, %r7, %p40; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd64, %r121, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 %f220, [%rd65]; setp.eq.f32 %p41, %f220, 0f00000000; @%p41 bra BB0_33; mul.f32 %f221, %f2, %f34; fma.rn.f32 %f222, %f1, %f33, %f221; fma.rn.f32 %f223, %f3, %f35, %f222; abs.f32 %f224, %f223; mov.f32 %f225, 0f3F800000; sub.f32 %f226, %f225, %f224; mul.f32 %f227, %f226, 0f3F000000; sqrt.rn.f32 %f228, %f227; setp.gt.f32 %p42, %f224, 0f3F11EB85; selp.f32 %f229, %f228, %f224, %p42; mul.f32 %f230, %f229, %f229; mov.f32 %f231, 0f3C94D2E9; mov.f32 %f232, 0f3D53F941; fma.rn.f32 %f233, %f232, %f230, %f231; mov.f32 %f234, 0f3D3F841F; fma.rn.f32 %f235, %f233, %f230, %f234; mov.f32 %f236, 0f3D994929; fma.rn.f32 %f237, %f235, %f230, 
%f236; mov.f32 %f238, 0f3E2AAB94; fma.rn.f32 %f239, %f237, %f230, %f238; mul.f32 %f240, %f230, %f239; fma.rn.f32 %f241, %f240, %f229, %f229; add.f32 %f242, %f241, %f241; mov.f32 %f243, 0f3FC90FDB; sub.f32 %f244, %f243, %f241; selp.f32 %f245, %f242, %f244, %p42; setp.lt.f32 %p43, %f223, 0f00000000; mov.f32 %f246, 0f40490FDB; sub.f32 %f247, %f246, %f245; selp.f32 %f248, %f247, %f245, %p43; max.f32 %f250, %f250, %f248; BB0_33: cvta.to.global.u64 %rd66, %rd6; add.s64 %rd68, %rd66, %rd12; st.global.f32 [%rd68], %f250; BB0_34: ret; } ` setmaxangle_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl setmaxangle .visible .entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param .u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, .param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, .param .u32 setmaxangle_param_8, .param .u8 setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<26>; .reg .f32 %f<255>; .reg .b32 %r<128>; .reg .b64 %rd<69>; ld.param.u64 %rd6, [setmaxangle_param_0]; ld.param.u64 %rd7, [setmaxangle_param_1]; ld.param.u64 %rd8, [setmaxangle_param_2]; ld.param.u64 %rd9, [setmaxangle_param_3]; ld.param.u64 %rd10, [setmaxangle_param_4]; ld.param.u64 %rd11, [setmaxangle_param_5]; ld.param.u32 %r32, [setmaxangle_param_6]; ld.param.u32 %r33, [setmaxangle_param_7]; ld.param.u32 %r34, [setmaxangle_param_8]; ld.param.u8 %rs5, [setmaxangle_param_9]; cvta.to.global.u64 %rd1, %rd10; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd9; cvta.to.global.u64 %rd4, %rd8; cvta.to.global.u64 %rd5, %rd7; mov.u32 %r35, %ntid.x; mov.u32 %r36, %ctaid.x; mov.u32 %r37, %tid.x; mad.lo.s32 %r1, %r35, %r36, %r37; mov.u32 %r38, %ntid.y; mov.u32 %r39, %ctaid.y; mov.u32 %r40, %tid.y; mad.lo.s32 %r2, %r38, %r39, %r40; mov.u32 %r41, %ntid.z; mov.u32 %r42, %ctaid.z; mov.u32 %r43, %tid.z; mad.lo.s32 %r3, %r41, %r42, %r43; setp.ge.s32 %p1, %r2, %r33; 
setp.ge.s32 %p2, %r1, %r32; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r34; or.pred %p5, %p3, %p4; @%p5 bra BB0_34; mul.lo.s32 %r4, %r3, %r33; add.s32 %r44, %r4, %r2; mul.lo.s32 %r5, %r44, %r32; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd12, %r6, 4; add.s64 %rd13, %rd5, %rd12; add.s64 %rd14, %rd4, %rd12; add.s64 %rd15, %rd3, %rd12; ld.global.nc.f32 %f1, [%rd13]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f38, %f2, %f2; fma.rn.f32 %f39, %f1, %f1, %f38; ld.global.nc.f32 %f3, [%rd15]; fma.rn.f32 %f40, %f3, %f3, %f39; setp.eq.f32 %p6, %f40, 0f00000000; @%p6 bra BB0_34; cvt.s64.s32 %rd16, %r6; add.s64 %rd17, %rd2, %rd16; ld.global.nc.u8 %rs1, [%rd17]; cvt.u32.u16 %r45, %rs1; and.b32 %r7, %r45, 255; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r46, %r8, %r32; add.s32 %r47, %r46, %r32; rem.s32 %r122, %r47, %r32; bra.uni BB0_5; BB0_4: mov.u32 %r48, 0; max.s32 %r122, %r8, %r48; BB0_5: add.s32 %r49, %r122, %r5; cvt.s64.s32 %rd18, %r49; mul.wide.s32 %rd19, %r49, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd21]; mul.f32 %f42, %f5, %f5; fma.rn.f32 %f43, %f4, %f4, %f42; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f43; add.s64 %rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; setp.gt.u16 %p8, %rs6, %rs1; cvt.u32.u16 %r50, %rs6; and.b32 %r51, %r50, 255; selp.b32 %r52, %r7, %r51, %p8; selp.b32 %r53, %r51, %r7, %p8; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd24, %r57, 4; add.s64 %rd25, %rd1, %rd24; ld.global.nc.f32 %f44, [%rd25]; mov.f32 %f250, 0f00000000; setp.eq.f32 %p9, %f44, 0f00000000; @%p9 bra BB0_7; setp.eq.f32 %p10, %f7, 0f00000000; selp.f32 %f45, %f1, %f4, %p10; selp.f32 %f46, %f2, %f5, %p10; mul.f32 %f47, %f2, %f46; fma.rn.f32 %f48, %f1, %f45, %f47; selp.f32 %f49, %f3, %f6, %p10; fma.rn.f32 %f50, %f3, %f49, %f48; abs.f32 %f51, %f50; mov.f32 
%f52, 0f3F800000; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, 0f3F000000; sqrt.rn.f32 %f55, %f54; setp.gt.f32 %p11, %f51, 0f3F11EB85; selp.f32 %f56, %f55, %f51, %p11; mul.f32 %f57, %f56, %f56; mov.f32 %f58, 0f3C94D2E9; mov.f32 %f59, 0f3D53F941; fma.rn.f32 %f60, %f59, %f57, %f58; mov.f32 %f61, 0f3D3F841F; fma.rn.f32 %f62, %f60, %f57, %f61; mov.f32 %f63, 0f3D994929; fma.rn.f32 %f64, %f62, %f57, %f63; mov.f32 %f65, 0f3E2AAB94; fma.rn.f32 %f66, %f64, %f57, %f65; mul.f32 %f67, %f57, %f66; fma.rn.f32 %f68, %f67, %f56, %f56; add.f32 %f69, %f68, %f68; mov.f32 %f70, 0f3FC90FDB; sub.f32 %f71, %f70, %f68; selp.f32 %f72, %f69, %f71, %p11; setp.lt.f32 %p12, %f50, 0f00000000; mov.f32 %f73, 0f40490FDB; sub.f32 %f74, %f73, %f72; selp.f32 %f75, %f74, %f72, %p12; mov.f32 %f76, 0f00000000; max.f32 %f250, %f76, %f75; BB0_7: add.s32 %r12, %r1, 1; @%p7 bra BB0_9; rem.s32 %r58, %r12, %r32; add.s32 %r59, %r58, %r32; rem.s32 %r123, %r59, %r32; bra.uni BB0_10; BB0_9: add.s32 %r60, %r32, -1; min.s32 %r123, %r12, %r60; BB0_10: add.s32 %r61, %r123, %r5; cvt.s64.s32 %rd26, %r61; mul.wide.s32 %rd27, %r61, 4; add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f10, [%rd28]; ld.global.nc.f32 %f11, [%rd29]; mul.f32 %f77, %f11, %f11; fma.rn.f32 %f78, %f10, %f10, %f77; ld.global.nc.f32 %f12, [%rd30]; fma.rn.f32 %f13, %f12, %f12, %f78; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs9, [%rd31]; setp.gt.u16 %p14, %rs9, %rs1; cvt.u32.u16 %r62, %rs9; and.b32 %r63, %r62, 255; selp.b32 %r64, %r7, %r63, %p14; selp.b32 %r65, %r63, %r7, %p14; add.s32 %r66, %r65, 1; mul.lo.s32 %r67, %r66, %r65; shr.u32 %r68, %r67, 1; add.s32 %r69, %r68, %r64; mul.wide.s32 %rd32, %r69, 4; add.s64 %rd33, %rd1, %rd32; ld.global.nc.f32 %f79, [%rd33]; setp.eq.f32 %p15, %f79, 0f00000000; @%p15 bra BB0_12; setp.eq.f32 %p16, %f13, 0f00000000; selp.f32 %f80, %f1, %f10, %p16; selp.f32 %f81, %f2, %f11, %p16; mul.f32 %f82, %f2, %f81; fma.rn.f32 %f83, %f1, %f80, %f82; selp.f32 %f84, %f3, 
%f12, %p16; fma.rn.f32 %f85, %f3, %f84, %f83; abs.f32 %f86, %f85; mov.f32 %f87, 0f3F800000; sub.f32 %f88, %f87, %f86; mul.f32 %f89, %f88, 0f3F000000; sqrt.rn.f32 %f90, %f89; setp.gt.f32 %p17, %f86, 0f3F11EB85; selp.f32 %f91, %f90, %f86, %p17; mul.f32 %f92, %f91, %f91; mov.f32 %f93, 0f3C94D2E9; mov.f32 %f94, 0f3D53F941; fma.rn.f32 %f95, %f94, %f92, %f93; mov.f32 %f96, 0f3D3F841F; fma.rn.f32 %f97, %f95, %f92, %f96; mov.f32 %f98, 0f3D994929; fma.rn.f32 %f99, %f97, %f92, %f98; mov.f32 %f100, 0f3E2AAB94; fma.rn.f32 %f101, %f99, %f92, %f100; mul.f32 %f102, %f92, %f101; fma.rn.f32 %f103, %f102, %f91, %f91; add.f32 %f104, %f103, %f103; mov.f32 %f105, 0f3FC90FDB; sub.f32 %f106, %f105, %f103; selp.f32 %f107, %f104, %f106, %p17; setp.lt.f32 %p18, %f85, 0f00000000; mov.f32 %f108, 0f40490FDB; sub.f32 %f109, %f108, %f107; selp.f32 %f110, %f109, %f107, %p18; max.f32 %f250, %f250, %f110; BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r16, %r2, -1; @%p19 bra BB0_14; rem.s32 %r70, %r16, %r33; add.s32 %r71, %r70, %r33; rem.s32 %r124, %r71, %r33; bra.uni BB0_15; BB0_14: mov.u32 %r72, 0; max.s32 %r124, %r16, %r72; BB0_15: add.s32 %r73, %r124, %r4; mad.lo.s32 %r74, %r73, %r32, %r1; cvt.s64.s32 %rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, %rd3, %rd35; ld.global.nc.f32 %f16, [%rd36]; ld.global.nc.f32 %f17, [%rd37]; mul.f32 %f111, %f17, %f17; fma.rn.f32 %f112, %f16, %f16, %f111; ld.global.nc.f32 %f18, [%rd38]; fma.rn.f32 %f19, %f18, %f18, %f112; add.s64 %rd39, %rd2, %rd34; ld.global.nc.u8 %rs12, [%rd39]; setp.gt.u16 %p20, %rs12, %rs1; cvt.u32.u16 %r75, %rs12; and.b32 %r76, %r75, 255; selp.b32 %r77, %r7, %r76, %p20; selp.b32 %r78, %r76, %r7, %p20; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; add.s32 %r82, %r81, %r77; mul.wide.s32 %rd40, %r82, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f113, [%rd41]; setp.eq.f32 %p21, %f113, 0f00000000; @%p21 bra BB0_17; setp.eq.f32 
%p22, %f19, 0f00000000; selp.f32 %f114, %f1, %f16, %p22; selp.f32 %f115, %f2, %f17, %p22; mul.f32 %f116, %f2, %f115; fma.rn.f32 %f117, %f1, %f114, %f116; selp.f32 %f118, %f3, %f18, %p22; fma.rn.f32 %f119, %f3, %f118, %f117; abs.f32 %f120, %f119; mov.f32 %f121, 0f3F800000; sub.f32 %f122, %f121, %f120; mul.f32 %f123, %f122, 0f3F000000; sqrt.rn.f32 %f124, %f123; setp.gt.f32 %p23, %f120, 0f3F11EB85; selp.f32 %f125, %f124, %f120, %p23; mul.f32 %f126, %f125, %f125; mov.f32 %f127, 0f3C94D2E9; mov.f32 %f128, 0f3D53F941; fma.rn.f32 %f129, %f128, %f126, %f127; mov.f32 %f130, 0f3D3F841F; fma.rn.f32 %f131, %f129, %f126, %f130; mov.f32 %f132, 0f3D994929; fma.rn.f32 %f133, %f131, %f126, %f132; mov.f32 %f134, 0f3E2AAB94; fma.rn.f32 %f135, %f133, %f126, %f134; mul.f32 %f136, %f126, %f135; fma.rn.f32 %f137, %f136, %f125, %f125; add.f32 %f138, %f137, %f137; mov.f32 %f139, 0f3FC90FDB; sub.f32 %f140, %f139, %f137; selp.f32 %f141, %f138, %f140, %p23; setp.lt.f32 %p24, %f119, 0f00000000; mov.f32 %f142, 0f40490FDB; sub.f32 %f143, %f142, %f141; selp.f32 %f144, %f143, %f141, %p24; max.f32 %f250, %f250, %f144; BB0_17: add.s32 %r20, %r2, 1; @%p19 bra BB0_19; rem.s32 %r83, %r20, %r33; add.s32 %r84, %r83, %r33; rem.s32 %r125, %r84, %r33; bra.uni BB0_20; BB0_19: add.s32 %r85, %r33, -1; min.s32 %r125, %r20, %r85; BB0_20: add.s32 %r86, %r125, %r4; mad.lo.s32 %r87, %r86, %r32, %r1; cvt.s64.s32 %rd42, %r87; mul.wide.s32 %rd43, %r87, 4; add.s64 %rd44, %rd5, %rd43; add.s64 %rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f22, [%rd44]; ld.global.nc.f32 %f23, [%rd45]; mul.f32 %f145, %f23, %f23; fma.rn.f32 %f146, %f22, %f22, %f145; ld.global.nc.f32 %f24, [%rd46]; fma.rn.f32 %f25, %f24, %f24, %f146; add.s64 %rd47, %rd2, %rd42; ld.global.nc.u8 %rs16, [%rd47]; setp.gt.u16 %p26, %rs16, %rs1; cvt.u32.u16 %r88, %rs16; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p26; selp.b32 %r91, %r89, %r7, %p26; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, 
%r94, %r90; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f147, [%rd49]; setp.eq.f32 %p27, %f147, 0f00000000; @%p27 bra BB0_22; setp.eq.f32 %p28, %f25, 0f00000000; selp.f32 %f148, %f1, %f22, %p28; selp.f32 %f149, %f2, %f23, %p28; mul.f32 %f150, %f2, %f149; fma.rn.f32 %f151, %f1, %f148, %f150; selp.f32 %f152, %f3, %f24, %p28; fma.rn.f32 %f153, %f3, %f152, %f151; abs.f32 %f154, %f153; mov.f32 %f155, 0f3F800000; sub.f32 %f156, %f155, %f154; mul.f32 %f157, %f156, 0f3F000000; sqrt.rn.f32 %f158, %f157; setp.gt.f32 %p29, %f154, 0f3F11EB85; selp.f32 %f159, %f158, %f154, %p29; mul.f32 %f160, %f159, %f159; mov.f32 %f161, 0f3C94D2E9; mov.f32 %f162, 0f3D53F941; fma.rn.f32 %f163, %f162, %f160, %f161; mov.f32 %f164, 0f3D3F841F; fma.rn.f32 %f165, %f163, %f160, %f164; mov.f32 %f166, 0f3D994929; fma.rn.f32 %f167, %f165, %f160, %f166; mov.f32 %f168, 0f3E2AAB94; fma.rn.f32 %f169, %f167, %f160, %f168; mul.f32 %f170, %f160, %f169; fma.rn.f32 %f171, %f170, %f159, %f159; add.f32 %f172, %f171, %f171; mov.f32 %f173, 0f3FC90FDB; sub.f32 %f174, %f173, %f171; selp.f32 %f175, %f172, %f174, %p29; setp.lt.f32 %p30, %f153, 0f00000000; mov.f32 %f176, 0f40490FDB; sub.f32 %f177, %f176, %f175; selp.f32 %f178, %f177, %f175, %p30; max.f32 %f250, %f250, %f178; BB0_22: setp.eq.s32 %p31, %r34, 1; @%p31 bra BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 0; add.s32 %r24, %r3, -1; @%p32 bra BB0_25; rem.s32 %r96, %r24, %r34; add.s32 %r97, %r96, %r34; rem.s32 %r126, %r97, %r34; bra.uni BB0_26; BB0_25: mov.u32 %r98, 0; max.s32 %r126, %r24, %r98; BB0_26: mad.lo.s32 %r99, %r126, %r33, %r2; mad.lo.s32 %r100, %r99, %r32, %r1; cvt.s64.s32 %rd50, %r100; mul.wide.s32 %rd51, %r100, 4; add.s64 %rd52, %rd5, %rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; ld.global.nc.f32 %f179, [%rd52]; ld.global.nc.f32 %f180, [%rd53]; mul.f32 %f181, %f180, %f180; fma.rn.f32 %f182, %f179, %f179, %f181; ld.global.nc.f32 %f183, [%rd54]; fma.rn.f32 %f184, %f183, %f183, %f182; setp.eq.f32 
%p33, %f184, 0f00000000; selp.f32 %f28, %f1, %f179, %p33; selp.f32 %f29, %f2, %f180, %p33; selp.f32 %f30, %f3, %f183, %p33; add.s64 %rd55, %rd2, %rd50; ld.global.nc.u8 %rs19, [%rd55]; setp.gt.u16 %p34, %rs19, %rs1; cvt.u32.u16 %r101, %rs19; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p34; selp.b32 %r104, %r102, %r7, %p34; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd56, %r108, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f185, [%rd57]; setp.eq.f32 %p35, %f185, 0f00000000; @%p35 bra BB0_28; mul.f32 %f186, %f2, %f29; fma.rn.f32 %f187, %f1, %f28, %f186; fma.rn.f32 %f188, %f3, %f30, %f187; abs.f32 %f189, %f188; mov.f32 %f190, 0f3F800000; sub.f32 %f191, %f190, %f189; mul.f32 %f192, %f191, 0f3F000000; sqrt.rn.f32 %f193, %f192; setp.gt.f32 %p36, %f189, 0f3F11EB85; selp.f32 %f194, %f193, %f189, %p36; mul.f32 %f195, %f194, %f194; mov.f32 %f196, 0f3C94D2E9; mov.f32 %f197, 0f3D53F941; fma.rn.f32 %f198, %f197, %f195, %f196; mov.f32 %f199, 0f3D3F841F; fma.rn.f32 %f200, %f198, %f195, %f199; mov.f32 %f201, 0f3D994929; fma.rn.f32 %f202, %f200, %f195, %f201; mov.f32 %f203, 0f3E2AAB94; fma.rn.f32 %f204, %f202, %f195, %f203; mul.f32 %f205, %f195, %f204; fma.rn.f32 %f206, %f205, %f194, %f194; add.f32 %f207, %f206, %f206; mov.f32 %f208, 0f3FC90FDB; sub.f32 %f209, %f208, %f206; selp.f32 %f210, %f207, %f209, %p36; setp.lt.f32 %p37, %f188, 0f00000000; mov.f32 %f211, 0f40490FDB; sub.f32 %f212, %f211, %f210; selp.f32 %f213, %f212, %f210, %p37; max.f32 %f250, %f250, %f213; BB0_28: add.s32 %r28, %r3, 1; @%p32 bra BB0_30; rem.s32 %r109, %r28, %r34; add.s32 %r110, %r109, %r34; rem.s32 %r127, %r110, %r34; bra.uni BB0_31; BB0_30: add.s32 %r111, %r34, -1; min.s32 %r127, %r28, %r111; BB0_31: mad.lo.s32 %r112, %r127, %r33, %r2; mad.lo.s32 %r113, %r112, %r32, %r1; cvt.s64.s32 %rd58, %r113; mul.wide.s32 %rd59, %r113, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; 
ld.global.nc.f32 %f214, [%rd60]; ld.global.nc.f32 %f215, [%rd61]; mul.f32 %f216, %f215, %f215; fma.rn.f32 %f217, %f214, %f214, %f216; ld.global.nc.f32 %f218, [%rd62]; fma.rn.f32 %f219, %f218, %f218, %f217; setp.eq.f32 %p39, %f219, 0f00000000; selp.f32 %f33, %f1, %f214, %p39; selp.f32 %f34, %f2, %f215, %p39; selp.f32 %f35, %f3, %f218, %p39; add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs23, [%rd63]; setp.gt.u16 %p40, %rs23, %rs1; cvt.u32.u16 %r114, %rs23; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p40; selp.b32 %r117, %r115, %r7, %p40; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd64, %r121, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 %f220, [%rd65]; setp.eq.f32 %p41, %f220, 0f00000000; @%p41 bra BB0_33; mul.f32 %f221, %f2, %f34; fma.rn.f32 %f222, %f1, %f33, %f221; fma.rn.f32 %f223, %f3, %f35, %f222; abs.f32 %f224, %f223; mov.f32 %f225, 0f3F800000; sub.f32 %f226, %f225, %f224; mul.f32 %f227, %f226, 0f3F000000; sqrt.rn.f32 %f228, %f227; setp.gt.f32 %p42, %f224, 0f3F11EB85; selp.f32 %f229, %f228, %f224, %p42; mul.f32 %f230, %f229, %f229; mov.f32 %f231, 0f3C94D2E9; mov.f32 %f232, 0f3D53F941; fma.rn.f32 %f233, %f232, %f230, %f231; mov.f32 %f234, 0f3D3F841F; fma.rn.f32 %f235, %f233, %f230, %f234; mov.f32 %f236, 0f3D994929; fma.rn.f32 %f237, %f235, %f230, %f236; mov.f32 %f238, 0f3E2AAB94; fma.rn.f32 %f239, %f237, %f230, %f238; mul.f32 %f240, %f230, %f239; fma.rn.f32 %f241, %f240, %f229, %f229; add.f32 %f242, %f241, %f241; mov.f32 %f243, 0f3FC90FDB; sub.f32 %f244, %f243, %f241; selp.f32 %f245, %f242, %f244, %p42; setp.lt.f32 %p43, %f223, 0f00000000; mov.f32 %f246, 0f40490FDB; sub.f32 %f247, %f246, %f245; selp.f32 %f248, %f247, %f245, %p43; max.f32 %f250, %f250, %f248; BB0_33: cvta.to.global.u64 %rd66, %rd6; add.s64 %rd68, %rd66, %rd12; st.global.f32 [%rd68], %f250; BB0_34: ret; } ` setmaxangle_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl setmaxangle 
.visible .entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param .u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, .param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, .param .u32 setmaxangle_param_8, .param .u8 setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<26>; .reg .f32 %f<255>; .reg .b32 %r<128>; .reg .b64 %rd<69>; ld.param.u64 %rd6, [setmaxangle_param_0]; ld.param.u64 %rd7, [setmaxangle_param_1]; ld.param.u64 %rd8, [setmaxangle_param_2]; ld.param.u64 %rd9, [setmaxangle_param_3]; ld.param.u64 %rd10, [setmaxangle_param_4]; ld.param.u64 %rd11, [setmaxangle_param_5]; ld.param.u32 %r32, [setmaxangle_param_6]; ld.param.u32 %r33, [setmaxangle_param_7]; ld.param.u32 %r34, [setmaxangle_param_8]; ld.param.u8 %rs5, [setmaxangle_param_9]; cvta.to.global.u64 %rd1, %rd10; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd9; cvta.to.global.u64 %rd4, %rd8; cvta.to.global.u64 %rd5, %rd7; mov.u32 %r35, %ntid.x; mov.u32 %r36, %ctaid.x; mov.u32 %r37, %tid.x; mad.lo.s32 %r1, %r35, %r36, %r37; mov.u32 %r38, %ntid.y; mov.u32 %r39, %ctaid.y; mov.u32 %r40, %tid.y; mad.lo.s32 %r2, %r38, %r39, %r40; mov.u32 %r41, %ntid.z; mov.u32 %r42, %ctaid.z; mov.u32 %r43, %tid.z; mad.lo.s32 %r3, %r41, %r42, %r43; setp.ge.s32 %p1, %r2, %r33; setp.ge.s32 %p2, %r1, %r32; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r34; or.pred %p5, %p3, %p4; @%p5 bra BB0_34; mul.lo.s32 %r4, %r3, %r33; add.s32 %r44, %r4, %r2; mul.lo.s32 %r5, %r44, %r32; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd12, %r6, 4; add.s64 %rd13, %rd5, %rd12; add.s64 %rd14, %rd4, %rd12; add.s64 %rd15, %rd3, %rd12; ld.global.nc.f32 %f1, [%rd13]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f38, %f2, %f2; fma.rn.f32 %f39, %f1, %f1, %f38; ld.global.nc.f32 %f3, [%rd15]; fma.rn.f32 %f40, %f3, %f3, %f39; setp.eq.f32 %p6, %f40, 0f00000000; @%p6 bra BB0_34; cvt.s64.s32 %rd16, %r6; add.s64 %rd17, %rd2, %rd16; 
ld.global.nc.u8 %rs1, [%rd17]; cvt.u32.u16 %r45, %rs1; and.b32 %r7, %r45, 255; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r46, %r8, %r32; add.s32 %r47, %r46, %r32; rem.s32 %r122, %r47, %r32; bra.uni BB0_5; BB0_4: mov.u32 %r48, 0; max.s32 %r122, %r8, %r48; BB0_5: add.s32 %r49, %r122, %r5; cvt.s64.s32 %rd18, %r49; mul.wide.s32 %rd19, %r49, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd21]; mul.f32 %f42, %f5, %f5; fma.rn.f32 %f43, %f4, %f4, %f42; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f43; add.s64 %rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; setp.gt.u16 %p8, %rs6, %rs1; cvt.u32.u16 %r50, %rs6; and.b32 %r51, %r50, 255; selp.b32 %r52, %r7, %r51, %p8; selp.b32 %r53, %r51, %r7, %p8; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd24, %r57, 4; add.s64 %rd25, %rd1, %rd24; ld.global.nc.f32 %f44, [%rd25]; mov.f32 %f250, 0f00000000; setp.eq.f32 %p9, %f44, 0f00000000; @%p9 bra BB0_7; setp.eq.f32 %p10, %f7, 0f00000000; selp.f32 %f45, %f1, %f4, %p10; selp.f32 %f46, %f2, %f5, %p10; mul.f32 %f47, %f2, %f46; fma.rn.f32 %f48, %f1, %f45, %f47; selp.f32 %f49, %f3, %f6, %p10; fma.rn.f32 %f50, %f3, %f49, %f48; abs.f32 %f51, %f50; mov.f32 %f52, 0f3F800000; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, 0f3F000000; sqrt.rn.f32 %f55, %f54; setp.gt.f32 %p11, %f51, 0f3F11EB85; selp.f32 %f56, %f55, %f51, %p11; mul.f32 %f57, %f56, %f56; mov.f32 %f58, 0f3C94D2E9; mov.f32 %f59, 0f3D53F941; fma.rn.f32 %f60, %f59, %f57, %f58; mov.f32 %f61, 0f3D3F841F; fma.rn.f32 %f62, %f60, %f57, %f61; mov.f32 %f63, 0f3D994929; fma.rn.f32 %f64, %f62, %f57, %f63; mov.f32 %f65, 0f3E2AAB94; fma.rn.f32 %f66, %f64, %f57, %f65; mul.f32 %f67, %f57, %f66; fma.rn.f32 %f68, %f67, %f56, %f56; add.f32 %f69, %f68, %f68; mov.f32 %f70, 0f3FC90FDB; sub.f32 %f71, %f70, %f68; selp.f32 %f72, 
%f69, %f71, %p11; setp.lt.f32 %p12, %f50, 0f00000000; mov.f32 %f73, 0f40490FDB; sub.f32 %f74, %f73, %f72; selp.f32 %f75, %f74, %f72, %p12; mov.f32 %f76, 0f00000000; max.f32 %f250, %f76, %f75; BB0_7: add.s32 %r12, %r1, 1; @%p7 bra BB0_9; rem.s32 %r58, %r12, %r32; add.s32 %r59, %r58, %r32; rem.s32 %r123, %r59, %r32; bra.uni BB0_10; BB0_9: add.s32 %r60, %r32, -1; min.s32 %r123, %r12, %r60; BB0_10: add.s32 %r61, %r123, %r5; cvt.s64.s32 %rd26, %r61; mul.wide.s32 %rd27, %r61, 4; add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f10, [%rd28]; ld.global.nc.f32 %f11, [%rd29]; mul.f32 %f77, %f11, %f11; fma.rn.f32 %f78, %f10, %f10, %f77; ld.global.nc.f32 %f12, [%rd30]; fma.rn.f32 %f13, %f12, %f12, %f78; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs9, [%rd31]; setp.gt.u16 %p14, %rs9, %rs1; cvt.u32.u16 %r62, %rs9; and.b32 %r63, %r62, 255; selp.b32 %r64, %r7, %r63, %p14; selp.b32 %r65, %r63, %r7, %p14; add.s32 %r66, %r65, 1; mul.lo.s32 %r67, %r66, %r65; shr.u32 %r68, %r67, 1; add.s32 %r69, %r68, %r64; mul.wide.s32 %rd32, %r69, 4; add.s64 %rd33, %rd1, %rd32; ld.global.nc.f32 %f79, [%rd33]; setp.eq.f32 %p15, %f79, 0f00000000; @%p15 bra BB0_12; setp.eq.f32 %p16, %f13, 0f00000000; selp.f32 %f80, %f1, %f10, %p16; selp.f32 %f81, %f2, %f11, %p16; mul.f32 %f82, %f2, %f81; fma.rn.f32 %f83, %f1, %f80, %f82; selp.f32 %f84, %f3, %f12, %p16; fma.rn.f32 %f85, %f3, %f84, %f83; abs.f32 %f86, %f85; mov.f32 %f87, 0f3F800000; sub.f32 %f88, %f87, %f86; mul.f32 %f89, %f88, 0f3F000000; sqrt.rn.f32 %f90, %f89; setp.gt.f32 %p17, %f86, 0f3F11EB85; selp.f32 %f91, %f90, %f86, %p17; mul.f32 %f92, %f91, %f91; mov.f32 %f93, 0f3C94D2E9; mov.f32 %f94, 0f3D53F941; fma.rn.f32 %f95, %f94, %f92, %f93; mov.f32 %f96, 0f3D3F841F; fma.rn.f32 %f97, %f95, %f92, %f96; mov.f32 %f98, 0f3D994929; fma.rn.f32 %f99, %f97, %f92, %f98; mov.f32 %f100, 0f3E2AAB94; fma.rn.f32 %f101, %f99, %f92, %f100; mul.f32 %f102, %f92, %f101; fma.rn.f32 %f103, %f102, %f91, %f91; add.f32 
%f104, %f103, %f103; mov.f32 %f105, 0f3FC90FDB; sub.f32 %f106, %f105, %f103; selp.f32 %f107, %f104, %f106, %p17; setp.lt.f32 %p18, %f85, 0f00000000; mov.f32 %f108, 0f40490FDB; sub.f32 %f109, %f108, %f107; selp.f32 %f110, %f109, %f107, %p18; max.f32 %f250, %f250, %f110; BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r16, %r2, -1; @%p19 bra BB0_14; rem.s32 %r70, %r16, %r33; add.s32 %r71, %r70, %r33; rem.s32 %r124, %r71, %r33; bra.uni BB0_15; BB0_14: mov.u32 %r72, 0; max.s32 %r124, %r16, %r72; BB0_15: add.s32 %r73, %r124, %r4; mad.lo.s32 %r74, %r73, %r32, %r1; cvt.s64.s32 %rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, %rd3, %rd35; ld.global.nc.f32 %f16, [%rd36]; ld.global.nc.f32 %f17, [%rd37]; mul.f32 %f111, %f17, %f17; fma.rn.f32 %f112, %f16, %f16, %f111; ld.global.nc.f32 %f18, [%rd38]; fma.rn.f32 %f19, %f18, %f18, %f112; add.s64 %rd39, %rd2, %rd34; ld.global.nc.u8 %rs12, [%rd39]; setp.gt.u16 %p20, %rs12, %rs1; cvt.u32.u16 %r75, %rs12; and.b32 %r76, %r75, 255; selp.b32 %r77, %r7, %r76, %p20; selp.b32 %r78, %r76, %r7, %p20; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; add.s32 %r82, %r81, %r77; mul.wide.s32 %rd40, %r82, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f113, [%rd41]; setp.eq.f32 %p21, %f113, 0f00000000; @%p21 bra BB0_17; setp.eq.f32 %p22, %f19, 0f00000000; selp.f32 %f114, %f1, %f16, %p22; selp.f32 %f115, %f2, %f17, %p22; mul.f32 %f116, %f2, %f115; fma.rn.f32 %f117, %f1, %f114, %f116; selp.f32 %f118, %f3, %f18, %p22; fma.rn.f32 %f119, %f3, %f118, %f117; abs.f32 %f120, %f119; mov.f32 %f121, 0f3F800000; sub.f32 %f122, %f121, %f120; mul.f32 %f123, %f122, 0f3F000000; sqrt.rn.f32 %f124, %f123; setp.gt.f32 %p23, %f120, 0f3F11EB85; selp.f32 %f125, %f124, %f120, %p23; mul.f32 %f126, %f125, %f125; mov.f32 %f127, 0f3C94D2E9; mov.f32 %f128, 0f3D53F941; fma.rn.f32 %f129, %f128, %f126, %f127; mov.f32 %f130, 0f3D3F841F; fma.rn.f32 %f131, %f129, %f126, 
%f130; mov.f32 %f132, 0f3D994929; fma.rn.f32 %f133, %f131, %f126, %f132; mov.f32 %f134, 0f3E2AAB94; fma.rn.f32 %f135, %f133, %f126, %f134; mul.f32 %f136, %f126, %f135; fma.rn.f32 %f137, %f136, %f125, %f125; add.f32 %f138, %f137, %f137; mov.f32 %f139, 0f3FC90FDB; sub.f32 %f140, %f139, %f137; selp.f32 %f141, %f138, %f140, %p23; setp.lt.f32 %p24, %f119, 0f00000000; mov.f32 %f142, 0f40490FDB; sub.f32 %f143, %f142, %f141; selp.f32 %f144, %f143, %f141, %p24; max.f32 %f250, %f250, %f144; BB0_17: add.s32 %r20, %r2, 1; @%p19 bra BB0_19; rem.s32 %r83, %r20, %r33; add.s32 %r84, %r83, %r33; rem.s32 %r125, %r84, %r33; bra.uni BB0_20; BB0_19: add.s32 %r85, %r33, -1; min.s32 %r125, %r20, %r85; BB0_20: add.s32 %r86, %r125, %r4; mad.lo.s32 %r87, %r86, %r32, %r1; cvt.s64.s32 %rd42, %r87; mul.wide.s32 %rd43, %r87, 4; add.s64 %rd44, %rd5, %rd43; add.s64 %rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f22, [%rd44]; ld.global.nc.f32 %f23, [%rd45]; mul.f32 %f145, %f23, %f23; fma.rn.f32 %f146, %f22, %f22, %f145; ld.global.nc.f32 %f24, [%rd46]; fma.rn.f32 %f25, %f24, %f24, %f146; add.s64 %rd47, %rd2, %rd42; ld.global.nc.u8 %rs16, [%rd47]; setp.gt.u16 %p26, %rs16, %rs1; cvt.u32.u16 %r88, %rs16; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p26; selp.b32 %r91, %r89, %r7, %p26; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f147, [%rd49]; setp.eq.f32 %p27, %f147, 0f00000000; @%p27 bra BB0_22; setp.eq.f32 %p28, %f25, 0f00000000; selp.f32 %f148, %f1, %f22, %p28; selp.f32 %f149, %f2, %f23, %p28; mul.f32 %f150, %f2, %f149; fma.rn.f32 %f151, %f1, %f148, %f150; selp.f32 %f152, %f3, %f24, %p28; fma.rn.f32 %f153, %f3, %f152, %f151; abs.f32 %f154, %f153; mov.f32 %f155, 0f3F800000; sub.f32 %f156, %f155, %f154; mul.f32 %f157, %f156, 0f3F000000; sqrt.rn.f32 %f158, %f157; setp.gt.f32 %p29, %f154, 0f3F11EB85; selp.f32 %f159, %f158, %f154, %p29; mul.f32 
%f160, %f159, %f159; mov.f32 %f161, 0f3C94D2E9; mov.f32 %f162, 0f3D53F941; fma.rn.f32 %f163, %f162, %f160, %f161; mov.f32 %f164, 0f3D3F841F; fma.rn.f32 %f165, %f163, %f160, %f164; mov.f32 %f166, 0f3D994929; fma.rn.f32 %f167, %f165, %f160, %f166; mov.f32 %f168, 0f3E2AAB94; fma.rn.f32 %f169, %f167, %f160, %f168; mul.f32 %f170, %f160, %f169; fma.rn.f32 %f171, %f170, %f159, %f159; add.f32 %f172, %f171, %f171; mov.f32 %f173, 0f3FC90FDB; sub.f32 %f174, %f173, %f171; selp.f32 %f175, %f172, %f174, %p29; setp.lt.f32 %p30, %f153, 0f00000000; mov.f32 %f176, 0f40490FDB; sub.f32 %f177, %f176, %f175; selp.f32 %f178, %f177, %f175, %p30; max.f32 %f250, %f250, %f178; BB0_22: setp.eq.s32 %p31, %r34, 1; @%p31 bra BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 0; add.s32 %r24, %r3, -1; @%p32 bra BB0_25; rem.s32 %r96, %r24, %r34; add.s32 %r97, %r96, %r34; rem.s32 %r126, %r97, %r34; bra.uni BB0_26; BB0_25: mov.u32 %r98, 0; max.s32 %r126, %r24, %r98; BB0_26: mad.lo.s32 %r99, %r126, %r33, %r2; mad.lo.s32 %r100, %r99, %r32, %r1; cvt.s64.s32 %rd50, %r100; mul.wide.s32 %rd51, %r100, 4; add.s64 %rd52, %rd5, %rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; ld.global.nc.f32 %f179, [%rd52]; ld.global.nc.f32 %f180, [%rd53]; mul.f32 %f181, %f180, %f180; fma.rn.f32 %f182, %f179, %f179, %f181; ld.global.nc.f32 %f183, [%rd54]; fma.rn.f32 %f184, %f183, %f183, %f182; setp.eq.f32 %p33, %f184, 0f00000000; selp.f32 %f28, %f1, %f179, %p33; selp.f32 %f29, %f2, %f180, %p33; selp.f32 %f30, %f3, %f183, %p33; add.s64 %rd55, %rd2, %rd50; ld.global.nc.u8 %rs19, [%rd55]; setp.gt.u16 %p34, %rs19, %rs1; cvt.u32.u16 %r101, %rs19; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p34; selp.b32 %r104, %r102, %r7, %p34; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd56, %r108, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f185, [%rd57]; setp.eq.f32 %p35, %f185, 0f00000000; @%p35 bra BB0_28; mul.f32 %f186, %f2, 
%f29; fma.rn.f32 %f187, %f1, %f28, %f186; fma.rn.f32 %f188, %f3, %f30, %f187; abs.f32 %f189, %f188; mov.f32 %f190, 0f3F800000; sub.f32 %f191, %f190, %f189; mul.f32 %f192, %f191, 0f3F000000; sqrt.rn.f32 %f193, %f192; setp.gt.f32 %p36, %f189, 0f3F11EB85; selp.f32 %f194, %f193, %f189, %p36; mul.f32 %f195, %f194, %f194; mov.f32 %f196, 0f3C94D2E9; mov.f32 %f197, 0f3D53F941; fma.rn.f32 %f198, %f197, %f195, %f196; mov.f32 %f199, 0f3D3F841F; fma.rn.f32 %f200, %f198, %f195, %f199; mov.f32 %f201, 0f3D994929; fma.rn.f32 %f202, %f200, %f195, %f201; mov.f32 %f203, 0f3E2AAB94; fma.rn.f32 %f204, %f202, %f195, %f203; mul.f32 %f205, %f195, %f204; fma.rn.f32 %f206, %f205, %f194, %f194; add.f32 %f207, %f206, %f206; mov.f32 %f208, 0f3FC90FDB; sub.f32 %f209, %f208, %f206; selp.f32 %f210, %f207, %f209, %p36; setp.lt.f32 %p37, %f188, 0f00000000; mov.f32 %f211, 0f40490FDB; sub.f32 %f212, %f211, %f210; selp.f32 %f213, %f212, %f210, %p37; max.f32 %f250, %f250, %f213; BB0_28: add.s32 %r28, %r3, 1; @%p32 bra BB0_30; rem.s32 %r109, %r28, %r34; add.s32 %r110, %r109, %r34; rem.s32 %r127, %r110, %r34; bra.uni BB0_31; BB0_30: add.s32 %r111, %r34, -1; min.s32 %r127, %r28, %r111; BB0_31: mad.lo.s32 %r112, %r127, %r33, %r2; mad.lo.s32 %r113, %r112, %r32, %r1; cvt.s64.s32 %rd58, %r113; mul.wide.s32 %rd59, %r113, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; ld.global.nc.f32 %f214, [%rd60]; ld.global.nc.f32 %f215, [%rd61]; mul.f32 %f216, %f215, %f215; fma.rn.f32 %f217, %f214, %f214, %f216; ld.global.nc.f32 %f218, [%rd62]; fma.rn.f32 %f219, %f218, %f218, %f217; setp.eq.f32 %p39, %f219, 0f00000000; selp.f32 %f33, %f1, %f214, %p39; selp.f32 %f34, %f2, %f215, %p39; selp.f32 %f35, %f3, %f218, %p39; add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs23, [%rd63]; setp.gt.u16 %p40, %rs23, %rs1; cvt.u32.u16 %r114, %rs23; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p40; selp.b32 %r117, %r115, %r7, %p40; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; 
shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd64, %r121, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 %f220, [%rd65]; setp.eq.f32 %p41, %f220, 0f00000000; @%p41 bra BB0_33; mul.f32 %f221, %f2, %f34; fma.rn.f32 %f222, %f1, %f33, %f221; fma.rn.f32 %f223, %f3, %f35, %f222; abs.f32 %f224, %f223; mov.f32 %f225, 0f3F800000; sub.f32 %f226, %f225, %f224; mul.f32 %f227, %f226, 0f3F000000; sqrt.rn.f32 %f228, %f227; setp.gt.f32 %p42, %f224, 0f3F11EB85; selp.f32 %f229, %f228, %f224, %p42; mul.f32 %f230, %f229, %f229; mov.f32 %f231, 0f3C94D2E9; mov.f32 %f232, 0f3D53F941; fma.rn.f32 %f233, %f232, %f230, %f231; mov.f32 %f234, 0f3D3F841F; fma.rn.f32 %f235, %f233, %f230, %f234; mov.f32 %f236, 0f3D994929; fma.rn.f32 %f237, %f235, %f230, %f236; mov.f32 %f238, 0f3E2AAB94; fma.rn.f32 %f239, %f237, %f230, %f238; mul.f32 %f240, %f230, %f239; fma.rn.f32 %f241, %f240, %f229, %f229; add.f32 %f242, %f241, %f241; mov.f32 %f243, 0f3FC90FDB; sub.f32 %f244, %f243, %f241; selp.f32 %f245, %f242, %f244, %p42; setp.lt.f32 %p43, %f223, 0f00000000; mov.f32 %f246, 0f40490FDB; sub.f32 %f247, %f246, %f245; selp.f32 %f248, %f247, %f245, %p43; max.f32 %f250, %f250, %f248; BB0_33: cvta.to.global.u64 %rd66, %rd6; add.s64 %rd68, %rd66, %rd12; st.global.f32 [%rd68], %f250; BB0_34: ret; } ` setmaxangle_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl setmaxangle .visible .entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param .u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, .param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, .param .u32 setmaxangle_param_8, .param .u8 setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<26>; .reg .f32 %f<255>; .reg .b32 %r<128>; .reg .b64 %rd<69>; ld.param.u64 %rd6, [setmaxangle_param_0]; ld.param.u64 %rd7, [setmaxangle_param_1]; ld.param.u64 %rd8, [setmaxangle_param_2]; ld.param.u64 %rd9, 
[setmaxangle_param_3]; ld.param.u64 %rd10, [setmaxangle_param_4]; ld.param.u64 %rd11, [setmaxangle_param_5]; ld.param.u32 %r32, [setmaxangle_param_6]; ld.param.u32 %r33, [setmaxangle_param_7]; ld.param.u32 %r34, [setmaxangle_param_8]; ld.param.u8 %rs5, [setmaxangle_param_9]; cvta.to.global.u64 %rd1, %rd10; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd9; cvta.to.global.u64 %rd4, %rd8; cvta.to.global.u64 %rd5, %rd7; mov.u32 %r35, %ntid.x; mov.u32 %r36, %ctaid.x; mov.u32 %r37, %tid.x; mad.lo.s32 %r1, %r35, %r36, %r37; mov.u32 %r38, %ntid.y; mov.u32 %r39, %ctaid.y; mov.u32 %r40, %tid.y; mad.lo.s32 %r2, %r38, %r39, %r40; mov.u32 %r41, %ntid.z; mov.u32 %r42, %ctaid.z; mov.u32 %r43, %tid.z; mad.lo.s32 %r3, %r41, %r42, %r43; setp.ge.s32 %p1, %r2, %r33; setp.ge.s32 %p2, %r1, %r32; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r34; or.pred %p5, %p3, %p4; @%p5 bra BB0_34; mul.lo.s32 %r4, %r3, %r33; add.s32 %r44, %r4, %r2; mul.lo.s32 %r5, %r44, %r32; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd12, %r6, 4; add.s64 %rd13, %rd5, %rd12; add.s64 %rd14, %rd4, %rd12; add.s64 %rd15, %rd3, %rd12; ld.global.nc.f32 %f1, [%rd13]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f38, %f2, %f2; fma.rn.f32 %f39, %f1, %f1, %f38; ld.global.nc.f32 %f3, [%rd15]; fma.rn.f32 %f40, %f3, %f3, %f39; setp.eq.f32 %p6, %f40, 0f00000000; @%p6 bra BB0_34; cvt.s64.s32 %rd16, %r6; add.s64 %rd17, %rd2, %rd16; ld.global.nc.u8 %rs1, [%rd17]; cvt.u32.u16 %r45, %rs1; and.b32 %r7, %r45, 255; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r46, %r8, %r32; add.s32 %r47, %r46, %r32; rem.s32 %r122, %r47, %r32; bra.uni BB0_5; BB0_4: mov.u32 %r48, 0; max.s32 %r122, %r8, %r48; BB0_5: add.s32 %r49, %r122, %r5; cvt.s64.s32 %rd18, %r49; mul.wide.s32 %rd19, %r49, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd21]; mul.f32 %f42, %f5, %f5; fma.rn.f32 %f43, %f4, %f4, %f42; 
ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f43; add.s64 %rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; setp.gt.u16 %p8, %rs6, %rs1; cvt.u32.u16 %r50, %rs6; and.b32 %r51, %r50, 255; selp.b32 %r52, %r7, %r51, %p8; selp.b32 %r53, %r51, %r7, %p8; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd24, %r57, 4; add.s64 %rd25, %rd1, %rd24; ld.global.nc.f32 %f44, [%rd25]; mov.f32 %f250, 0f00000000; setp.eq.f32 %p9, %f44, 0f00000000; @%p9 bra BB0_7; setp.eq.f32 %p10, %f7, 0f00000000; selp.f32 %f45, %f1, %f4, %p10; selp.f32 %f46, %f2, %f5, %p10; mul.f32 %f47, %f2, %f46; fma.rn.f32 %f48, %f1, %f45, %f47; selp.f32 %f49, %f3, %f6, %p10; fma.rn.f32 %f50, %f3, %f49, %f48; abs.f32 %f51, %f50; mov.f32 %f52, 0f3F800000; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, 0f3F000000; sqrt.rn.f32 %f55, %f54; setp.gt.f32 %p11, %f51, 0f3F11EB85; selp.f32 %f56, %f55, %f51, %p11; mul.f32 %f57, %f56, %f56; mov.f32 %f58, 0f3C94D2E9; mov.f32 %f59, 0f3D53F941; fma.rn.f32 %f60, %f59, %f57, %f58; mov.f32 %f61, 0f3D3F841F; fma.rn.f32 %f62, %f60, %f57, %f61; mov.f32 %f63, 0f3D994929; fma.rn.f32 %f64, %f62, %f57, %f63; mov.f32 %f65, 0f3E2AAB94; fma.rn.f32 %f66, %f64, %f57, %f65; mul.f32 %f67, %f57, %f66; fma.rn.f32 %f68, %f67, %f56, %f56; add.f32 %f69, %f68, %f68; mov.f32 %f70, 0f3FC90FDB; sub.f32 %f71, %f70, %f68; selp.f32 %f72, %f69, %f71, %p11; setp.lt.f32 %p12, %f50, 0f00000000; mov.f32 %f73, 0f40490FDB; sub.f32 %f74, %f73, %f72; selp.f32 %f75, %f74, %f72, %p12; mov.f32 %f76, 0f00000000; max.f32 %f250, %f76, %f75; BB0_7: add.s32 %r12, %r1, 1; @%p7 bra BB0_9; rem.s32 %r58, %r12, %r32; add.s32 %r59, %r58, %r32; rem.s32 %r123, %r59, %r32; bra.uni BB0_10; BB0_9: add.s32 %r60, %r32, -1; min.s32 %r123, %r12, %r60; BB0_10: add.s32 %r61, %r123, %r5; cvt.s64.s32 %rd26, %r61; mul.wide.s32 %rd27, %r61, 4; add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f10, [%rd28]; 
ld.global.nc.f32 %f11, [%rd29]; mul.f32 %f77, %f11, %f11; fma.rn.f32 %f78, %f10, %f10, %f77; ld.global.nc.f32 %f12, [%rd30]; fma.rn.f32 %f13, %f12, %f12, %f78; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs9, [%rd31]; setp.gt.u16 %p14, %rs9, %rs1; cvt.u32.u16 %r62, %rs9; and.b32 %r63, %r62, 255; selp.b32 %r64, %r7, %r63, %p14; selp.b32 %r65, %r63, %r7, %p14; add.s32 %r66, %r65, 1; mul.lo.s32 %r67, %r66, %r65; shr.u32 %r68, %r67, 1; add.s32 %r69, %r68, %r64; mul.wide.s32 %rd32, %r69, 4; add.s64 %rd33, %rd1, %rd32; ld.global.nc.f32 %f79, [%rd33]; setp.eq.f32 %p15, %f79, 0f00000000; @%p15 bra BB0_12; setp.eq.f32 %p16, %f13, 0f00000000; selp.f32 %f80, %f1, %f10, %p16; selp.f32 %f81, %f2, %f11, %p16; mul.f32 %f82, %f2, %f81; fma.rn.f32 %f83, %f1, %f80, %f82; selp.f32 %f84, %f3, %f12, %p16; fma.rn.f32 %f85, %f3, %f84, %f83; abs.f32 %f86, %f85; mov.f32 %f87, 0f3F800000; sub.f32 %f88, %f87, %f86; mul.f32 %f89, %f88, 0f3F000000; sqrt.rn.f32 %f90, %f89; setp.gt.f32 %p17, %f86, 0f3F11EB85; selp.f32 %f91, %f90, %f86, %p17; mul.f32 %f92, %f91, %f91; mov.f32 %f93, 0f3C94D2E9; mov.f32 %f94, 0f3D53F941; fma.rn.f32 %f95, %f94, %f92, %f93; mov.f32 %f96, 0f3D3F841F; fma.rn.f32 %f97, %f95, %f92, %f96; mov.f32 %f98, 0f3D994929; fma.rn.f32 %f99, %f97, %f92, %f98; mov.f32 %f100, 0f3E2AAB94; fma.rn.f32 %f101, %f99, %f92, %f100; mul.f32 %f102, %f92, %f101; fma.rn.f32 %f103, %f102, %f91, %f91; add.f32 %f104, %f103, %f103; mov.f32 %f105, 0f3FC90FDB; sub.f32 %f106, %f105, %f103; selp.f32 %f107, %f104, %f106, %p17; setp.lt.f32 %p18, %f85, 0f00000000; mov.f32 %f108, 0f40490FDB; sub.f32 %f109, %f108, %f107; selp.f32 %f110, %f109, %f107, %p18; max.f32 %f250, %f250, %f110; BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r16, %r2, -1; @%p19 bra BB0_14; rem.s32 %r70, %r16, %r33; add.s32 %r71, %r70, %r33; rem.s32 %r124, %r71, %r33; bra.uni BB0_15; BB0_14: mov.u32 %r72, 0; max.s32 %r124, %r16, %r72; BB0_15: add.s32 %r73, %r124, %r4; mad.lo.s32 %r74, %r73, %r32, %r1; cvt.s64.s32 
%rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, %rd3, %rd35; ld.global.nc.f32 %f16, [%rd36]; ld.global.nc.f32 %f17, [%rd37]; mul.f32 %f111, %f17, %f17; fma.rn.f32 %f112, %f16, %f16, %f111; ld.global.nc.f32 %f18, [%rd38]; fma.rn.f32 %f19, %f18, %f18, %f112; add.s64 %rd39, %rd2, %rd34; ld.global.nc.u8 %rs12, [%rd39]; setp.gt.u16 %p20, %rs12, %rs1; cvt.u32.u16 %r75, %rs12; and.b32 %r76, %r75, 255; selp.b32 %r77, %r7, %r76, %p20; selp.b32 %r78, %r76, %r7, %p20; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; add.s32 %r82, %r81, %r77; mul.wide.s32 %rd40, %r82, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f113, [%rd41]; setp.eq.f32 %p21, %f113, 0f00000000; @%p21 bra BB0_17; setp.eq.f32 %p22, %f19, 0f00000000; selp.f32 %f114, %f1, %f16, %p22; selp.f32 %f115, %f2, %f17, %p22; mul.f32 %f116, %f2, %f115; fma.rn.f32 %f117, %f1, %f114, %f116; selp.f32 %f118, %f3, %f18, %p22; fma.rn.f32 %f119, %f3, %f118, %f117; abs.f32 %f120, %f119; mov.f32 %f121, 0f3F800000; sub.f32 %f122, %f121, %f120; mul.f32 %f123, %f122, 0f3F000000; sqrt.rn.f32 %f124, %f123; setp.gt.f32 %p23, %f120, 0f3F11EB85; selp.f32 %f125, %f124, %f120, %p23; mul.f32 %f126, %f125, %f125; mov.f32 %f127, 0f3C94D2E9; mov.f32 %f128, 0f3D53F941; fma.rn.f32 %f129, %f128, %f126, %f127; mov.f32 %f130, 0f3D3F841F; fma.rn.f32 %f131, %f129, %f126, %f130; mov.f32 %f132, 0f3D994929; fma.rn.f32 %f133, %f131, %f126, %f132; mov.f32 %f134, 0f3E2AAB94; fma.rn.f32 %f135, %f133, %f126, %f134; mul.f32 %f136, %f126, %f135; fma.rn.f32 %f137, %f136, %f125, %f125; add.f32 %f138, %f137, %f137; mov.f32 %f139, 0f3FC90FDB; sub.f32 %f140, %f139, %f137; selp.f32 %f141, %f138, %f140, %p23; setp.lt.f32 %p24, %f119, 0f00000000; mov.f32 %f142, 0f40490FDB; sub.f32 %f143, %f142, %f141; selp.f32 %f144, %f143, %f141, %p24; max.f32 %f250, %f250, %f144; BB0_17: add.s32 %r20, %r2, 1; @%p19 bra BB0_19; rem.s32 %r83, %r20, %r33; add.s32 %r84, %r83, %r33; rem.s32 
%r125, %r84, %r33; bra.uni BB0_20; BB0_19: add.s32 %r85, %r33, -1; min.s32 %r125, %r20, %r85; BB0_20: add.s32 %r86, %r125, %r4; mad.lo.s32 %r87, %r86, %r32, %r1; cvt.s64.s32 %rd42, %r87; mul.wide.s32 %rd43, %r87, 4; add.s64 %rd44, %rd5, %rd43; add.s64 %rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f22, [%rd44]; ld.global.nc.f32 %f23, [%rd45]; mul.f32 %f145, %f23, %f23; fma.rn.f32 %f146, %f22, %f22, %f145; ld.global.nc.f32 %f24, [%rd46]; fma.rn.f32 %f25, %f24, %f24, %f146; add.s64 %rd47, %rd2, %rd42; ld.global.nc.u8 %rs16, [%rd47]; setp.gt.u16 %p26, %rs16, %rs1; cvt.u32.u16 %r88, %rs16; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p26; selp.b32 %r91, %r89, %r7, %p26; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f147, [%rd49]; setp.eq.f32 %p27, %f147, 0f00000000; @%p27 bra BB0_22; setp.eq.f32 %p28, %f25, 0f00000000; selp.f32 %f148, %f1, %f22, %p28; selp.f32 %f149, %f2, %f23, %p28; mul.f32 %f150, %f2, %f149; fma.rn.f32 %f151, %f1, %f148, %f150; selp.f32 %f152, %f3, %f24, %p28; fma.rn.f32 %f153, %f3, %f152, %f151; abs.f32 %f154, %f153; mov.f32 %f155, 0f3F800000; sub.f32 %f156, %f155, %f154; mul.f32 %f157, %f156, 0f3F000000; sqrt.rn.f32 %f158, %f157; setp.gt.f32 %p29, %f154, 0f3F11EB85; selp.f32 %f159, %f158, %f154, %p29; mul.f32 %f160, %f159, %f159; mov.f32 %f161, 0f3C94D2E9; mov.f32 %f162, 0f3D53F941; fma.rn.f32 %f163, %f162, %f160, %f161; mov.f32 %f164, 0f3D3F841F; fma.rn.f32 %f165, %f163, %f160, %f164; mov.f32 %f166, 0f3D994929; fma.rn.f32 %f167, %f165, %f160, %f166; mov.f32 %f168, 0f3E2AAB94; fma.rn.f32 %f169, %f167, %f160, %f168; mul.f32 %f170, %f160, %f169; fma.rn.f32 %f171, %f170, %f159, %f159; add.f32 %f172, %f171, %f171; mov.f32 %f173, 0f3FC90FDB; sub.f32 %f174, %f173, %f171; selp.f32 %f175, %f172, %f174, %p29; setp.lt.f32 %p30, %f153, 0f00000000; mov.f32 %f176, 0f40490FDB; sub.f32 %f177, %f176, %f175; 
selp.f32 %f178, %f177, %f175, %p30; max.f32 %f250, %f250, %f178; BB0_22: setp.eq.s32 %p31, %r34, 1; @%p31 bra BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 0; add.s32 %r24, %r3, -1; @%p32 bra BB0_25; rem.s32 %r96, %r24, %r34; add.s32 %r97, %r96, %r34; rem.s32 %r126, %r97, %r34; bra.uni BB0_26; BB0_25: mov.u32 %r98, 0; max.s32 %r126, %r24, %r98; BB0_26: mad.lo.s32 %r99, %r126, %r33, %r2; mad.lo.s32 %r100, %r99, %r32, %r1; cvt.s64.s32 %rd50, %r100; mul.wide.s32 %rd51, %r100, 4; add.s64 %rd52, %rd5, %rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; ld.global.nc.f32 %f179, [%rd52]; ld.global.nc.f32 %f180, [%rd53]; mul.f32 %f181, %f180, %f180; fma.rn.f32 %f182, %f179, %f179, %f181; ld.global.nc.f32 %f183, [%rd54]; fma.rn.f32 %f184, %f183, %f183, %f182; setp.eq.f32 %p33, %f184, 0f00000000; selp.f32 %f28, %f1, %f179, %p33; selp.f32 %f29, %f2, %f180, %p33; selp.f32 %f30, %f3, %f183, %p33; add.s64 %rd55, %rd2, %rd50; ld.global.nc.u8 %rs19, [%rd55]; setp.gt.u16 %p34, %rs19, %rs1; cvt.u32.u16 %r101, %rs19; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p34; selp.b32 %r104, %r102, %r7, %p34; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd56, %r108, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f185, [%rd57]; setp.eq.f32 %p35, %f185, 0f00000000; @%p35 bra BB0_28; mul.f32 %f186, %f2, %f29; fma.rn.f32 %f187, %f1, %f28, %f186; fma.rn.f32 %f188, %f3, %f30, %f187; abs.f32 %f189, %f188; mov.f32 %f190, 0f3F800000; sub.f32 %f191, %f190, %f189; mul.f32 %f192, %f191, 0f3F000000; sqrt.rn.f32 %f193, %f192; setp.gt.f32 %p36, %f189, 0f3F11EB85; selp.f32 %f194, %f193, %f189, %p36; mul.f32 %f195, %f194, %f194; mov.f32 %f196, 0f3C94D2E9; mov.f32 %f197, 0f3D53F941; fma.rn.f32 %f198, %f197, %f195, %f196; mov.f32 %f199, 0f3D3F841F; fma.rn.f32 %f200, %f198, %f195, %f199; mov.f32 %f201, 0f3D994929; fma.rn.f32 %f202, %f200, %f195, %f201; mov.f32 %f203, 0f3E2AAB94; fma.rn.f32 %f204, %f202, 
%f195, %f203; mul.f32 %f205, %f195, %f204; fma.rn.f32 %f206, %f205, %f194, %f194; add.f32 %f207, %f206, %f206; mov.f32 %f208, 0f3FC90FDB; sub.f32 %f209, %f208, %f206; selp.f32 %f210, %f207, %f209, %p36; setp.lt.f32 %p37, %f188, 0f00000000; mov.f32 %f211, 0f40490FDB; sub.f32 %f212, %f211, %f210; selp.f32 %f213, %f212, %f210, %p37; max.f32 %f250, %f250, %f213; BB0_28: add.s32 %r28, %r3, 1; @%p32 bra BB0_30; rem.s32 %r109, %r28, %r34; add.s32 %r110, %r109, %r34; rem.s32 %r127, %r110, %r34; bra.uni BB0_31; BB0_30: add.s32 %r111, %r34, -1; min.s32 %r127, %r28, %r111; BB0_31: mad.lo.s32 %r112, %r127, %r33, %r2; mad.lo.s32 %r113, %r112, %r32, %r1; cvt.s64.s32 %rd58, %r113; mul.wide.s32 %rd59, %r113, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; ld.global.nc.f32 %f214, [%rd60]; ld.global.nc.f32 %f215, [%rd61]; mul.f32 %f216, %f215, %f215; fma.rn.f32 %f217, %f214, %f214, %f216; ld.global.nc.f32 %f218, [%rd62]; fma.rn.f32 %f219, %f218, %f218, %f217; setp.eq.f32 %p39, %f219, 0f00000000; selp.f32 %f33, %f1, %f214, %p39; selp.f32 %f34, %f2, %f215, %p39; selp.f32 %f35, %f3, %f218, %p39; add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs23, [%rd63]; setp.gt.u16 %p40, %rs23, %rs1; cvt.u32.u16 %r114, %rs23; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p40; selp.b32 %r117, %r115, %r7, %p40; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd64, %r121, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 %f220, [%rd65]; setp.eq.f32 %p41, %f220, 0f00000000; @%p41 bra BB0_33; mul.f32 %f221, %f2, %f34; fma.rn.f32 %f222, %f1, %f33, %f221; fma.rn.f32 %f223, %f3, %f35, %f222; abs.f32 %f224, %f223; mov.f32 %f225, 0f3F800000; sub.f32 %f226, %f225, %f224; mul.f32 %f227, %f226, 0f3F000000; sqrt.rn.f32 %f228, %f227; setp.gt.f32 %p42, %f224, 0f3F11EB85; selp.f32 %f229, %f228, %f224, %p42; mul.f32 %f230, %f229, %f229; mov.f32 %f231, 0f3C94D2E9; mov.f32 %f232, 0f3D53F941; 
fma.rn.f32 %f233, %f232, %f230, %f231; mov.f32 %f234, 0f3D3F841F; fma.rn.f32 %f235, %f233, %f230, %f234; mov.f32 %f236, 0f3D994929; fma.rn.f32 %f237, %f235, %f230, %f236; mov.f32 %f238, 0f3E2AAB94; fma.rn.f32 %f239, %f237, %f230, %f238; mul.f32 %f240, %f230, %f239; fma.rn.f32 %f241, %f240, %f229, %f229; add.f32 %f242, %f241, %f241; mov.f32 %f243, 0f3FC90FDB; sub.f32 %f244, %f243, %f241; selp.f32 %f245, %f242, %f244, %p42; setp.lt.f32 %p43, %f223, 0f00000000; mov.f32 %f246, 0f40490FDB; sub.f32 %f247, %f246, %f245; selp.f32 %f248, %f247, %f245, %p43; max.f32 %f250, %f250, %f248; BB0_33: cvta.to.global.u64 %rd66, %rd6; add.s64 %rd68, %rd66, %rd12; st.global.f32 [%rd68], %f250; BB0_34: ret; } ` setmaxangle_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl setmaxangle .visible .entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param .u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, .param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, .param .u32 setmaxangle_param_8, .param .u8 setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<26>; .reg .f32 %f<255>; .reg .b32 %r<128>; .reg .b64 %rd<69>; ld.param.u64 %rd6, [setmaxangle_param_0]; ld.param.u64 %rd7, [setmaxangle_param_1]; ld.param.u64 %rd8, [setmaxangle_param_2]; ld.param.u64 %rd9, [setmaxangle_param_3]; ld.param.u64 %rd10, [setmaxangle_param_4]; ld.param.u64 %rd11, [setmaxangle_param_5]; ld.param.u32 %r32, [setmaxangle_param_6]; ld.param.u32 %r33, [setmaxangle_param_7]; ld.param.u32 %r34, [setmaxangle_param_8]; ld.param.u8 %rs5, [setmaxangle_param_9]; cvta.to.global.u64 %rd1, %rd10; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd9; cvta.to.global.u64 %rd4, %rd8; cvta.to.global.u64 %rd5, %rd7; mov.u32 %r35, %ntid.x; mov.u32 %r36, %ctaid.x; mov.u32 %r37, %tid.x; mad.lo.s32 %r1, %r35, %r36, %r37; mov.u32 %r38, %ntid.y; mov.u32 %r39, %ctaid.y; mov.u32 %r40, %tid.y; 
mad.lo.s32 %r2, %r38, %r39, %r40; mov.u32 %r41, %ntid.z; mov.u32 %r42, %ctaid.z; mov.u32 %r43, %tid.z; mad.lo.s32 %r3, %r41, %r42, %r43; setp.ge.s32 %p1, %r2, %r33; setp.ge.s32 %p2, %r1, %r32; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r34; or.pred %p5, %p3, %p4; @%p5 bra BB0_34; mul.lo.s32 %r4, %r3, %r33; add.s32 %r44, %r4, %r2; mul.lo.s32 %r5, %r44, %r32; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd12, %r6, 4; add.s64 %rd13, %rd5, %rd12; add.s64 %rd14, %rd4, %rd12; add.s64 %rd15, %rd3, %rd12; ld.global.nc.f32 %f1, [%rd13]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f38, %f2, %f2; fma.rn.f32 %f39, %f1, %f1, %f38; ld.global.nc.f32 %f3, [%rd15]; fma.rn.f32 %f40, %f3, %f3, %f39; setp.eq.f32 %p6, %f40, 0f00000000; @%p6 bra BB0_34; cvt.s64.s32 %rd16, %r6; add.s64 %rd17, %rd2, %rd16; ld.global.nc.u8 %rs1, [%rd17]; cvt.u32.u16 %r45, %rs1; and.b32 %r7, %r45, 255; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r46, %r8, %r32; add.s32 %r47, %r46, %r32; rem.s32 %r122, %r47, %r32; bra.uni BB0_5; BB0_4: mov.u32 %r48, 0; max.s32 %r122, %r8, %r48; BB0_5: add.s32 %r49, %r122, %r5; cvt.s64.s32 %rd18, %r49; mul.wide.s32 %rd19, %r49, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd21]; mul.f32 %f42, %f5, %f5; fma.rn.f32 %f43, %f4, %f4, %f42; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f43; add.s64 %rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; setp.gt.u16 %p8, %rs6, %rs1; cvt.u32.u16 %r50, %rs6; and.b32 %r51, %r50, 255; selp.b32 %r52, %r7, %r51, %p8; selp.b32 %r53, %r51, %r7, %p8; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd24, %r57, 4; add.s64 %rd25, %rd1, %rd24; ld.global.nc.f32 %f44, [%rd25]; mov.f32 %f250, 0f00000000; setp.eq.f32 %p9, %f44, 0f00000000; @%p9 bra BB0_7; setp.eq.f32 %p10, %f7, 0f00000000; selp.f32 %f45, %f1, %f4, %p10; selp.f32 %f46, %f2, 
%f5, %p10; mul.f32 %f47, %f2, %f46; fma.rn.f32 %f48, %f1, %f45, %f47; selp.f32 %f49, %f3, %f6, %p10; fma.rn.f32 %f50, %f3, %f49, %f48; abs.f32 %f51, %f50; mov.f32 %f52, 0f3F800000; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, 0f3F000000; sqrt.rn.f32 %f55, %f54; setp.gt.f32 %p11, %f51, 0f3F11EB85; selp.f32 %f56, %f55, %f51, %p11; mul.f32 %f57, %f56, %f56; mov.f32 %f58, 0f3C94D2E9; mov.f32 %f59, 0f3D53F941; fma.rn.f32 %f60, %f59, %f57, %f58; mov.f32 %f61, 0f3D3F841F; fma.rn.f32 %f62, %f60, %f57, %f61; mov.f32 %f63, 0f3D994929; fma.rn.f32 %f64, %f62, %f57, %f63; mov.f32 %f65, 0f3E2AAB94; fma.rn.f32 %f66, %f64, %f57, %f65; mul.f32 %f67, %f57, %f66; fma.rn.f32 %f68, %f67, %f56, %f56; add.f32 %f69, %f68, %f68; mov.f32 %f70, 0f3FC90FDB; sub.f32 %f71, %f70, %f68; selp.f32 %f72, %f69, %f71, %p11; setp.lt.f32 %p12, %f50, 0f00000000; mov.f32 %f73, 0f40490FDB; sub.f32 %f74, %f73, %f72; selp.f32 %f75, %f74, %f72, %p12; mov.f32 %f76, 0f00000000; max.f32 %f250, %f76, %f75; BB0_7: add.s32 %r12, %r1, 1; @%p7 bra BB0_9; rem.s32 %r58, %r12, %r32; add.s32 %r59, %r58, %r32; rem.s32 %r123, %r59, %r32; bra.uni BB0_10; BB0_9: add.s32 %r60, %r32, -1; min.s32 %r123, %r12, %r60; BB0_10: add.s32 %r61, %r123, %r5; cvt.s64.s32 %rd26, %r61; mul.wide.s32 %rd27, %r61, 4; add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f10, [%rd28]; ld.global.nc.f32 %f11, [%rd29]; mul.f32 %f77, %f11, %f11; fma.rn.f32 %f78, %f10, %f10, %f77; ld.global.nc.f32 %f12, [%rd30]; fma.rn.f32 %f13, %f12, %f12, %f78; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs9, [%rd31]; setp.gt.u16 %p14, %rs9, %rs1; cvt.u32.u16 %r62, %rs9; and.b32 %r63, %r62, 255; selp.b32 %r64, %r7, %r63, %p14; selp.b32 %r65, %r63, %r7, %p14; add.s32 %r66, %r65, 1; mul.lo.s32 %r67, %r66, %r65; shr.u32 %r68, %r67, 1; add.s32 %r69, %r68, %r64; mul.wide.s32 %rd32, %r69, 4; add.s64 %rd33, %rd1, %rd32; ld.global.nc.f32 %f79, [%rd33]; setp.eq.f32 %p15, %f79, 0f00000000; @%p15 bra BB0_12; setp.eq.f32 %p16, 
%f13, 0f00000000; selp.f32 %f80, %f1, %f10, %p16; selp.f32 %f81, %f2, %f11, %p16; mul.f32 %f82, %f2, %f81; fma.rn.f32 %f83, %f1, %f80, %f82; selp.f32 %f84, %f3, %f12, %p16; fma.rn.f32 %f85, %f3, %f84, %f83; abs.f32 %f86, %f85; mov.f32 %f87, 0f3F800000; sub.f32 %f88, %f87, %f86; mul.f32 %f89, %f88, 0f3F000000; sqrt.rn.f32 %f90, %f89; setp.gt.f32 %p17, %f86, 0f3F11EB85; selp.f32 %f91, %f90, %f86, %p17; mul.f32 %f92, %f91, %f91; mov.f32 %f93, 0f3C94D2E9; mov.f32 %f94, 0f3D53F941; fma.rn.f32 %f95, %f94, %f92, %f93; mov.f32 %f96, 0f3D3F841F; fma.rn.f32 %f97, %f95, %f92, %f96; mov.f32 %f98, 0f3D994929; fma.rn.f32 %f99, %f97, %f92, %f98; mov.f32 %f100, 0f3E2AAB94; fma.rn.f32 %f101, %f99, %f92, %f100; mul.f32 %f102, %f92, %f101; fma.rn.f32 %f103, %f102, %f91, %f91; add.f32 %f104, %f103, %f103; mov.f32 %f105, 0f3FC90FDB; sub.f32 %f106, %f105, %f103; selp.f32 %f107, %f104, %f106, %p17; setp.lt.f32 %p18, %f85, 0f00000000; mov.f32 %f108, 0f40490FDB; sub.f32 %f109, %f108, %f107; selp.f32 %f110, %f109, %f107, %p18; max.f32 %f250, %f250, %f110; BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r16, %r2, -1; @%p19 bra BB0_14; rem.s32 %r70, %r16, %r33; add.s32 %r71, %r70, %r33; rem.s32 %r124, %r71, %r33; bra.uni BB0_15; BB0_14: mov.u32 %r72, 0; max.s32 %r124, %r16, %r72; BB0_15: add.s32 %r73, %r124, %r4; mad.lo.s32 %r74, %r73, %r32, %r1; cvt.s64.s32 %rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, %rd3, %rd35; ld.global.nc.f32 %f16, [%rd36]; ld.global.nc.f32 %f17, [%rd37]; mul.f32 %f111, %f17, %f17; fma.rn.f32 %f112, %f16, %f16, %f111; ld.global.nc.f32 %f18, [%rd38]; fma.rn.f32 %f19, %f18, %f18, %f112; add.s64 %rd39, %rd2, %rd34; ld.global.nc.u8 %rs12, [%rd39]; setp.gt.u16 %p20, %rs12, %rs1; cvt.u32.u16 %r75, %rs12; and.b32 %r76, %r75, 255; selp.b32 %r77, %r7, %r76, %p20; selp.b32 %r78, %r76, %r7, %p20; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; add.s32 %r82, %r81, %r77; 
mul.wide.s32 %rd40, %r82, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f113, [%rd41]; setp.eq.f32 %p21, %f113, 0f00000000; @%p21 bra BB0_17; setp.eq.f32 %p22, %f19, 0f00000000; selp.f32 %f114, %f1, %f16, %p22; selp.f32 %f115, %f2, %f17, %p22; mul.f32 %f116, %f2, %f115; fma.rn.f32 %f117, %f1, %f114, %f116; selp.f32 %f118, %f3, %f18, %p22; fma.rn.f32 %f119, %f3, %f118, %f117; abs.f32 %f120, %f119; mov.f32 %f121, 0f3F800000; sub.f32 %f122, %f121, %f120; mul.f32 %f123, %f122, 0f3F000000; sqrt.rn.f32 %f124, %f123; setp.gt.f32 %p23, %f120, 0f3F11EB85; selp.f32 %f125, %f124, %f120, %p23; mul.f32 %f126, %f125, %f125; mov.f32 %f127, 0f3C94D2E9; mov.f32 %f128, 0f3D53F941; fma.rn.f32 %f129, %f128, %f126, %f127; mov.f32 %f130, 0f3D3F841F; fma.rn.f32 %f131, %f129, %f126, %f130; mov.f32 %f132, 0f3D994929; fma.rn.f32 %f133, %f131, %f126, %f132; mov.f32 %f134, 0f3E2AAB94; fma.rn.f32 %f135, %f133, %f126, %f134; mul.f32 %f136, %f126, %f135; fma.rn.f32 %f137, %f136, %f125, %f125; add.f32 %f138, %f137, %f137; mov.f32 %f139, 0f3FC90FDB; sub.f32 %f140, %f139, %f137; selp.f32 %f141, %f138, %f140, %p23; setp.lt.f32 %p24, %f119, 0f00000000; mov.f32 %f142, 0f40490FDB; sub.f32 %f143, %f142, %f141; selp.f32 %f144, %f143, %f141, %p24; max.f32 %f250, %f250, %f144; BB0_17: add.s32 %r20, %r2, 1; @%p19 bra BB0_19; rem.s32 %r83, %r20, %r33; add.s32 %r84, %r83, %r33; rem.s32 %r125, %r84, %r33; bra.uni BB0_20; BB0_19: add.s32 %r85, %r33, -1; min.s32 %r125, %r20, %r85; BB0_20: add.s32 %r86, %r125, %r4; mad.lo.s32 %r87, %r86, %r32, %r1; cvt.s64.s32 %rd42, %r87; mul.wide.s32 %rd43, %r87, 4; add.s64 %rd44, %rd5, %rd43; add.s64 %rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f22, [%rd44]; ld.global.nc.f32 %f23, [%rd45]; mul.f32 %f145, %f23, %f23; fma.rn.f32 %f146, %f22, %f22, %f145; ld.global.nc.f32 %f24, [%rd46]; fma.rn.f32 %f25, %f24, %f24, %f146; add.s64 %rd47, %rd2, %rd42; ld.global.nc.u8 %rs16, [%rd47]; setp.gt.u16 %p26, %rs16, %rs1; cvt.u32.u16 %r88, %rs16; and.b32 %r89, %r88, 
255; selp.b32 %r90, %r7, %r89, %p26; selp.b32 %r91, %r89, %r7, %p26; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f147, [%rd49]; setp.eq.f32 %p27, %f147, 0f00000000; @%p27 bra BB0_22; setp.eq.f32 %p28, %f25, 0f00000000; selp.f32 %f148, %f1, %f22, %p28; selp.f32 %f149, %f2, %f23, %p28; mul.f32 %f150, %f2, %f149; fma.rn.f32 %f151, %f1, %f148, %f150; selp.f32 %f152, %f3, %f24, %p28; fma.rn.f32 %f153, %f3, %f152, %f151; abs.f32 %f154, %f153; mov.f32 %f155, 0f3F800000; sub.f32 %f156, %f155, %f154; mul.f32 %f157, %f156, 0f3F000000; sqrt.rn.f32 %f158, %f157; setp.gt.f32 %p29, %f154, 0f3F11EB85; selp.f32 %f159, %f158, %f154, %p29; mul.f32 %f160, %f159, %f159; mov.f32 %f161, 0f3C94D2E9; mov.f32 %f162, 0f3D53F941; fma.rn.f32 %f163, %f162, %f160, %f161; mov.f32 %f164, 0f3D3F841F; fma.rn.f32 %f165, %f163, %f160, %f164; mov.f32 %f166, 0f3D994929; fma.rn.f32 %f167, %f165, %f160, %f166; mov.f32 %f168, 0f3E2AAB94; fma.rn.f32 %f169, %f167, %f160, %f168; mul.f32 %f170, %f160, %f169; fma.rn.f32 %f171, %f170, %f159, %f159; add.f32 %f172, %f171, %f171; mov.f32 %f173, 0f3FC90FDB; sub.f32 %f174, %f173, %f171; selp.f32 %f175, %f172, %f174, %p29; setp.lt.f32 %p30, %f153, 0f00000000; mov.f32 %f176, 0f40490FDB; sub.f32 %f177, %f176, %f175; selp.f32 %f178, %f177, %f175, %p30; max.f32 %f250, %f250, %f178; BB0_22: setp.eq.s32 %p31, %r34, 1; @%p31 bra BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 0; add.s32 %r24, %r3, -1; @%p32 bra BB0_25; rem.s32 %r96, %r24, %r34; add.s32 %r97, %r96, %r34; rem.s32 %r126, %r97, %r34; bra.uni BB0_26; BB0_25: mov.u32 %r98, 0; max.s32 %r126, %r24, %r98; BB0_26: mad.lo.s32 %r99, %r126, %r33, %r2; mad.lo.s32 %r100, %r99, %r32, %r1; cvt.s64.s32 %rd50, %r100; mul.wide.s32 %rd51, %r100, 4; add.s64 %rd52, %rd5, %rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; ld.global.nc.f32 %f179, [%rd52]; ld.global.nc.f32 %f180, 
[%rd53]; mul.f32 %f181, %f180, %f180; fma.rn.f32 %f182, %f179, %f179, %f181; ld.global.nc.f32 %f183, [%rd54]; fma.rn.f32 %f184, %f183, %f183, %f182; setp.eq.f32 %p33, %f184, 0f00000000; selp.f32 %f28, %f1, %f179, %p33; selp.f32 %f29, %f2, %f180, %p33; selp.f32 %f30, %f3, %f183, %p33; add.s64 %rd55, %rd2, %rd50; ld.global.nc.u8 %rs19, [%rd55]; setp.gt.u16 %p34, %rs19, %rs1; cvt.u32.u16 %r101, %rs19; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p34; selp.b32 %r104, %r102, %r7, %p34; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd56, %r108, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f185, [%rd57]; setp.eq.f32 %p35, %f185, 0f00000000; @%p35 bra BB0_28; mul.f32 %f186, %f2, %f29; fma.rn.f32 %f187, %f1, %f28, %f186; fma.rn.f32 %f188, %f3, %f30, %f187; abs.f32 %f189, %f188; mov.f32 %f190, 0f3F800000; sub.f32 %f191, %f190, %f189; mul.f32 %f192, %f191, 0f3F000000; sqrt.rn.f32 %f193, %f192; setp.gt.f32 %p36, %f189, 0f3F11EB85; selp.f32 %f194, %f193, %f189, %p36; mul.f32 %f195, %f194, %f194; mov.f32 %f196, 0f3C94D2E9; mov.f32 %f197, 0f3D53F941; fma.rn.f32 %f198, %f197, %f195, %f196; mov.f32 %f199, 0f3D3F841F; fma.rn.f32 %f200, %f198, %f195, %f199; mov.f32 %f201, 0f3D994929; fma.rn.f32 %f202, %f200, %f195, %f201; mov.f32 %f203, 0f3E2AAB94; fma.rn.f32 %f204, %f202, %f195, %f203; mul.f32 %f205, %f195, %f204; fma.rn.f32 %f206, %f205, %f194, %f194; add.f32 %f207, %f206, %f206; mov.f32 %f208, 0f3FC90FDB; sub.f32 %f209, %f208, %f206; selp.f32 %f210, %f207, %f209, %p36; setp.lt.f32 %p37, %f188, 0f00000000; mov.f32 %f211, 0f40490FDB; sub.f32 %f212, %f211, %f210; selp.f32 %f213, %f212, %f210, %p37; max.f32 %f250, %f250, %f213; BB0_28: add.s32 %r28, %r3, 1; @%p32 bra BB0_30; rem.s32 %r109, %r28, %r34; add.s32 %r110, %r109, %r34; rem.s32 %r127, %r110, %r34; bra.uni BB0_31; BB0_30: add.s32 %r111, %r34, -1; min.s32 %r127, %r28, %r111; BB0_31: mad.lo.s32 %r112, %r127, %r33, %r2; mad.lo.s32 %r113, 
%r112, %r32, %r1; cvt.s64.s32 %rd58, %r113; mul.wide.s32 %rd59, %r113, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; ld.global.nc.f32 %f214, [%rd60]; ld.global.nc.f32 %f215, [%rd61]; mul.f32 %f216, %f215, %f215; fma.rn.f32 %f217, %f214, %f214, %f216; ld.global.nc.f32 %f218, [%rd62]; fma.rn.f32 %f219, %f218, %f218, %f217; setp.eq.f32 %p39, %f219, 0f00000000; selp.f32 %f33, %f1, %f214, %p39; selp.f32 %f34, %f2, %f215, %p39; selp.f32 %f35, %f3, %f218, %p39; add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs23, [%rd63]; setp.gt.u16 %p40, %rs23, %rs1; cvt.u32.u16 %r114, %rs23; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p40; selp.b32 %r117, %r115, %r7, %p40; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd64, %r121, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 %f220, [%rd65]; setp.eq.f32 %p41, %f220, 0f00000000; @%p41 bra BB0_33; mul.f32 %f221, %f2, %f34; fma.rn.f32 %f222, %f1, %f33, %f221; fma.rn.f32 %f223, %f3, %f35, %f222; abs.f32 %f224, %f223; mov.f32 %f225, 0f3F800000; sub.f32 %f226, %f225, %f224; mul.f32 %f227, %f226, 0f3F000000; sqrt.rn.f32 %f228, %f227; setp.gt.f32 %p42, %f224, 0f3F11EB85; selp.f32 %f229, %f228, %f224, %p42; mul.f32 %f230, %f229, %f229; mov.f32 %f231, 0f3C94D2E9; mov.f32 %f232, 0f3D53F941; fma.rn.f32 %f233, %f232, %f230, %f231; mov.f32 %f234, 0f3D3F841F; fma.rn.f32 %f235, %f233, %f230, %f234; mov.f32 %f236, 0f3D994929; fma.rn.f32 %f237, %f235, %f230, %f236; mov.f32 %f238, 0f3E2AAB94; fma.rn.f32 %f239, %f237, %f230, %f238; mul.f32 %f240, %f230, %f239; fma.rn.f32 %f241, %f240, %f229, %f229; add.f32 %f242, %f241, %f241; mov.f32 %f243, 0f3FC90FDB; sub.f32 %f244, %f243, %f241; selp.f32 %f245, %f242, %f244, %p42; setp.lt.f32 %p43, %f223, 0f00000000; mov.f32 %f246, 0f40490FDB; sub.f32 %f247, %f246, %f245; selp.f32 %f248, %f247, %f245, %p43; max.f32 %f250, %f250, %f248; BB0_33: cvta.to.global.u64 %rd66, %rd6; add.s64 %rd68, 
%rd66, %rd12; st.global.f32 [%rd68], %f250; BB0_34: ret; } ` setmaxangle_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl setmaxangle .visible .entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param .u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, .param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, .param .u32 setmaxangle_param_8, .param .u8 setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<26>; .reg .f32 %f<255>; .reg .b32 %r<128>; .reg .b64 %rd<69>; ld.param.u64 %rd6, [setmaxangle_param_0]; ld.param.u64 %rd7, [setmaxangle_param_1]; ld.param.u64 %rd8, [setmaxangle_param_2]; ld.param.u64 %rd9, [setmaxangle_param_3]; ld.param.u64 %rd10, [setmaxangle_param_4]; ld.param.u64 %rd11, [setmaxangle_param_5]; ld.param.u32 %r32, [setmaxangle_param_6]; ld.param.u32 %r33, [setmaxangle_param_7]; ld.param.u32 %r34, [setmaxangle_param_8]; ld.param.u8 %rs5, [setmaxangle_param_9]; cvta.to.global.u64 %rd1, %rd10; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd9; cvta.to.global.u64 %rd4, %rd8; cvta.to.global.u64 %rd5, %rd7; mov.u32 %r35, %ntid.x; mov.u32 %r36, %ctaid.x; mov.u32 %r37, %tid.x; mad.lo.s32 %r1, %r35, %r36, %r37; mov.u32 %r38, %ntid.y; mov.u32 %r39, %ctaid.y; mov.u32 %r40, %tid.y; mad.lo.s32 %r2, %r38, %r39, %r40; mov.u32 %r41, %ntid.z; mov.u32 %r42, %ctaid.z; mov.u32 %r43, %tid.z; mad.lo.s32 %r3, %r41, %r42, %r43; setp.ge.s32 %p1, %r2, %r33; setp.ge.s32 %p2, %r1, %r32; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r34; or.pred %p5, %p3, %p4; @%p5 bra BB0_34; mul.lo.s32 %r4, %r3, %r33; add.s32 %r44, %r4, %r2; mul.lo.s32 %r5, %r44, %r32; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd12, %r6, 4; add.s64 %rd13, %rd5, %rd12; add.s64 %rd14, %rd4, %rd12; add.s64 %rd15, %rd3, %rd12; ld.global.nc.f32 %f1, [%rd13]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f38, %f2, %f2; fma.rn.f32 %f39, %f1, %f1, %f38; ld.global.nc.f32 %f3, 
[%rd15]; fma.rn.f32 %f40, %f3, %f3, %f39; setp.eq.f32 %p6, %f40, 0f00000000; @%p6 bra BB0_34; cvt.s64.s32 %rd16, %r6; add.s64 %rd17, %rd2, %rd16; ld.global.nc.u8 %rs1, [%rd17]; cvt.u32.u16 %r45, %rs1; and.b32 %r7, %r45, 255; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r46, %r8, %r32; add.s32 %r47, %r46, %r32; rem.s32 %r122, %r47, %r32; bra.uni BB0_5; BB0_4: mov.u32 %r48, 0; max.s32 %r122, %r8, %r48; BB0_5: add.s32 %r49, %r122, %r5; cvt.s64.s32 %rd18, %r49; mul.wide.s32 %rd19, %r49, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd21]; mul.f32 %f42, %f5, %f5; fma.rn.f32 %f43, %f4, %f4, %f42; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f43; add.s64 %rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; setp.gt.u16 %p8, %rs6, %rs1; cvt.u32.u16 %r50, %rs6; and.b32 %r51, %r50, 255; selp.b32 %r52, %r7, %r51, %p8; selp.b32 %r53, %r51, %r7, %p8; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd24, %r57, 4; add.s64 %rd25, %rd1, %rd24; ld.global.nc.f32 %f44, [%rd25]; mov.f32 %f250, 0f00000000; setp.eq.f32 %p9, %f44, 0f00000000; @%p9 bra BB0_7; setp.eq.f32 %p10, %f7, 0f00000000; selp.f32 %f45, %f1, %f4, %p10; selp.f32 %f46, %f2, %f5, %p10; mul.f32 %f47, %f2, %f46; fma.rn.f32 %f48, %f1, %f45, %f47; selp.f32 %f49, %f3, %f6, %p10; fma.rn.f32 %f50, %f3, %f49, %f48; abs.f32 %f51, %f50; mov.f32 %f52, 0f3F800000; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, 0f3F000000; sqrt.rn.f32 %f55, %f54; setp.gt.f32 %p11, %f51, 0f3F11EB85; selp.f32 %f56, %f55, %f51, %p11; mul.f32 %f57, %f56, %f56; mov.f32 %f58, 0f3C94D2E9; mov.f32 %f59, 0f3D53F941; fma.rn.f32 %f60, %f59, %f57, %f58; mov.f32 %f61, 0f3D3F841F; fma.rn.f32 %f62, %f60, %f57, %f61; mov.f32 %f63, 0f3D994929; fma.rn.f32 %f64, %f62, %f57, %f63; mov.f32 %f65, 0f3E2AAB94; fma.rn.f32 %f66, %f64, %f57, %f65; mul.f32 
%f67, %f57, %f66; fma.rn.f32 %f68, %f67, %f56, %f56; add.f32 %f69, %f68, %f68; mov.f32 %f70, 0f3FC90FDB; sub.f32 %f71, %f70, %f68; selp.f32 %f72, %f69, %f71, %p11; setp.lt.f32 %p12, %f50, 0f00000000; mov.f32 %f73, 0f40490FDB; sub.f32 %f74, %f73, %f72; selp.f32 %f75, %f74, %f72, %p12; mov.f32 %f76, 0f00000000; max.f32 %f250, %f76, %f75; BB0_7: add.s32 %r12, %r1, 1; @%p7 bra BB0_9; rem.s32 %r58, %r12, %r32; add.s32 %r59, %r58, %r32; rem.s32 %r123, %r59, %r32; bra.uni BB0_10; BB0_9: add.s32 %r60, %r32, -1; min.s32 %r123, %r12, %r60; BB0_10: add.s32 %r61, %r123, %r5; cvt.s64.s32 %rd26, %r61; mul.wide.s32 %rd27, %r61, 4; add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f10, [%rd28]; ld.global.nc.f32 %f11, [%rd29]; mul.f32 %f77, %f11, %f11; fma.rn.f32 %f78, %f10, %f10, %f77; ld.global.nc.f32 %f12, [%rd30]; fma.rn.f32 %f13, %f12, %f12, %f78; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs9, [%rd31]; setp.gt.u16 %p14, %rs9, %rs1; cvt.u32.u16 %r62, %rs9; and.b32 %r63, %r62, 255; selp.b32 %r64, %r7, %r63, %p14; selp.b32 %r65, %r63, %r7, %p14; add.s32 %r66, %r65, 1; mul.lo.s32 %r67, %r66, %r65; shr.u32 %r68, %r67, 1; add.s32 %r69, %r68, %r64; mul.wide.s32 %rd32, %r69, 4; add.s64 %rd33, %rd1, %rd32; ld.global.nc.f32 %f79, [%rd33]; setp.eq.f32 %p15, %f79, 0f00000000; @%p15 bra BB0_12; setp.eq.f32 %p16, %f13, 0f00000000; selp.f32 %f80, %f1, %f10, %p16; selp.f32 %f81, %f2, %f11, %p16; mul.f32 %f82, %f2, %f81; fma.rn.f32 %f83, %f1, %f80, %f82; selp.f32 %f84, %f3, %f12, %p16; fma.rn.f32 %f85, %f3, %f84, %f83; abs.f32 %f86, %f85; mov.f32 %f87, 0f3F800000; sub.f32 %f88, %f87, %f86; mul.f32 %f89, %f88, 0f3F000000; sqrt.rn.f32 %f90, %f89; setp.gt.f32 %p17, %f86, 0f3F11EB85; selp.f32 %f91, %f90, %f86, %p17; mul.f32 %f92, %f91, %f91; mov.f32 %f93, 0f3C94D2E9; mov.f32 %f94, 0f3D53F941; fma.rn.f32 %f95, %f94, %f92, %f93; mov.f32 %f96, 0f3D3F841F; fma.rn.f32 %f97, %f95, %f92, %f96; mov.f32 %f98, 0f3D994929; fma.rn.f32 %f99, %f97, %f92, 
%f98; mov.f32 %f100, 0f3E2AAB94; fma.rn.f32 %f101, %f99, %f92, %f100; mul.f32 %f102, %f92, %f101; fma.rn.f32 %f103, %f102, %f91, %f91; add.f32 %f104, %f103, %f103; mov.f32 %f105, 0f3FC90FDB; sub.f32 %f106, %f105, %f103; selp.f32 %f107, %f104, %f106, %p17; setp.lt.f32 %p18, %f85, 0f00000000; mov.f32 %f108, 0f40490FDB; sub.f32 %f109, %f108, %f107; selp.f32 %f110, %f109, %f107, %p18; max.f32 %f250, %f250, %f110; BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r16, %r2, -1; @%p19 bra BB0_14; rem.s32 %r70, %r16, %r33; add.s32 %r71, %r70, %r33; rem.s32 %r124, %r71, %r33; bra.uni BB0_15; BB0_14: mov.u32 %r72, 0; max.s32 %r124, %r16, %r72; BB0_15: add.s32 %r73, %r124, %r4; mad.lo.s32 %r74, %r73, %r32, %r1; cvt.s64.s32 %rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, %rd3, %rd35; ld.global.nc.f32 %f16, [%rd36]; ld.global.nc.f32 %f17, [%rd37]; mul.f32 %f111, %f17, %f17; fma.rn.f32 %f112, %f16, %f16, %f111; ld.global.nc.f32 %f18, [%rd38]; fma.rn.f32 %f19, %f18, %f18, %f112; add.s64 %rd39, %rd2, %rd34; ld.global.nc.u8 %rs12, [%rd39]; setp.gt.u16 %p20, %rs12, %rs1; cvt.u32.u16 %r75, %rs12; and.b32 %r76, %r75, 255; selp.b32 %r77, %r7, %r76, %p20; selp.b32 %r78, %r76, %r7, %p20; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; add.s32 %r82, %r81, %r77; mul.wide.s32 %rd40, %r82, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f113, [%rd41]; setp.eq.f32 %p21, %f113, 0f00000000; @%p21 bra BB0_17; setp.eq.f32 %p22, %f19, 0f00000000; selp.f32 %f114, %f1, %f16, %p22; selp.f32 %f115, %f2, %f17, %p22; mul.f32 %f116, %f2, %f115; fma.rn.f32 %f117, %f1, %f114, %f116; selp.f32 %f118, %f3, %f18, %p22; fma.rn.f32 %f119, %f3, %f118, %f117; abs.f32 %f120, %f119; mov.f32 %f121, 0f3F800000; sub.f32 %f122, %f121, %f120; mul.f32 %f123, %f122, 0f3F000000; sqrt.rn.f32 %f124, %f123; setp.gt.f32 %p23, %f120, 0f3F11EB85; selp.f32 %f125, %f124, %f120, %p23; mul.f32 %f126, %f125, %f125; mov.f32 
%f127, 0f3C94D2E9; mov.f32 %f128, 0f3D53F941; fma.rn.f32 %f129, %f128, %f126, %f127; mov.f32 %f130, 0f3D3F841F; fma.rn.f32 %f131, %f129, %f126, %f130; mov.f32 %f132, 0f3D994929; fma.rn.f32 %f133, %f131, %f126, %f132; mov.f32 %f134, 0f3E2AAB94; fma.rn.f32 %f135, %f133, %f126, %f134; mul.f32 %f136, %f126, %f135; fma.rn.f32 %f137, %f136, %f125, %f125; add.f32 %f138, %f137, %f137; mov.f32 %f139, 0f3FC90FDB; sub.f32 %f140, %f139, %f137; selp.f32 %f141, %f138, %f140, %p23; setp.lt.f32 %p24, %f119, 0f00000000; mov.f32 %f142, 0f40490FDB; sub.f32 %f143, %f142, %f141; selp.f32 %f144, %f143, %f141, %p24; max.f32 %f250, %f250, %f144; BB0_17: add.s32 %r20, %r2, 1; @%p19 bra BB0_19; rem.s32 %r83, %r20, %r33; add.s32 %r84, %r83, %r33; rem.s32 %r125, %r84, %r33; bra.uni BB0_20; BB0_19: add.s32 %r85, %r33, -1; min.s32 %r125, %r20, %r85; BB0_20: add.s32 %r86, %r125, %r4; mad.lo.s32 %r87, %r86, %r32, %r1; cvt.s64.s32 %rd42, %r87; mul.wide.s32 %rd43, %r87, 4; add.s64 %rd44, %rd5, %rd43; add.s64 %rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f22, [%rd44]; ld.global.nc.f32 %f23, [%rd45]; mul.f32 %f145, %f23, %f23; fma.rn.f32 %f146, %f22, %f22, %f145; ld.global.nc.f32 %f24, [%rd46]; fma.rn.f32 %f25, %f24, %f24, %f146; add.s64 %rd47, %rd2, %rd42; ld.global.nc.u8 %rs16, [%rd47]; setp.gt.u16 %p26, %rs16, %rs1; cvt.u32.u16 %r88, %rs16; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p26; selp.b32 %r91, %r89, %r7, %p26; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f147, [%rd49]; setp.eq.f32 %p27, %f147, 0f00000000; @%p27 bra BB0_22; setp.eq.f32 %p28, %f25, 0f00000000; selp.f32 %f148, %f1, %f22, %p28; selp.f32 %f149, %f2, %f23, %p28; mul.f32 %f150, %f2, %f149; fma.rn.f32 %f151, %f1, %f148, %f150; selp.f32 %f152, %f3, %f24, %p28; fma.rn.f32 %f153, %f3, %f152, %f151; abs.f32 %f154, %f153; mov.f32 %f155, 0f3F800000; sub.f32 %f156, %f155, %f154; 
mul.f32 %f157, %f156, 0f3F000000; sqrt.rn.f32 %f158, %f157; setp.gt.f32 %p29, %f154, 0f3F11EB85; selp.f32 %f159, %f158, %f154, %p29; mul.f32 %f160, %f159, %f159; mov.f32 %f161, 0f3C94D2E9; mov.f32 %f162, 0f3D53F941; fma.rn.f32 %f163, %f162, %f160, %f161; mov.f32 %f164, 0f3D3F841F; fma.rn.f32 %f165, %f163, %f160, %f164; mov.f32 %f166, 0f3D994929; fma.rn.f32 %f167, %f165, %f160, %f166; mov.f32 %f168, 0f3E2AAB94; fma.rn.f32 %f169, %f167, %f160, %f168; mul.f32 %f170, %f160, %f169; fma.rn.f32 %f171, %f170, %f159, %f159; add.f32 %f172, %f171, %f171; mov.f32 %f173, 0f3FC90FDB; sub.f32 %f174, %f173, %f171; selp.f32 %f175, %f172, %f174, %p29; setp.lt.f32 %p30, %f153, 0f00000000; mov.f32 %f176, 0f40490FDB; sub.f32 %f177, %f176, %f175; selp.f32 %f178, %f177, %f175, %p30; max.f32 %f250, %f250, %f178; BB0_22: setp.eq.s32 %p31, %r34, 1; @%p31 bra BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 0; add.s32 %r24, %r3, -1; @%p32 bra BB0_25; rem.s32 %r96, %r24, %r34; add.s32 %r97, %r96, %r34; rem.s32 %r126, %r97, %r34; bra.uni BB0_26; BB0_25: mov.u32 %r98, 0; max.s32 %r126, %r24, %r98; BB0_26: mad.lo.s32 %r99, %r126, %r33, %r2; mad.lo.s32 %r100, %r99, %r32, %r1; cvt.s64.s32 %rd50, %r100; mul.wide.s32 %rd51, %r100, 4; add.s64 %rd52, %rd5, %rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; ld.global.nc.f32 %f179, [%rd52]; ld.global.nc.f32 %f180, [%rd53]; mul.f32 %f181, %f180, %f180; fma.rn.f32 %f182, %f179, %f179, %f181; ld.global.nc.f32 %f183, [%rd54]; fma.rn.f32 %f184, %f183, %f183, %f182; setp.eq.f32 %p33, %f184, 0f00000000; selp.f32 %f28, %f1, %f179, %p33; selp.f32 %f29, %f2, %f180, %p33; selp.f32 %f30, %f3, %f183, %p33; add.s64 %rd55, %rd2, %rd50; ld.global.nc.u8 %rs19, [%rd55]; setp.gt.u16 %p34, %rs19, %rs1; cvt.u32.u16 %r101, %rs19; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p34; selp.b32 %r104, %r102, %r7, %p34; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd56, 
%r108, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f185, [%rd57]; setp.eq.f32 %p35, %f185, 0f00000000; @%p35 bra BB0_28; mul.f32 %f186, %f2, %f29; fma.rn.f32 %f187, %f1, %f28, %f186; fma.rn.f32 %f188, %f3, %f30, %f187; abs.f32 %f189, %f188; mov.f32 %f190, 0f3F800000; sub.f32 %f191, %f190, %f189; mul.f32 %f192, %f191, 0f3F000000; sqrt.rn.f32 %f193, %f192; setp.gt.f32 %p36, %f189, 0f3F11EB85; selp.f32 %f194, %f193, %f189, %p36; mul.f32 %f195, %f194, %f194; mov.f32 %f196, 0f3C94D2E9; mov.f32 %f197, 0f3D53F941; fma.rn.f32 %f198, %f197, %f195, %f196; mov.f32 %f199, 0f3D3F841F; fma.rn.f32 %f200, %f198, %f195, %f199; mov.f32 %f201, 0f3D994929; fma.rn.f32 %f202, %f200, %f195, %f201; mov.f32 %f203, 0f3E2AAB94; fma.rn.f32 %f204, %f202, %f195, %f203; mul.f32 %f205, %f195, %f204; fma.rn.f32 %f206, %f205, %f194, %f194; add.f32 %f207, %f206, %f206; mov.f32 %f208, 0f3FC90FDB; sub.f32 %f209, %f208, %f206; selp.f32 %f210, %f207, %f209, %p36; setp.lt.f32 %p37, %f188, 0f00000000; mov.f32 %f211, 0f40490FDB; sub.f32 %f212, %f211, %f210; selp.f32 %f213, %f212, %f210, %p37; max.f32 %f250, %f250, %f213; BB0_28: add.s32 %r28, %r3, 1; @%p32 bra BB0_30; rem.s32 %r109, %r28, %r34; add.s32 %r110, %r109, %r34; rem.s32 %r127, %r110, %r34; bra.uni BB0_31; BB0_30: add.s32 %r111, %r34, -1; min.s32 %r127, %r28, %r111; BB0_31: mad.lo.s32 %r112, %r127, %r33, %r2; mad.lo.s32 %r113, %r112, %r32, %r1; cvt.s64.s32 %rd58, %r113; mul.wide.s32 %rd59, %r113, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; ld.global.nc.f32 %f214, [%rd60]; ld.global.nc.f32 %f215, [%rd61]; mul.f32 %f216, %f215, %f215; fma.rn.f32 %f217, %f214, %f214, %f216; ld.global.nc.f32 %f218, [%rd62]; fma.rn.f32 %f219, %f218, %f218, %f217; setp.eq.f32 %p39, %f219, 0f00000000; selp.f32 %f33, %f1, %f214, %p39; selp.f32 %f34, %f2, %f215, %p39; selp.f32 %f35, %f3, %f218, %p39; add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs23, [%rd63]; setp.gt.u16 %p40, %rs23, %rs1; cvt.u32.u16 %r114, %rs23; and.b32 
%r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p40; selp.b32 %r117, %r115, %r7, %p40; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd64, %r121, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 %f220, [%rd65]; setp.eq.f32 %p41, %f220, 0f00000000; @%p41 bra BB0_33; mul.f32 %f221, %f2, %f34; fma.rn.f32 %f222, %f1, %f33, %f221; fma.rn.f32 %f223, %f3, %f35, %f222; abs.f32 %f224, %f223; mov.f32 %f225, 0f3F800000; sub.f32 %f226, %f225, %f224; mul.f32 %f227, %f226, 0f3F000000; sqrt.rn.f32 %f228, %f227; setp.gt.f32 %p42, %f224, 0f3F11EB85; selp.f32 %f229, %f228, %f224, %p42; mul.f32 %f230, %f229, %f229; mov.f32 %f231, 0f3C94D2E9; mov.f32 %f232, 0f3D53F941; fma.rn.f32 %f233, %f232, %f230, %f231; mov.f32 %f234, 0f3D3F841F; fma.rn.f32 %f235, %f233, %f230, %f234; mov.f32 %f236, 0f3D994929; fma.rn.f32 %f237, %f235, %f230, %f236; mov.f32 %f238, 0f3E2AAB94; fma.rn.f32 %f239, %f237, %f230, %f238; mul.f32 %f240, %f230, %f239; fma.rn.f32 %f241, %f240, %f229, %f229; add.f32 %f242, %f241, %f241; mov.f32 %f243, 0f3FC90FDB; sub.f32 %f244, %f243, %f241; selp.f32 %f245, %f242, %f244, %p42; setp.lt.f32 %p43, %f223, 0f00000000; mov.f32 %f246, 0f40490FDB; sub.f32 %f247, %f246, %f245; selp.f32 %f248, %f247, %f245, %p43; max.f32 %f250, %f250, %f248; BB0_33: cvta.to.global.u64 %rd66, %rd6; add.s64 %rd68, %rd66, %rd12; st.global.f32 [%rd68], %f250; BB0_34: ret; } ` setmaxangle_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl setmaxangle .visible .entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param .u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, .param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, .param .u32 setmaxangle_param_8, .param .u8 setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<26>; .reg .f32 %f<255>; .reg .b32 %r<128>; .reg .b64 %rd<69>; ld.param.u64 %rd6, 
[setmaxangle_param_0]; ld.param.u64 %rd7, [setmaxangle_param_1]; ld.param.u64 %rd8, [setmaxangle_param_2]; ld.param.u64 %rd9, [setmaxangle_param_3]; ld.param.u64 %rd10, [setmaxangle_param_4]; ld.param.u64 %rd11, [setmaxangle_param_5]; ld.param.u32 %r32, [setmaxangle_param_6]; ld.param.u32 %r33, [setmaxangle_param_7]; ld.param.u32 %r34, [setmaxangle_param_8]; ld.param.u8 %rs5, [setmaxangle_param_9]; cvta.to.global.u64 %rd1, %rd10; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd9; cvta.to.global.u64 %rd4, %rd8; cvta.to.global.u64 %rd5, %rd7; mov.u32 %r35, %ntid.x; mov.u32 %r36, %ctaid.x; mov.u32 %r37, %tid.x; mad.lo.s32 %r1, %r35, %r36, %r37; mov.u32 %r38, %ntid.y; mov.u32 %r39, %ctaid.y; mov.u32 %r40, %tid.y; mad.lo.s32 %r2, %r38, %r39, %r40; mov.u32 %r41, %ntid.z; mov.u32 %r42, %ctaid.z; mov.u32 %r43, %tid.z; mad.lo.s32 %r3, %r41, %r42, %r43; setp.ge.s32 %p1, %r2, %r33; setp.ge.s32 %p2, %r1, %r32; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r34; or.pred %p5, %p3, %p4; @%p5 bra BB0_34; mul.lo.s32 %r4, %r3, %r33; add.s32 %r44, %r4, %r2; mul.lo.s32 %r5, %r44, %r32; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd12, %r6, 4; add.s64 %rd13, %rd5, %rd12; add.s64 %rd14, %rd4, %rd12; add.s64 %rd15, %rd3, %rd12; ld.global.nc.f32 %f1, [%rd13]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f38, %f2, %f2; fma.rn.f32 %f39, %f1, %f1, %f38; ld.global.nc.f32 %f3, [%rd15]; fma.rn.f32 %f40, %f3, %f3, %f39; setp.eq.f32 %p6, %f40, 0f00000000; @%p6 bra BB0_34; cvt.s64.s32 %rd16, %r6; add.s64 %rd17, %rd2, %rd16; ld.global.nc.u8 %rs1, [%rd17]; cvt.u32.u16 %r45, %rs1; and.b32 %r7, %r45, 255; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r46, %r8, %r32; add.s32 %r47, %r46, %r32; rem.s32 %r122, %r47, %r32; bra.uni BB0_5; BB0_4: mov.u32 %r48, 0; max.s32 %r122, %r8, %r48; BB0_5: add.s32 %r49, %r122, %r5; cvt.s64.s32 %rd18, %r49; mul.wide.s32 %rd19, %r49, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; 
ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd21]; mul.f32 %f42, %f5, %f5; fma.rn.f32 %f43, %f4, %f4, %f42; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f43; add.s64 %rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; setp.gt.u16 %p8, %rs6, %rs1; cvt.u32.u16 %r50, %rs6; and.b32 %r51, %r50, 255; selp.b32 %r52, %r7, %r51, %p8; selp.b32 %r53, %r51, %r7, %p8; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd24, %r57, 4; add.s64 %rd25, %rd1, %rd24; ld.global.nc.f32 %f44, [%rd25]; mov.f32 %f250, 0f00000000; setp.eq.f32 %p9, %f44, 0f00000000; @%p9 bra BB0_7; setp.eq.f32 %p10, %f7, 0f00000000; selp.f32 %f45, %f1, %f4, %p10; selp.f32 %f46, %f2, %f5, %p10; mul.f32 %f47, %f2, %f46; fma.rn.f32 %f48, %f1, %f45, %f47; selp.f32 %f49, %f3, %f6, %p10; fma.rn.f32 %f50, %f3, %f49, %f48; abs.f32 %f51, %f50; mov.f32 %f52, 0f3F800000; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, 0f3F000000; sqrt.rn.f32 %f55, %f54; setp.gt.f32 %p11, %f51, 0f3F11EB85; selp.f32 %f56, %f55, %f51, %p11; mul.f32 %f57, %f56, %f56; mov.f32 %f58, 0f3C94D2E9; mov.f32 %f59, 0f3D53F941; fma.rn.f32 %f60, %f59, %f57, %f58; mov.f32 %f61, 0f3D3F841F; fma.rn.f32 %f62, %f60, %f57, %f61; mov.f32 %f63, 0f3D994929; fma.rn.f32 %f64, %f62, %f57, %f63; mov.f32 %f65, 0f3E2AAB94; fma.rn.f32 %f66, %f64, %f57, %f65; mul.f32 %f67, %f57, %f66; fma.rn.f32 %f68, %f67, %f56, %f56; add.f32 %f69, %f68, %f68; mov.f32 %f70, 0f3FC90FDB; sub.f32 %f71, %f70, %f68; selp.f32 %f72, %f69, %f71, %p11; setp.lt.f32 %p12, %f50, 0f00000000; mov.f32 %f73, 0f40490FDB; sub.f32 %f74, %f73, %f72; selp.f32 %f75, %f74, %f72, %p12; mov.f32 %f76, 0f00000000; max.f32 %f250, %f76, %f75; BB0_7: add.s32 %r12, %r1, 1; @%p7 bra BB0_9; rem.s32 %r58, %r12, %r32; add.s32 %r59, %r58, %r32; rem.s32 %r123, %r59, %r32; bra.uni BB0_10; BB0_9: add.s32 %r60, %r32, -1; min.s32 %r123, %r12, %r60; BB0_10: add.s32 %r61, %r123, %r5; cvt.s64.s32 %rd26, %r61; mul.wide.s32 %rd27, %r61, 4; 
add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f10, [%rd28]; ld.global.nc.f32 %f11, [%rd29]; mul.f32 %f77, %f11, %f11; fma.rn.f32 %f78, %f10, %f10, %f77; ld.global.nc.f32 %f12, [%rd30]; fma.rn.f32 %f13, %f12, %f12, %f78; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs9, [%rd31]; setp.gt.u16 %p14, %rs9, %rs1; cvt.u32.u16 %r62, %rs9; and.b32 %r63, %r62, 255; selp.b32 %r64, %r7, %r63, %p14; selp.b32 %r65, %r63, %r7, %p14; add.s32 %r66, %r65, 1; mul.lo.s32 %r67, %r66, %r65; shr.u32 %r68, %r67, 1; add.s32 %r69, %r68, %r64; mul.wide.s32 %rd32, %r69, 4; add.s64 %rd33, %rd1, %rd32; ld.global.nc.f32 %f79, [%rd33]; setp.eq.f32 %p15, %f79, 0f00000000; @%p15 bra BB0_12; setp.eq.f32 %p16, %f13, 0f00000000; selp.f32 %f80, %f1, %f10, %p16; selp.f32 %f81, %f2, %f11, %p16; mul.f32 %f82, %f2, %f81; fma.rn.f32 %f83, %f1, %f80, %f82; selp.f32 %f84, %f3, %f12, %p16; fma.rn.f32 %f85, %f3, %f84, %f83; abs.f32 %f86, %f85; mov.f32 %f87, 0f3F800000; sub.f32 %f88, %f87, %f86; mul.f32 %f89, %f88, 0f3F000000; sqrt.rn.f32 %f90, %f89; setp.gt.f32 %p17, %f86, 0f3F11EB85; selp.f32 %f91, %f90, %f86, %p17; mul.f32 %f92, %f91, %f91; mov.f32 %f93, 0f3C94D2E9; mov.f32 %f94, 0f3D53F941; fma.rn.f32 %f95, %f94, %f92, %f93; mov.f32 %f96, 0f3D3F841F; fma.rn.f32 %f97, %f95, %f92, %f96; mov.f32 %f98, 0f3D994929; fma.rn.f32 %f99, %f97, %f92, %f98; mov.f32 %f100, 0f3E2AAB94; fma.rn.f32 %f101, %f99, %f92, %f100; mul.f32 %f102, %f92, %f101; fma.rn.f32 %f103, %f102, %f91, %f91; add.f32 %f104, %f103, %f103; mov.f32 %f105, 0f3FC90FDB; sub.f32 %f106, %f105, %f103; selp.f32 %f107, %f104, %f106, %p17; setp.lt.f32 %p18, %f85, 0f00000000; mov.f32 %f108, 0f40490FDB; sub.f32 %f109, %f108, %f107; selp.f32 %f110, %f109, %f107, %p18; max.f32 %f250, %f250, %f110; BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r16, %r2, -1; @%p19 bra BB0_14; rem.s32 %r70, %r16, %r33; add.s32 %r71, %r70, %r33; rem.s32 %r124, %r71, %r33; bra.uni BB0_15; BB0_14: mov.u32 
%r72, 0; max.s32 %r124, %r16, %r72; BB0_15: add.s32 %r73, %r124, %r4; mad.lo.s32 %r74, %r73, %r32, %r1; cvt.s64.s32 %rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, %rd3, %rd35; ld.global.nc.f32 %f16, [%rd36]; ld.global.nc.f32 %f17, [%rd37]; mul.f32 %f111, %f17, %f17; fma.rn.f32 %f112, %f16, %f16, %f111; ld.global.nc.f32 %f18, [%rd38]; fma.rn.f32 %f19, %f18, %f18, %f112; add.s64 %rd39, %rd2, %rd34; ld.global.nc.u8 %rs12, [%rd39]; setp.gt.u16 %p20, %rs12, %rs1; cvt.u32.u16 %r75, %rs12; and.b32 %r76, %r75, 255; selp.b32 %r77, %r7, %r76, %p20; selp.b32 %r78, %r76, %r7, %p20; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; add.s32 %r82, %r81, %r77; mul.wide.s32 %rd40, %r82, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f113, [%rd41]; setp.eq.f32 %p21, %f113, 0f00000000; @%p21 bra BB0_17; setp.eq.f32 %p22, %f19, 0f00000000; selp.f32 %f114, %f1, %f16, %p22; selp.f32 %f115, %f2, %f17, %p22; mul.f32 %f116, %f2, %f115; fma.rn.f32 %f117, %f1, %f114, %f116; selp.f32 %f118, %f3, %f18, %p22; fma.rn.f32 %f119, %f3, %f118, %f117; abs.f32 %f120, %f119; mov.f32 %f121, 0f3F800000; sub.f32 %f122, %f121, %f120; mul.f32 %f123, %f122, 0f3F000000; sqrt.rn.f32 %f124, %f123; setp.gt.f32 %p23, %f120, 0f3F11EB85; selp.f32 %f125, %f124, %f120, %p23; mul.f32 %f126, %f125, %f125; mov.f32 %f127, 0f3C94D2E9; mov.f32 %f128, 0f3D53F941; fma.rn.f32 %f129, %f128, %f126, %f127; mov.f32 %f130, 0f3D3F841F; fma.rn.f32 %f131, %f129, %f126, %f130; mov.f32 %f132, 0f3D994929; fma.rn.f32 %f133, %f131, %f126, %f132; mov.f32 %f134, 0f3E2AAB94; fma.rn.f32 %f135, %f133, %f126, %f134; mul.f32 %f136, %f126, %f135; fma.rn.f32 %f137, %f136, %f125, %f125; add.f32 %f138, %f137, %f137; mov.f32 %f139, 0f3FC90FDB; sub.f32 %f140, %f139, %f137; selp.f32 %f141, %f138, %f140, %p23; setp.lt.f32 %p24, %f119, 0f00000000; mov.f32 %f142, 0f40490FDB; sub.f32 %f143, %f142, %f141; selp.f32 %f144, %f143, %f141, %p24; max.f32 %f250, %f250, 
%f144; BB0_17: add.s32 %r20, %r2, 1; @%p19 bra BB0_19; rem.s32 %r83, %r20, %r33; add.s32 %r84, %r83, %r33; rem.s32 %r125, %r84, %r33; bra.uni BB0_20; BB0_19: add.s32 %r85, %r33, -1; min.s32 %r125, %r20, %r85; BB0_20: add.s32 %r86, %r125, %r4; mad.lo.s32 %r87, %r86, %r32, %r1; cvt.s64.s32 %rd42, %r87; mul.wide.s32 %rd43, %r87, 4; add.s64 %rd44, %rd5, %rd43; add.s64 %rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f22, [%rd44]; ld.global.nc.f32 %f23, [%rd45]; mul.f32 %f145, %f23, %f23; fma.rn.f32 %f146, %f22, %f22, %f145; ld.global.nc.f32 %f24, [%rd46]; fma.rn.f32 %f25, %f24, %f24, %f146; add.s64 %rd47, %rd2, %rd42; ld.global.nc.u8 %rs16, [%rd47]; setp.gt.u16 %p26, %rs16, %rs1; cvt.u32.u16 %r88, %rs16; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p26; selp.b32 %r91, %r89, %r7, %p26; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f147, [%rd49]; setp.eq.f32 %p27, %f147, 0f00000000; @%p27 bra BB0_22; setp.eq.f32 %p28, %f25, 0f00000000; selp.f32 %f148, %f1, %f22, %p28; selp.f32 %f149, %f2, %f23, %p28; mul.f32 %f150, %f2, %f149; fma.rn.f32 %f151, %f1, %f148, %f150; selp.f32 %f152, %f3, %f24, %p28; fma.rn.f32 %f153, %f3, %f152, %f151; abs.f32 %f154, %f153; mov.f32 %f155, 0f3F800000; sub.f32 %f156, %f155, %f154; mul.f32 %f157, %f156, 0f3F000000; sqrt.rn.f32 %f158, %f157; setp.gt.f32 %p29, %f154, 0f3F11EB85; selp.f32 %f159, %f158, %f154, %p29; mul.f32 %f160, %f159, %f159; mov.f32 %f161, 0f3C94D2E9; mov.f32 %f162, 0f3D53F941; fma.rn.f32 %f163, %f162, %f160, %f161; mov.f32 %f164, 0f3D3F841F; fma.rn.f32 %f165, %f163, %f160, %f164; mov.f32 %f166, 0f3D994929; fma.rn.f32 %f167, %f165, %f160, %f166; mov.f32 %f168, 0f3E2AAB94; fma.rn.f32 %f169, %f167, %f160, %f168; mul.f32 %f170, %f160, %f169; fma.rn.f32 %f171, %f170, %f159, %f159; add.f32 %f172, %f171, %f171; mov.f32 %f173, 0f3FC90FDB; sub.f32 %f174, %f173, %f171; selp.f32 %f175, 
%f172, %f174, %p29; setp.lt.f32 %p30, %f153, 0f00000000; mov.f32 %f176, 0f40490FDB; sub.f32 %f177, %f176, %f175; selp.f32 %f178, %f177, %f175, %p30; max.f32 %f250, %f250, %f178; BB0_22: setp.eq.s32 %p31, %r34, 1; @%p31 bra BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 0; add.s32 %r24, %r3, -1; @%p32 bra BB0_25; rem.s32 %r96, %r24, %r34; add.s32 %r97, %r96, %r34; rem.s32 %r126, %r97, %r34; bra.uni BB0_26; BB0_25: mov.u32 %r98, 0; max.s32 %r126, %r24, %r98; BB0_26: mad.lo.s32 %r99, %r126, %r33, %r2; mad.lo.s32 %r100, %r99, %r32, %r1; cvt.s64.s32 %rd50, %r100; mul.wide.s32 %rd51, %r100, 4; add.s64 %rd52, %rd5, %rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; ld.global.nc.f32 %f179, [%rd52]; ld.global.nc.f32 %f180, [%rd53]; mul.f32 %f181, %f180, %f180; fma.rn.f32 %f182, %f179, %f179, %f181; ld.global.nc.f32 %f183, [%rd54]; fma.rn.f32 %f184, %f183, %f183, %f182; setp.eq.f32 %p33, %f184, 0f00000000; selp.f32 %f28, %f1, %f179, %p33; selp.f32 %f29, %f2, %f180, %p33; selp.f32 %f30, %f3, %f183, %p33; add.s64 %rd55, %rd2, %rd50; ld.global.nc.u8 %rs19, [%rd55]; setp.gt.u16 %p34, %rs19, %rs1; cvt.u32.u16 %r101, %rs19; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p34; selp.b32 %r104, %r102, %r7, %p34; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd56, %r108, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f185, [%rd57]; setp.eq.f32 %p35, %f185, 0f00000000; @%p35 bra BB0_28; mul.f32 %f186, %f2, %f29; fma.rn.f32 %f187, %f1, %f28, %f186; fma.rn.f32 %f188, %f3, %f30, %f187; abs.f32 %f189, %f188; mov.f32 %f190, 0f3F800000; sub.f32 %f191, %f190, %f189; mul.f32 %f192, %f191, 0f3F000000; sqrt.rn.f32 %f193, %f192; setp.gt.f32 %p36, %f189, 0f3F11EB85; selp.f32 %f194, %f193, %f189, %p36; mul.f32 %f195, %f194, %f194; mov.f32 %f196, 0f3C94D2E9; mov.f32 %f197, 0f3D53F941; fma.rn.f32 %f198, %f197, %f195, %f196; mov.f32 %f199, 0f3D3F841F; fma.rn.f32 %f200, %f198, %f195, %f199; 
mov.f32 %f201, 0f3D994929; fma.rn.f32 %f202, %f200, %f195, %f201; mov.f32 %f203, 0f3E2AAB94; fma.rn.f32 %f204, %f202, %f195, %f203; mul.f32 %f205, %f195, %f204; fma.rn.f32 %f206, %f205, %f194, %f194; add.f32 %f207, %f206, %f206; mov.f32 %f208, 0f3FC90FDB; sub.f32 %f209, %f208, %f206; selp.f32 %f210, %f207, %f209, %p36; setp.lt.f32 %p37, %f188, 0f00000000; mov.f32 %f211, 0f40490FDB; sub.f32 %f212, %f211, %f210; selp.f32 %f213, %f212, %f210, %p37; max.f32 %f250, %f250, %f213; BB0_28: add.s32 %r28, %r3, 1; @%p32 bra BB0_30; rem.s32 %r109, %r28, %r34; add.s32 %r110, %r109, %r34; rem.s32 %r127, %r110, %r34; bra.uni BB0_31; BB0_30: add.s32 %r111, %r34, -1; min.s32 %r127, %r28, %r111; BB0_31: mad.lo.s32 %r112, %r127, %r33, %r2; mad.lo.s32 %r113, %r112, %r32, %r1; cvt.s64.s32 %rd58, %r113; mul.wide.s32 %rd59, %r113, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; ld.global.nc.f32 %f214, [%rd60]; ld.global.nc.f32 %f215, [%rd61]; mul.f32 %f216, %f215, %f215; fma.rn.f32 %f217, %f214, %f214, %f216; ld.global.nc.f32 %f218, [%rd62]; fma.rn.f32 %f219, %f218, %f218, %f217; setp.eq.f32 %p39, %f219, 0f00000000; selp.f32 %f33, %f1, %f214, %p39; selp.f32 %f34, %f2, %f215, %p39; selp.f32 %f35, %f3, %f218, %p39; add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs23, [%rd63]; setp.gt.u16 %p40, %rs23, %rs1; cvt.u32.u16 %r114, %rs23; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p40; selp.b32 %r117, %r115, %r7, %p40; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd64, %r121, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 %f220, [%rd65]; setp.eq.f32 %p41, %f220, 0f00000000; @%p41 bra BB0_33; mul.f32 %f221, %f2, %f34; fma.rn.f32 %f222, %f1, %f33, %f221; fma.rn.f32 %f223, %f3, %f35, %f222; abs.f32 %f224, %f223; mov.f32 %f225, 0f3F800000; sub.f32 %f226, %f225, %f224; mul.f32 %f227, %f226, 0f3F000000; sqrt.rn.f32 %f228, %f227; setp.gt.f32 %p42, %f224, 0f3F11EB85; selp.f32 
%f229, %f228, %f224, %p42; mul.f32 %f230, %f229, %f229; mov.f32 %f231, 0f3C94D2E9; mov.f32 %f232, 0f3D53F941; fma.rn.f32 %f233, %f232, %f230, %f231; mov.f32 %f234, 0f3D3F841F; fma.rn.f32 %f235, %f233, %f230, %f234; mov.f32 %f236, 0f3D994929; fma.rn.f32 %f237, %f235, %f230, %f236; mov.f32 %f238, 0f3E2AAB94; fma.rn.f32 %f239, %f237, %f230, %f238; mul.f32 %f240, %f230, %f239; fma.rn.f32 %f241, %f240, %f229, %f229; add.f32 %f242, %f241, %f241; mov.f32 %f243, 0f3FC90FDB; sub.f32 %f244, %f243, %f241; selp.f32 %f245, %f242, %f244, %p42; setp.lt.f32 %p43, %f223, 0f00000000; mov.f32 %f246, 0f40490FDB; sub.f32 %f247, %f246, %f245; selp.f32 %f248, %f247, %f245, %p43; max.f32 %f250, %f250, %f248; BB0_33: cvta.to.global.u64 %rd66, %rd6; add.s64 %rd68, %rd66, %rd12; st.global.f32 [%rd68], %f250; BB0_34: ret; } ` setmaxangle_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl setmaxangle .visible .entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param .u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, .param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, .param .u32 setmaxangle_param_8, .param .u8 setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<26>; .reg .f32 %f<255>; .reg .b32 %r<128>; .reg .b64 %rd<69>; ld.param.u64 %rd6, [setmaxangle_param_0]; ld.param.u64 %rd7, [setmaxangle_param_1]; ld.param.u64 %rd8, [setmaxangle_param_2]; ld.param.u64 %rd9, [setmaxangle_param_3]; ld.param.u64 %rd10, [setmaxangle_param_4]; ld.param.u64 %rd11, [setmaxangle_param_5]; ld.param.u32 %r32, [setmaxangle_param_6]; ld.param.u32 %r33, [setmaxangle_param_7]; ld.param.u32 %r34, [setmaxangle_param_8]; ld.param.u8 %rs5, [setmaxangle_param_9]; cvta.to.global.u64 %rd1, %rd10; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd9; cvta.to.global.u64 %rd4, %rd8; cvta.to.global.u64 %rd5, %rd7; mov.u32 %r35, %ntid.x; mov.u32 %r36, %ctaid.x; mov.u32 %r37, 
%tid.x; mad.lo.s32 %r1, %r35, %r36, %r37; mov.u32 %r38, %ntid.y; mov.u32 %r39, %ctaid.y; mov.u32 %r40, %tid.y; mad.lo.s32 %r2, %r38, %r39, %r40; mov.u32 %r41, %ntid.z; mov.u32 %r42, %ctaid.z; mov.u32 %r43, %tid.z; mad.lo.s32 %r3, %r41, %r42, %r43; setp.ge.s32 %p1, %r2, %r33; setp.ge.s32 %p2, %r1, %r32; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r34; or.pred %p5, %p3, %p4; @%p5 bra BB0_34; mul.lo.s32 %r4, %r3, %r33; add.s32 %r44, %r4, %r2; mul.lo.s32 %r5, %r44, %r32; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd12, %r6, 4; add.s64 %rd13, %rd5, %rd12; add.s64 %rd14, %rd4, %rd12; add.s64 %rd15, %rd3, %rd12; ld.global.nc.f32 %f1, [%rd13]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f38, %f2, %f2; fma.rn.f32 %f39, %f1, %f1, %f38; ld.global.nc.f32 %f3, [%rd15]; fma.rn.f32 %f40, %f3, %f3, %f39; setp.eq.f32 %p6, %f40, 0f00000000; @%p6 bra BB0_34; cvt.s64.s32 %rd16, %r6; add.s64 %rd17, %rd2, %rd16; ld.global.nc.u8 %rs1, [%rd17]; cvt.u32.u16 %r45, %rs1; and.b32 %r7, %r45, 255; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r46, %r8, %r32; add.s32 %r47, %r46, %r32; rem.s32 %r122, %r47, %r32; bra.uni BB0_5; BB0_4: mov.u32 %r48, 0; max.s32 %r122, %r8, %r48; BB0_5: add.s32 %r49, %r122, %r5; cvt.s64.s32 %rd18, %r49; mul.wide.s32 %rd19, %r49, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd21]; mul.f32 %f42, %f5, %f5; fma.rn.f32 %f43, %f4, %f4, %f42; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f43; add.s64 %rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; setp.gt.u16 %p8, %rs6, %rs1; cvt.u32.u16 %r50, %rs6; and.b32 %r51, %r50, 255; selp.b32 %r52, %r7, %r51, %p8; selp.b32 %r53, %r51, %r7, %p8; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd24, %r57, 4; add.s64 %rd25, %rd1, %rd24; ld.global.nc.f32 %f44, [%rd25]; mov.f32 %f250, 0f00000000; setp.eq.f32 %p9, %f44, 
0f00000000; @%p9 bra BB0_7; setp.eq.f32 %p10, %f7, 0f00000000; selp.f32 %f45, %f1, %f4, %p10; selp.f32 %f46, %f2, %f5, %p10; mul.f32 %f47, %f2, %f46; fma.rn.f32 %f48, %f1, %f45, %f47; selp.f32 %f49, %f3, %f6, %p10; fma.rn.f32 %f50, %f3, %f49, %f48; abs.f32 %f51, %f50; mov.f32 %f52, 0f3F800000; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, 0f3F000000; sqrt.rn.f32 %f55, %f54; setp.gt.f32 %p11, %f51, 0f3F11EB85; selp.f32 %f56, %f55, %f51, %p11; mul.f32 %f57, %f56, %f56; mov.f32 %f58, 0f3C94D2E9; mov.f32 %f59, 0f3D53F941; fma.rn.f32 %f60, %f59, %f57, %f58; mov.f32 %f61, 0f3D3F841F; fma.rn.f32 %f62, %f60, %f57, %f61; mov.f32 %f63, 0f3D994929; fma.rn.f32 %f64, %f62, %f57, %f63; mov.f32 %f65, 0f3E2AAB94; fma.rn.f32 %f66, %f64, %f57, %f65; mul.f32 %f67, %f57, %f66; fma.rn.f32 %f68, %f67, %f56, %f56; add.f32 %f69, %f68, %f68; mov.f32 %f70, 0f3FC90FDB; sub.f32 %f71, %f70, %f68; selp.f32 %f72, %f69, %f71, %p11; setp.lt.f32 %p12, %f50, 0f00000000; mov.f32 %f73, 0f40490FDB; sub.f32 %f74, %f73, %f72; selp.f32 %f75, %f74, %f72, %p12; mov.f32 %f76, 0f00000000; max.f32 %f250, %f76, %f75; BB0_7: add.s32 %r12, %r1, 1; @%p7 bra BB0_9; rem.s32 %r58, %r12, %r32; add.s32 %r59, %r58, %r32; rem.s32 %r123, %r59, %r32; bra.uni BB0_10; BB0_9: add.s32 %r60, %r32, -1; min.s32 %r123, %r12, %r60; BB0_10: add.s32 %r61, %r123, %r5; cvt.s64.s32 %rd26, %r61; mul.wide.s32 %rd27, %r61, 4; add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f10, [%rd28]; ld.global.nc.f32 %f11, [%rd29]; mul.f32 %f77, %f11, %f11; fma.rn.f32 %f78, %f10, %f10, %f77; ld.global.nc.f32 %f12, [%rd30]; fma.rn.f32 %f13, %f12, %f12, %f78; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs9, [%rd31]; setp.gt.u16 %p14, %rs9, %rs1; cvt.u32.u16 %r62, %rs9; and.b32 %r63, %r62, 255; selp.b32 %r64, %r7, %r63, %p14; selp.b32 %r65, %r63, %r7, %p14; add.s32 %r66, %r65, 1; mul.lo.s32 %r67, %r66, %r65; shr.u32 %r68, %r67, 1; add.s32 %r69, %r68, %r64; mul.wide.s32 %rd32, %r69, 4; add.s64 %rd33, 
%rd1, %rd32; ld.global.nc.f32 %f79, [%rd33]; setp.eq.f32 %p15, %f79, 0f00000000; @%p15 bra BB0_12; setp.eq.f32 %p16, %f13, 0f00000000; selp.f32 %f80, %f1, %f10, %p16; selp.f32 %f81, %f2, %f11, %p16; mul.f32 %f82, %f2, %f81; fma.rn.f32 %f83, %f1, %f80, %f82; selp.f32 %f84, %f3, %f12, %p16; fma.rn.f32 %f85, %f3, %f84, %f83; abs.f32 %f86, %f85; mov.f32 %f87, 0f3F800000; sub.f32 %f88, %f87, %f86; mul.f32 %f89, %f88, 0f3F000000; sqrt.rn.f32 %f90, %f89; setp.gt.f32 %p17, %f86, 0f3F11EB85; selp.f32 %f91, %f90, %f86, %p17; mul.f32 %f92, %f91, %f91; mov.f32 %f93, 0f3C94D2E9; mov.f32 %f94, 0f3D53F941; fma.rn.f32 %f95, %f94, %f92, %f93; mov.f32 %f96, 0f3D3F841F; fma.rn.f32 %f97, %f95, %f92, %f96; mov.f32 %f98, 0f3D994929; fma.rn.f32 %f99, %f97, %f92, %f98; mov.f32 %f100, 0f3E2AAB94; fma.rn.f32 %f101, %f99, %f92, %f100; mul.f32 %f102, %f92, %f101; fma.rn.f32 %f103, %f102, %f91, %f91; add.f32 %f104, %f103, %f103; mov.f32 %f105, 0f3FC90FDB; sub.f32 %f106, %f105, %f103; selp.f32 %f107, %f104, %f106, %p17; setp.lt.f32 %p18, %f85, 0f00000000; mov.f32 %f108, 0f40490FDB; sub.f32 %f109, %f108, %f107; selp.f32 %f110, %f109, %f107, %p18; max.f32 %f250, %f250, %f110; BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r16, %r2, -1; @%p19 bra BB0_14; rem.s32 %r70, %r16, %r33; add.s32 %r71, %r70, %r33; rem.s32 %r124, %r71, %r33; bra.uni BB0_15; BB0_14: mov.u32 %r72, 0; max.s32 %r124, %r16, %r72; BB0_15: add.s32 %r73, %r124, %r4; mad.lo.s32 %r74, %r73, %r32, %r1; cvt.s64.s32 %rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, %rd3, %rd35; ld.global.nc.f32 %f16, [%rd36]; ld.global.nc.f32 %f17, [%rd37]; mul.f32 %f111, %f17, %f17; fma.rn.f32 %f112, %f16, %f16, %f111; ld.global.nc.f32 %f18, [%rd38]; fma.rn.f32 %f19, %f18, %f18, %f112; add.s64 %rd39, %rd2, %rd34; ld.global.nc.u8 %rs12, [%rd39]; setp.gt.u16 %p20, %rs12, %rs1; cvt.u32.u16 %r75, %rs12; and.b32 %r76, %r75, 255; selp.b32 %r77, %r7, %r76, %p20; selp.b32 %r78, 
%r76, %r7, %p20; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; add.s32 %r82, %r81, %r77; mul.wide.s32 %rd40, %r82, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f113, [%rd41]; setp.eq.f32 %p21, %f113, 0f00000000; @%p21 bra BB0_17; setp.eq.f32 %p22, %f19, 0f00000000; selp.f32 %f114, %f1, %f16, %p22; selp.f32 %f115, %f2, %f17, %p22; mul.f32 %f116, %f2, %f115; fma.rn.f32 %f117, %f1, %f114, %f116; selp.f32 %f118, %f3, %f18, %p22; fma.rn.f32 %f119, %f3, %f118, %f117; abs.f32 %f120, %f119; mov.f32 %f121, 0f3F800000; sub.f32 %f122, %f121, %f120; mul.f32 %f123, %f122, 0f3F000000; sqrt.rn.f32 %f124, %f123; setp.gt.f32 %p23, %f120, 0f3F11EB85; selp.f32 %f125, %f124, %f120, %p23; mul.f32 %f126, %f125, %f125; mov.f32 %f127, 0f3C94D2E9; mov.f32 %f128, 0f3D53F941; fma.rn.f32 %f129, %f128, %f126, %f127; mov.f32 %f130, 0f3D3F841F; fma.rn.f32 %f131, %f129, %f126, %f130; mov.f32 %f132, 0f3D994929; fma.rn.f32 %f133, %f131, %f126, %f132; mov.f32 %f134, 0f3E2AAB94; fma.rn.f32 %f135, %f133, %f126, %f134; mul.f32 %f136, %f126, %f135; fma.rn.f32 %f137, %f136, %f125, %f125; add.f32 %f138, %f137, %f137; mov.f32 %f139, 0f3FC90FDB; sub.f32 %f140, %f139, %f137; selp.f32 %f141, %f138, %f140, %p23; setp.lt.f32 %p24, %f119, 0f00000000; mov.f32 %f142, 0f40490FDB; sub.f32 %f143, %f142, %f141; selp.f32 %f144, %f143, %f141, %p24; max.f32 %f250, %f250, %f144; BB0_17: add.s32 %r20, %r2, 1; @%p19 bra BB0_19; rem.s32 %r83, %r20, %r33; add.s32 %r84, %r83, %r33; rem.s32 %r125, %r84, %r33; bra.uni BB0_20; BB0_19: add.s32 %r85, %r33, -1; min.s32 %r125, %r20, %r85; BB0_20: add.s32 %r86, %r125, %r4; mad.lo.s32 %r87, %r86, %r32, %r1; cvt.s64.s32 %rd42, %r87; mul.wide.s32 %rd43, %r87, 4; add.s64 %rd44, %rd5, %rd43; add.s64 %rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f22, [%rd44]; ld.global.nc.f32 %f23, [%rd45]; mul.f32 %f145, %f23, %f23; fma.rn.f32 %f146, %f22, %f22, %f145; ld.global.nc.f32 %f24, [%rd46]; fma.rn.f32 %f25, %f24, %f24, %f146; add.s64 %rd47, 
%rd2, %rd42; ld.global.nc.u8 %rs16, [%rd47]; setp.gt.u16 %p26, %rs16, %rs1; cvt.u32.u16 %r88, %rs16; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p26; selp.b32 %r91, %r89, %r7, %p26; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f147, [%rd49]; setp.eq.f32 %p27, %f147, 0f00000000; @%p27 bra BB0_22; setp.eq.f32 %p28, %f25, 0f00000000; selp.f32 %f148, %f1, %f22, %p28; selp.f32 %f149, %f2, %f23, %p28; mul.f32 %f150, %f2, %f149; fma.rn.f32 %f151, %f1, %f148, %f150; selp.f32 %f152, %f3, %f24, %p28; fma.rn.f32 %f153, %f3, %f152, %f151; abs.f32 %f154, %f153; mov.f32 %f155, 0f3F800000; sub.f32 %f156, %f155, %f154; mul.f32 %f157, %f156, 0f3F000000; sqrt.rn.f32 %f158, %f157; setp.gt.f32 %p29, %f154, 0f3F11EB85; selp.f32 %f159, %f158, %f154, %p29; mul.f32 %f160, %f159, %f159; mov.f32 %f161, 0f3C94D2E9; mov.f32 %f162, 0f3D53F941; fma.rn.f32 %f163, %f162, %f160, %f161; mov.f32 %f164, 0f3D3F841F; fma.rn.f32 %f165, %f163, %f160, %f164; mov.f32 %f166, 0f3D994929; fma.rn.f32 %f167, %f165, %f160, %f166; mov.f32 %f168, 0f3E2AAB94; fma.rn.f32 %f169, %f167, %f160, %f168; mul.f32 %f170, %f160, %f169; fma.rn.f32 %f171, %f170, %f159, %f159; add.f32 %f172, %f171, %f171; mov.f32 %f173, 0f3FC90FDB; sub.f32 %f174, %f173, %f171; selp.f32 %f175, %f172, %f174, %p29; setp.lt.f32 %p30, %f153, 0f00000000; mov.f32 %f176, 0f40490FDB; sub.f32 %f177, %f176, %f175; selp.f32 %f178, %f177, %f175, %p30; max.f32 %f250, %f250, %f178; BB0_22: setp.eq.s32 %p31, %r34, 1; @%p31 bra BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 0; add.s32 %r24, %r3, -1; @%p32 bra BB0_25; rem.s32 %r96, %r24, %r34; add.s32 %r97, %r96, %r34; rem.s32 %r126, %r97, %r34; bra.uni BB0_26; BB0_25: mov.u32 %r98, 0; max.s32 %r126, %r24, %r98; BB0_26: mad.lo.s32 %r99, %r126, %r33, %r2; mad.lo.s32 %r100, %r99, %r32, %r1; cvt.s64.s32 %rd50, %r100; mul.wide.s32 %rd51, %r100, 4; add.s64 %rd52, %rd5, 
%rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; ld.global.nc.f32 %f179, [%rd52]; ld.global.nc.f32 %f180, [%rd53]; mul.f32 %f181, %f180, %f180; fma.rn.f32 %f182, %f179, %f179, %f181; ld.global.nc.f32 %f183, [%rd54]; fma.rn.f32 %f184, %f183, %f183, %f182; setp.eq.f32 %p33, %f184, 0f00000000; selp.f32 %f28, %f1, %f179, %p33; selp.f32 %f29, %f2, %f180, %p33; selp.f32 %f30, %f3, %f183, %p33; add.s64 %rd55, %rd2, %rd50; ld.global.nc.u8 %rs19, [%rd55]; setp.gt.u16 %p34, %rs19, %rs1; cvt.u32.u16 %r101, %rs19; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p34; selp.b32 %r104, %r102, %r7, %p34; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd56, %r108, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f185, [%rd57]; setp.eq.f32 %p35, %f185, 0f00000000; @%p35 bra BB0_28; mul.f32 %f186, %f2, %f29; fma.rn.f32 %f187, %f1, %f28, %f186; fma.rn.f32 %f188, %f3, %f30, %f187; abs.f32 %f189, %f188; mov.f32 %f190, 0f3F800000; sub.f32 %f191, %f190, %f189; mul.f32 %f192, %f191, 0f3F000000; sqrt.rn.f32 %f193, %f192; setp.gt.f32 %p36, %f189, 0f3F11EB85; selp.f32 %f194, %f193, %f189, %p36; mul.f32 %f195, %f194, %f194; mov.f32 %f196, 0f3C94D2E9; mov.f32 %f197, 0f3D53F941; fma.rn.f32 %f198, %f197, %f195, %f196; mov.f32 %f199, 0f3D3F841F; fma.rn.f32 %f200, %f198, %f195, %f199; mov.f32 %f201, 0f3D994929; fma.rn.f32 %f202, %f200, %f195, %f201; mov.f32 %f203, 0f3E2AAB94; fma.rn.f32 %f204, %f202, %f195, %f203; mul.f32 %f205, %f195, %f204; fma.rn.f32 %f206, %f205, %f194, %f194; add.f32 %f207, %f206, %f206; mov.f32 %f208, 0f3FC90FDB; sub.f32 %f209, %f208, %f206; selp.f32 %f210, %f207, %f209, %p36; setp.lt.f32 %p37, %f188, 0f00000000; mov.f32 %f211, 0f40490FDB; sub.f32 %f212, %f211, %f210; selp.f32 %f213, %f212, %f210, %p37; max.f32 %f250, %f250, %f213; BB0_28: add.s32 %r28, %r3, 1; @%p32 bra BB0_30; rem.s32 %r109, %r28, %r34; add.s32 %r110, %r109, %r34; rem.s32 %r127, %r110, %r34; bra.uni BB0_31; 
BB0_30: add.s32 %r111, %r34, -1; min.s32 %r127, %r28, %r111; BB0_31: mad.lo.s32 %r112, %r127, %r33, %r2; mad.lo.s32 %r113, %r112, %r32, %r1; cvt.s64.s32 %rd58, %r113; mul.wide.s32 %rd59, %r113, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; ld.global.nc.f32 %f214, [%rd60]; ld.global.nc.f32 %f215, [%rd61]; mul.f32 %f216, %f215, %f215; fma.rn.f32 %f217, %f214, %f214, %f216; ld.global.nc.f32 %f218, [%rd62]; fma.rn.f32 %f219, %f218, %f218, %f217; setp.eq.f32 %p39, %f219, 0f00000000; selp.f32 %f33, %f1, %f214, %p39; selp.f32 %f34, %f2, %f215, %p39; selp.f32 %f35, %f3, %f218, %p39; add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs23, [%rd63]; setp.gt.u16 %p40, %rs23, %rs1; cvt.u32.u16 %r114, %rs23; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p40; selp.b32 %r117, %r115, %r7, %p40; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd64, %r121, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 %f220, [%rd65]; setp.eq.f32 %p41, %f220, 0f00000000; @%p41 bra BB0_33; mul.f32 %f221, %f2, %f34; fma.rn.f32 %f222, %f1, %f33, %f221; fma.rn.f32 %f223, %f3, %f35, %f222; abs.f32 %f224, %f223; mov.f32 %f225, 0f3F800000; sub.f32 %f226, %f225, %f224; mul.f32 %f227, %f226, 0f3F000000; sqrt.rn.f32 %f228, %f227; setp.gt.f32 %p42, %f224, 0f3F11EB85; selp.f32 %f229, %f228, %f224, %p42; mul.f32 %f230, %f229, %f229; mov.f32 %f231, 0f3C94D2E9; mov.f32 %f232, 0f3D53F941; fma.rn.f32 %f233, %f232, %f230, %f231; mov.f32 %f234, 0f3D3F841F; fma.rn.f32 %f235, %f233, %f230, %f234; mov.f32 %f236, 0f3D994929; fma.rn.f32 %f237, %f235, %f230, %f236; mov.f32 %f238, 0f3E2AAB94; fma.rn.f32 %f239, %f237, %f230, %f238; mul.f32 %f240, %f230, %f239; fma.rn.f32 %f241, %f240, %f229, %f229; add.f32 %f242, %f241, %f241; mov.f32 %f243, 0f3FC90FDB; sub.f32 %f244, %f243, %f241; selp.f32 %f245, %f242, %f244, %p42; setp.lt.f32 %p43, %f223, 0f00000000; mov.f32 %f246, 0f40490FDB; sub.f32 %f247, %f246, 
%f245; selp.f32 %f248, %f247, %f245, %p43; max.f32 %f250, %f250, %f248; BB0_33: cvta.to.global.u64 %rd66, %rd6; add.s64 %rd68, %rd66, %rd12; st.global.f32 [%rd68], %f250; BB0_34: ret; } ` setmaxangle_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl setmaxangle .visible .entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param .u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, .param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, .param .u32 setmaxangle_param_8, .param .u8 setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<26>; .reg .f32 %f<255>; .reg .b32 %r<128>; .reg .b64 %rd<69>; ld.param.u64 %rd6, [setmaxangle_param_0]; ld.param.u64 %rd7, [setmaxangle_param_1]; ld.param.u64 %rd8, [setmaxangle_param_2]; ld.param.u64 %rd9, [setmaxangle_param_3]; ld.param.u64 %rd10, [setmaxangle_param_4]; ld.param.u64 %rd11, [setmaxangle_param_5]; ld.param.u32 %r32, [setmaxangle_param_6]; ld.param.u32 %r33, [setmaxangle_param_7]; ld.param.u32 %r34, [setmaxangle_param_8]; ld.param.u8 %rs5, [setmaxangle_param_9]; cvta.to.global.u64 %rd1, %rd10; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd9; cvta.to.global.u64 %rd4, %rd8; cvta.to.global.u64 %rd5, %rd7; mov.u32 %r35, %ntid.x; mov.u32 %r36, %ctaid.x; mov.u32 %r37, %tid.x; mad.lo.s32 %r1, %r35, %r36, %r37; mov.u32 %r38, %ntid.y; mov.u32 %r39, %ctaid.y; mov.u32 %r40, %tid.y; mad.lo.s32 %r2, %r38, %r39, %r40; mov.u32 %r41, %ntid.z; mov.u32 %r42, %ctaid.z; mov.u32 %r43, %tid.z; mad.lo.s32 %r3, %r41, %r42, %r43; setp.ge.s32 %p1, %r2, %r33; setp.ge.s32 %p2, %r1, %r32; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r34; or.pred %p5, %p3, %p4; @%p5 bra BB0_34; mul.lo.s32 %r4, %r3, %r33; add.s32 %r44, %r4, %r2; mul.lo.s32 %r5, %r44, %r32; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd12, %r6, 4; add.s64 %rd13, %rd5, %rd12; add.s64 %rd14, %rd4, %rd12; add.s64 %rd15, %rd3, %rd12; ld.global.nc.f32 
%f1, [%rd13]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f38, %f2, %f2; fma.rn.f32 %f39, %f1, %f1, %f38; ld.global.nc.f32 %f3, [%rd15]; fma.rn.f32 %f40, %f3, %f3, %f39; setp.eq.f32 %p6, %f40, 0f00000000; @%p6 bra BB0_34; cvt.s64.s32 %rd16, %r6; add.s64 %rd17, %rd2, %rd16; ld.global.nc.u8 %rs1, [%rd17]; cvt.u32.u16 %r45, %rs1; and.b32 %r7, %r45, 255; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r46, %r8, %r32; add.s32 %r47, %r46, %r32; rem.s32 %r122, %r47, %r32; bra.uni BB0_5; BB0_4: mov.u32 %r48, 0; max.s32 %r122, %r8, %r48; BB0_5: add.s32 %r49, %r122, %r5; cvt.s64.s32 %rd18, %r49; mul.wide.s32 %rd19, %r49, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd21]; mul.f32 %f42, %f5, %f5; fma.rn.f32 %f43, %f4, %f4, %f42; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f43; add.s64 %rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; setp.gt.u16 %p8, %rs6, %rs1; cvt.u32.u16 %r50, %rs6; and.b32 %r51, %r50, 255; selp.b32 %r52, %r7, %r51, %p8; selp.b32 %r53, %r51, %r7, %p8; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd24, %r57, 4; add.s64 %rd25, %rd1, %rd24; ld.global.nc.f32 %f44, [%rd25]; mov.f32 %f250, 0f00000000; setp.eq.f32 %p9, %f44, 0f00000000; @%p9 bra BB0_7; setp.eq.f32 %p10, %f7, 0f00000000; selp.f32 %f45, %f1, %f4, %p10; selp.f32 %f46, %f2, %f5, %p10; mul.f32 %f47, %f2, %f46; fma.rn.f32 %f48, %f1, %f45, %f47; selp.f32 %f49, %f3, %f6, %p10; fma.rn.f32 %f50, %f3, %f49, %f48; abs.f32 %f51, %f50; mov.f32 %f52, 0f3F800000; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, 0f3F000000; sqrt.rn.f32 %f55, %f54; setp.gt.f32 %p11, %f51, 0f3F11EB85; selp.f32 %f56, %f55, %f51, %p11; mul.f32 %f57, %f56, %f56; mov.f32 %f58, 0f3C94D2E9; mov.f32 %f59, 0f3D53F941; fma.rn.f32 %f60, %f59, %f57, %f58; mov.f32 %f61, 0f3D3F841F; fma.rn.f32 %f62, %f60, %f57, %f61; mov.f32 
%f63, 0f3D994929; fma.rn.f32 %f64, %f62, %f57, %f63; mov.f32 %f65, 0f3E2AAB94; fma.rn.f32 %f66, %f64, %f57, %f65; mul.f32 %f67, %f57, %f66; fma.rn.f32 %f68, %f67, %f56, %f56; add.f32 %f69, %f68, %f68; mov.f32 %f70, 0f3FC90FDB; sub.f32 %f71, %f70, %f68; selp.f32 %f72, %f69, %f71, %p11; setp.lt.f32 %p12, %f50, 0f00000000; mov.f32 %f73, 0f40490FDB; sub.f32 %f74, %f73, %f72; selp.f32 %f75, %f74, %f72, %p12; mov.f32 %f76, 0f00000000; max.f32 %f250, %f76, %f75; BB0_7: add.s32 %r12, %r1, 1; @%p7 bra BB0_9; rem.s32 %r58, %r12, %r32; add.s32 %r59, %r58, %r32; rem.s32 %r123, %r59, %r32; bra.uni BB0_10; BB0_9: add.s32 %r60, %r32, -1; min.s32 %r123, %r12, %r60; BB0_10: add.s32 %r61, %r123, %r5; cvt.s64.s32 %rd26, %r61; mul.wide.s32 %rd27, %r61, 4; add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f10, [%rd28]; ld.global.nc.f32 %f11, [%rd29]; mul.f32 %f77, %f11, %f11; fma.rn.f32 %f78, %f10, %f10, %f77; ld.global.nc.f32 %f12, [%rd30]; fma.rn.f32 %f13, %f12, %f12, %f78; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs9, [%rd31]; setp.gt.u16 %p14, %rs9, %rs1; cvt.u32.u16 %r62, %rs9; and.b32 %r63, %r62, 255; selp.b32 %r64, %r7, %r63, %p14; selp.b32 %r65, %r63, %r7, %p14; add.s32 %r66, %r65, 1; mul.lo.s32 %r67, %r66, %r65; shr.u32 %r68, %r67, 1; add.s32 %r69, %r68, %r64; mul.wide.s32 %rd32, %r69, 4; add.s64 %rd33, %rd1, %rd32; ld.global.nc.f32 %f79, [%rd33]; setp.eq.f32 %p15, %f79, 0f00000000; @%p15 bra BB0_12; setp.eq.f32 %p16, %f13, 0f00000000; selp.f32 %f80, %f1, %f10, %p16; selp.f32 %f81, %f2, %f11, %p16; mul.f32 %f82, %f2, %f81; fma.rn.f32 %f83, %f1, %f80, %f82; selp.f32 %f84, %f3, %f12, %p16; fma.rn.f32 %f85, %f3, %f84, %f83; abs.f32 %f86, %f85; mov.f32 %f87, 0f3F800000; sub.f32 %f88, %f87, %f86; mul.f32 %f89, %f88, 0f3F000000; sqrt.rn.f32 %f90, %f89; setp.gt.f32 %p17, %f86, 0f3F11EB85; selp.f32 %f91, %f90, %f86, %p17; mul.f32 %f92, %f91, %f91; mov.f32 %f93, 0f3C94D2E9; mov.f32 %f94, 0f3D53F941; fma.rn.f32 %f95, %f94, %f92, 
%f93; mov.f32 %f96, 0f3D3F841F; fma.rn.f32 %f97, %f95, %f92, %f96; mov.f32 %f98, 0f3D994929; fma.rn.f32 %f99, %f97, %f92, %f98; mov.f32 %f100, 0f3E2AAB94; fma.rn.f32 %f101, %f99, %f92, %f100; mul.f32 %f102, %f92, %f101; fma.rn.f32 %f103, %f102, %f91, %f91; add.f32 %f104, %f103, %f103; mov.f32 %f105, 0f3FC90FDB; sub.f32 %f106, %f105, %f103; selp.f32 %f107, %f104, %f106, %p17; setp.lt.f32 %p18, %f85, 0f00000000; mov.f32 %f108, 0f40490FDB; sub.f32 %f109, %f108, %f107; selp.f32 %f110, %f109, %f107, %p18; max.f32 %f250, %f250, %f110; BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r16, %r2, -1; @%p19 bra BB0_14; rem.s32 %r70, %r16, %r33; add.s32 %r71, %r70, %r33; rem.s32 %r124, %r71, %r33; bra.uni BB0_15; BB0_14: mov.u32 %r72, 0; max.s32 %r124, %r16, %r72; BB0_15: add.s32 %r73, %r124, %r4; mad.lo.s32 %r74, %r73, %r32, %r1; cvt.s64.s32 %rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, %rd3, %rd35; ld.global.nc.f32 %f16, [%rd36]; ld.global.nc.f32 %f17, [%rd37]; mul.f32 %f111, %f17, %f17; fma.rn.f32 %f112, %f16, %f16, %f111; ld.global.nc.f32 %f18, [%rd38]; fma.rn.f32 %f19, %f18, %f18, %f112; add.s64 %rd39, %rd2, %rd34; ld.global.nc.u8 %rs12, [%rd39]; setp.gt.u16 %p20, %rs12, %rs1; cvt.u32.u16 %r75, %rs12; and.b32 %r76, %r75, 255; selp.b32 %r77, %r7, %r76, %p20; selp.b32 %r78, %r76, %r7, %p20; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; add.s32 %r82, %r81, %r77; mul.wide.s32 %rd40, %r82, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f113, [%rd41]; setp.eq.f32 %p21, %f113, 0f00000000; @%p21 bra BB0_17; setp.eq.f32 %p22, %f19, 0f00000000; selp.f32 %f114, %f1, %f16, %p22; selp.f32 %f115, %f2, %f17, %p22; mul.f32 %f116, %f2, %f115; fma.rn.f32 %f117, %f1, %f114, %f116; selp.f32 %f118, %f3, %f18, %p22; fma.rn.f32 %f119, %f3, %f118, %f117; abs.f32 %f120, %f119; mov.f32 %f121, 0f3F800000; sub.f32 %f122, %f121, %f120; mul.f32 %f123, %f122, 0f3F000000; sqrt.rn.f32 
%f124, %f123; setp.gt.f32 %p23, %f120, 0f3F11EB85; selp.f32 %f125, %f124, %f120, %p23; mul.f32 %f126, %f125, %f125; mov.f32 %f127, 0f3C94D2E9; mov.f32 %f128, 0f3D53F941; fma.rn.f32 %f129, %f128, %f126, %f127; mov.f32 %f130, 0f3D3F841F; fma.rn.f32 %f131, %f129, %f126, %f130; mov.f32 %f132, 0f3D994929; fma.rn.f32 %f133, %f131, %f126, %f132; mov.f32 %f134, 0f3E2AAB94; fma.rn.f32 %f135, %f133, %f126, %f134; mul.f32 %f136, %f126, %f135; fma.rn.f32 %f137, %f136, %f125, %f125; add.f32 %f138, %f137, %f137; mov.f32 %f139, 0f3FC90FDB; sub.f32 %f140, %f139, %f137; selp.f32 %f141, %f138, %f140, %p23; setp.lt.f32 %p24, %f119, 0f00000000; mov.f32 %f142, 0f40490FDB; sub.f32 %f143, %f142, %f141; selp.f32 %f144, %f143, %f141, %p24; max.f32 %f250, %f250, %f144; BB0_17: add.s32 %r20, %r2, 1; @%p19 bra BB0_19; rem.s32 %r83, %r20, %r33; add.s32 %r84, %r83, %r33; rem.s32 %r125, %r84, %r33; bra.uni BB0_20; BB0_19: add.s32 %r85, %r33, -1; min.s32 %r125, %r20, %r85; BB0_20: add.s32 %r86, %r125, %r4; mad.lo.s32 %r87, %r86, %r32, %r1; cvt.s64.s32 %rd42, %r87; mul.wide.s32 %rd43, %r87, 4; add.s64 %rd44, %rd5, %rd43; add.s64 %rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f22, [%rd44]; ld.global.nc.f32 %f23, [%rd45]; mul.f32 %f145, %f23, %f23; fma.rn.f32 %f146, %f22, %f22, %f145; ld.global.nc.f32 %f24, [%rd46]; fma.rn.f32 %f25, %f24, %f24, %f146; add.s64 %rd47, %rd2, %rd42; ld.global.nc.u8 %rs16, [%rd47]; setp.gt.u16 %p26, %rs16, %rs1; cvt.u32.u16 %r88, %rs16; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p26; selp.b32 %r91, %r89, %r7, %p26; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f147, [%rd49]; setp.eq.f32 %p27, %f147, 0f00000000; @%p27 bra BB0_22; setp.eq.f32 %p28, %f25, 0f00000000; selp.f32 %f148, %f1, %f22, %p28; selp.f32 %f149, %f2, %f23, %p28; mul.f32 %f150, %f2, %f149; fma.rn.f32 %f151, %f1, %f148, %f150; selp.f32 %f152, %f3, 
%f24, %p28; fma.rn.f32 %f153, %f3, %f152, %f151; abs.f32 %f154, %f153; mov.f32 %f155, 0f3F800000; sub.f32 %f156, %f155, %f154; mul.f32 %f157, %f156, 0f3F000000; sqrt.rn.f32 %f158, %f157; setp.gt.f32 %p29, %f154, 0f3F11EB85; selp.f32 %f159, %f158, %f154, %p29; mul.f32 %f160, %f159, %f159; mov.f32 %f161, 0f3C94D2E9; mov.f32 %f162, 0f3D53F941; fma.rn.f32 %f163, %f162, %f160, %f161; mov.f32 %f164, 0f3D3F841F; fma.rn.f32 %f165, %f163, %f160, %f164; mov.f32 %f166, 0f3D994929; fma.rn.f32 %f167, %f165, %f160, %f166; mov.f32 %f168, 0f3E2AAB94; fma.rn.f32 %f169, %f167, %f160, %f168; mul.f32 %f170, %f160, %f169; fma.rn.f32 %f171, %f170, %f159, %f159; add.f32 %f172, %f171, %f171; mov.f32 %f173, 0f3FC90FDB; sub.f32 %f174, %f173, %f171; selp.f32 %f175, %f172, %f174, %p29; setp.lt.f32 %p30, %f153, 0f00000000; mov.f32 %f176, 0f40490FDB; sub.f32 %f177, %f176, %f175; selp.f32 %f178, %f177, %f175, %p30; max.f32 %f250, %f250, %f178; BB0_22: setp.eq.s32 %p31, %r34, 1; @%p31 bra BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 0; add.s32 %r24, %r3, -1; @%p32 bra BB0_25; rem.s32 %r96, %r24, %r34; add.s32 %r97, %r96, %r34; rem.s32 %r126, %r97, %r34; bra.uni BB0_26; BB0_25: mov.u32 %r98, 0; max.s32 %r126, %r24, %r98; BB0_26: mad.lo.s32 %r99, %r126, %r33, %r2; mad.lo.s32 %r100, %r99, %r32, %r1; cvt.s64.s32 %rd50, %r100; mul.wide.s32 %rd51, %r100, 4; add.s64 %rd52, %rd5, %rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; ld.global.nc.f32 %f179, [%rd52]; ld.global.nc.f32 %f180, [%rd53]; mul.f32 %f181, %f180, %f180; fma.rn.f32 %f182, %f179, %f179, %f181; ld.global.nc.f32 %f183, [%rd54]; fma.rn.f32 %f184, %f183, %f183, %f182; setp.eq.f32 %p33, %f184, 0f00000000; selp.f32 %f28, %f1, %f179, %p33; selp.f32 %f29, %f2, %f180, %p33; selp.f32 %f30, %f3, %f183, %p33; add.s64 %rd55, %rd2, %rd50; ld.global.nc.u8 %rs19, [%rd55]; setp.gt.u16 %p34, %rs19, %rs1; cvt.u32.u16 %r101, %rs19; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p34; selp.b32 %r104, %r102, %r7, %p34; add.s32 
%r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd56, %r108, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f185, [%rd57]; setp.eq.f32 %p35, %f185, 0f00000000; @%p35 bra BB0_28; mul.f32 %f186, %f2, %f29; fma.rn.f32 %f187, %f1, %f28, %f186; fma.rn.f32 %f188, %f3, %f30, %f187; abs.f32 %f189, %f188; mov.f32 %f190, 0f3F800000; sub.f32 %f191, %f190, %f189; mul.f32 %f192, %f191, 0f3F000000; sqrt.rn.f32 %f193, %f192; setp.gt.f32 %p36, %f189, 0f3F11EB85; selp.f32 %f194, %f193, %f189, %p36; mul.f32 %f195, %f194, %f194; mov.f32 %f196, 0f3C94D2E9; mov.f32 %f197, 0f3D53F941; fma.rn.f32 %f198, %f197, %f195, %f196; mov.f32 %f199, 0f3D3F841F; fma.rn.f32 %f200, %f198, %f195, %f199; mov.f32 %f201, 0f3D994929; fma.rn.f32 %f202, %f200, %f195, %f201; mov.f32 %f203, 0f3E2AAB94; fma.rn.f32 %f204, %f202, %f195, %f203; mul.f32 %f205, %f195, %f204; fma.rn.f32 %f206, %f205, %f194, %f194; add.f32 %f207, %f206, %f206; mov.f32 %f208, 0f3FC90FDB; sub.f32 %f209, %f208, %f206; selp.f32 %f210, %f207, %f209, %p36; setp.lt.f32 %p37, %f188, 0f00000000; mov.f32 %f211, 0f40490FDB; sub.f32 %f212, %f211, %f210; selp.f32 %f213, %f212, %f210, %p37; max.f32 %f250, %f250, %f213; BB0_28: add.s32 %r28, %r3, 1; @%p32 bra BB0_30; rem.s32 %r109, %r28, %r34; add.s32 %r110, %r109, %r34; rem.s32 %r127, %r110, %r34; bra.uni BB0_31; BB0_30: add.s32 %r111, %r34, -1; min.s32 %r127, %r28, %r111; BB0_31: mad.lo.s32 %r112, %r127, %r33, %r2; mad.lo.s32 %r113, %r112, %r32, %r1; cvt.s64.s32 %rd58, %r113; mul.wide.s32 %rd59, %r113, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; ld.global.nc.f32 %f214, [%rd60]; ld.global.nc.f32 %f215, [%rd61]; mul.f32 %f216, %f215, %f215; fma.rn.f32 %f217, %f214, %f214, %f216; ld.global.nc.f32 %f218, [%rd62]; fma.rn.f32 %f219, %f218, %f218, %f217; setp.eq.f32 %p39, %f219, 0f00000000; selp.f32 %f33, %f1, %f214, %p39; selp.f32 %f34, %f2, %f215, %p39; selp.f32 %f35, %f3, %f218, %p39; 
add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs23, [%rd63]; setp.gt.u16 %p40, %rs23, %rs1; cvt.u32.u16 %r114, %rs23; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p40; selp.b32 %r117, %r115, %r7, %p40; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd64, %r121, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 %f220, [%rd65]; setp.eq.f32 %p41, %f220, 0f00000000; @%p41 bra BB0_33; mul.f32 %f221, %f2, %f34; fma.rn.f32 %f222, %f1, %f33, %f221; fma.rn.f32 %f223, %f3, %f35, %f222; abs.f32 %f224, %f223; mov.f32 %f225, 0f3F800000; sub.f32 %f226, %f225, %f224; mul.f32 %f227, %f226, 0f3F000000; sqrt.rn.f32 %f228, %f227; setp.gt.f32 %p42, %f224, 0f3F11EB85; selp.f32 %f229, %f228, %f224, %p42; mul.f32 %f230, %f229, %f229; mov.f32 %f231, 0f3C94D2E9; mov.f32 %f232, 0f3D53F941; fma.rn.f32 %f233, %f232, %f230, %f231; mov.f32 %f234, 0f3D3F841F; fma.rn.f32 %f235, %f233, %f230, %f234; mov.f32 %f236, 0f3D994929; fma.rn.f32 %f237, %f235, %f230, %f236; mov.f32 %f238, 0f3E2AAB94; fma.rn.f32 %f239, %f237, %f230, %f238; mul.f32 %f240, %f230, %f239; fma.rn.f32 %f241, %f240, %f229, %f229; add.f32 %f242, %f241, %f241; mov.f32 %f243, 0f3FC90FDB; sub.f32 %f244, %f243, %f241; selp.f32 %f245, %f242, %f244, %p42; setp.lt.f32 %p43, %f223, 0f00000000; mov.f32 %f246, 0f40490FDB; sub.f32 %f247, %f246, %f245; selp.f32 %f248, %f247, %f245, %p43; max.f32 %f250, %f250, %f248; BB0_33: cvta.to.global.u64 %rd66, %rd6; add.s64 %rd68, %rd66, %rd12; st.global.f32 [%rd68], %f250; BB0_34: ret; } ` setmaxangle_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl setmaxangle .visible .entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param .u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, .param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, .param .u32 setmaxangle_param_8, .param .u8 
setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<26>; .reg .f32 %f<255>; .reg .b32 %r<128>; .reg .b64 %rd<69>; ld.param.u64 %rd6, [setmaxangle_param_0]; ld.param.u64 %rd7, [setmaxangle_param_1]; ld.param.u64 %rd8, [setmaxangle_param_2]; ld.param.u64 %rd9, [setmaxangle_param_3]; ld.param.u64 %rd10, [setmaxangle_param_4]; ld.param.u64 %rd11, [setmaxangle_param_5]; ld.param.u32 %r32, [setmaxangle_param_6]; ld.param.u32 %r33, [setmaxangle_param_7]; ld.param.u32 %r34, [setmaxangle_param_8]; ld.param.u8 %rs5, [setmaxangle_param_9]; cvta.to.global.u64 %rd1, %rd10; cvta.to.global.u64 %rd2, %rd11; cvta.to.global.u64 %rd3, %rd9; cvta.to.global.u64 %rd4, %rd8; cvta.to.global.u64 %rd5, %rd7; mov.u32 %r35, %ntid.x; mov.u32 %r36, %ctaid.x; mov.u32 %r37, %tid.x; mad.lo.s32 %r1, %r35, %r36, %r37; mov.u32 %r38, %ntid.y; mov.u32 %r39, %ctaid.y; mov.u32 %r40, %tid.y; mad.lo.s32 %r2, %r38, %r39, %r40; mov.u32 %r41, %ntid.z; mov.u32 %r42, %ctaid.z; mov.u32 %r43, %tid.z; mad.lo.s32 %r3, %r41, %r42, %r43; setp.ge.s32 %p1, %r2, %r33; setp.ge.s32 %p2, %r1, %r32; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r34; or.pred %p5, %p3, %p4; @%p5 bra BB0_34; mul.lo.s32 %r4, %r3, %r33; add.s32 %r44, %r4, %r2; mul.lo.s32 %r5, %r44, %r32; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd12, %r6, 4; add.s64 %rd13, %rd5, %rd12; add.s64 %rd14, %rd4, %rd12; add.s64 %rd15, %rd3, %rd12; ld.global.nc.f32 %f1, [%rd13]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f38, %f2, %f2; fma.rn.f32 %f39, %f1, %f1, %f38; ld.global.nc.f32 %f3, [%rd15]; fma.rn.f32 %f40, %f3, %f3, %f39; setp.eq.f32 %p6, %f40, 0f00000000; @%p6 bra BB0_34; cvt.s64.s32 %rd16, %r6; add.s64 %rd17, %rd2, %rd16; ld.global.nc.u8 %rs1, [%rd17]; cvt.u32.u16 %r45, %rs1; and.b32 %r7, %r45, 255; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r46, %r8, %r32; add.s32 %r47, %r46, %r32; rem.s32 %r122, %r47, %r32; bra.uni BB0_5; BB0_4: mov.u32 %r48, 0; max.s32 %r122, %r8, %r48; BB0_5: add.s32 %r49, %r122, %r5; 
cvt.s64.s32 %rd18, %r49; mul.wide.s32 %rd19, %r49, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd21]; mul.f32 %f42, %f5, %f5; fma.rn.f32 %f43, %f4, %f4, %f42; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f43; add.s64 %rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; setp.gt.u16 %p8, %rs6, %rs1; cvt.u32.u16 %r50, %rs6; and.b32 %r51, %r50, 255; selp.b32 %r52, %r7, %r51, %p8; selp.b32 %r53, %r51, %r7, %p8; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, %r56, %r52; mul.wide.s32 %rd24, %r57, 4; add.s64 %rd25, %rd1, %rd24; ld.global.nc.f32 %f44, [%rd25]; mov.f32 %f250, 0f00000000; setp.eq.f32 %p9, %f44, 0f00000000; @%p9 bra BB0_7; setp.eq.f32 %p10, %f7, 0f00000000; selp.f32 %f45, %f1, %f4, %p10; selp.f32 %f46, %f2, %f5, %p10; mul.f32 %f47, %f2, %f46; fma.rn.f32 %f48, %f1, %f45, %f47; selp.f32 %f49, %f3, %f6, %p10; fma.rn.f32 %f50, %f3, %f49, %f48; abs.f32 %f51, %f50; mov.f32 %f52, 0f3F800000; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, 0f3F000000; sqrt.rn.f32 %f55, %f54; setp.gt.f32 %p11, %f51, 0f3F11EB85; selp.f32 %f56, %f55, %f51, %p11; mul.f32 %f57, %f56, %f56; mov.f32 %f58, 0f3C94D2E9; mov.f32 %f59, 0f3D53F941; fma.rn.f32 %f60, %f59, %f57, %f58; mov.f32 %f61, 0f3D3F841F; fma.rn.f32 %f62, %f60, %f57, %f61; mov.f32 %f63, 0f3D994929; fma.rn.f32 %f64, %f62, %f57, %f63; mov.f32 %f65, 0f3E2AAB94; fma.rn.f32 %f66, %f64, %f57, %f65; mul.f32 %f67, %f57, %f66; fma.rn.f32 %f68, %f67, %f56, %f56; add.f32 %f69, %f68, %f68; mov.f32 %f70, 0f3FC90FDB; sub.f32 %f71, %f70, %f68; selp.f32 %f72, %f69, %f71, %p11; setp.lt.f32 %p12, %f50, 0f00000000; mov.f32 %f73, 0f40490FDB; sub.f32 %f74, %f73, %f72; selp.f32 %f75, %f74, %f72, %p12; mov.f32 %f76, 0f00000000; max.f32 %f250, %f76, %f75; BB0_7: add.s32 %r12, %r1, 1; @%p7 bra BB0_9; rem.s32 %r58, %r12, %r32; add.s32 %r59, %r58, %r32; rem.s32 %r123, %r59, %r32; bra.uni BB0_10; BB0_9: 
add.s32 %r60, %r32, -1; min.s32 %r123, %r12, %r60; BB0_10: add.s32 %r61, %r123, %r5; cvt.s64.s32 %rd26, %r61; mul.wide.s32 %rd27, %r61, 4; add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f10, [%rd28]; ld.global.nc.f32 %f11, [%rd29]; mul.f32 %f77, %f11, %f11; fma.rn.f32 %f78, %f10, %f10, %f77; ld.global.nc.f32 %f12, [%rd30]; fma.rn.f32 %f13, %f12, %f12, %f78; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs9, [%rd31]; setp.gt.u16 %p14, %rs9, %rs1; cvt.u32.u16 %r62, %rs9; and.b32 %r63, %r62, 255; selp.b32 %r64, %r7, %r63, %p14; selp.b32 %r65, %r63, %r7, %p14; add.s32 %r66, %r65, 1; mul.lo.s32 %r67, %r66, %r65; shr.u32 %r68, %r67, 1; add.s32 %r69, %r68, %r64; mul.wide.s32 %rd32, %r69, 4; add.s64 %rd33, %rd1, %rd32; ld.global.nc.f32 %f79, [%rd33]; setp.eq.f32 %p15, %f79, 0f00000000; @%p15 bra BB0_12; setp.eq.f32 %p16, %f13, 0f00000000; selp.f32 %f80, %f1, %f10, %p16; selp.f32 %f81, %f2, %f11, %p16; mul.f32 %f82, %f2, %f81; fma.rn.f32 %f83, %f1, %f80, %f82; selp.f32 %f84, %f3, %f12, %p16; fma.rn.f32 %f85, %f3, %f84, %f83; abs.f32 %f86, %f85; mov.f32 %f87, 0f3F800000; sub.f32 %f88, %f87, %f86; mul.f32 %f89, %f88, 0f3F000000; sqrt.rn.f32 %f90, %f89; setp.gt.f32 %p17, %f86, 0f3F11EB85; selp.f32 %f91, %f90, %f86, %p17; mul.f32 %f92, %f91, %f91; mov.f32 %f93, 0f3C94D2E9; mov.f32 %f94, 0f3D53F941; fma.rn.f32 %f95, %f94, %f92, %f93; mov.f32 %f96, 0f3D3F841F; fma.rn.f32 %f97, %f95, %f92, %f96; mov.f32 %f98, 0f3D994929; fma.rn.f32 %f99, %f97, %f92, %f98; mov.f32 %f100, 0f3E2AAB94; fma.rn.f32 %f101, %f99, %f92, %f100; mul.f32 %f102, %f92, %f101; fma.rn.f32 %f103, %f102, %f91, %f91; add.f32 %f104, %f103, %f103; mov.f32 %f105, 0f3FC90FDB; sub.f32 %f106, %f105, %f103; selp.f32 %f107, %f104, %f106, %p17; setp.lt.f32 %p18, %f85, 0f00000000; mov.f32 %f108, 0f40490FDB; sub.f32 %f109, %f108, %f107; selp.f32 %f110, %f109, %f107, %p18; max.f32 %f250, %f250, %f110; BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r16, 
%r2, -1; @%p19 bra BB0_14; rem.s32 %r70, %r16, %r33; add.s32 %r71, %r70, %r33; rem.s32 %r124, %r71, %r33; bra.uni BB0_15; BB0_14: mov.u32 %r72, 0; max.s32 %r124, %r16, %r72; BB0_15: add.s32 %r73, %r124, %r4; mad.lo.s32 %r74, %r73, %r32, %r1; cvt.s64.s32 %rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, %rd3, %rd35; ld.global.nc.f32 %f16, [%rd36]; ld.global.nc.f32 %f17, [%rd37]; mul.f32 %f111, %f17, %f17; fma.rn.f32 %f112, %f16, %f16, %f111; ld.global.nc.f32 %f18, [%rd38]; fma.rn.f32 %f19, %f18, %f18, %f112; add.s64 %rd39, %rd2, %rd34; ld.global.nc.u8 %rs12, [%rd39]; setp.gt.u16 %p20, %rs12, %rs1; cvt.u32.u16 %r75, %rs12; and.b32 %r76, %r75, 255; selp.b32 %r77, %r7, %r76, %p20; selp.b32 %r78, %r76, %r7, %p20; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; add.s32 %r82, %r81, %r77; mul.wide.s32 %rd40, %r82, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f113, [%rd41]; setp.eq.f32 %p21, %f113, 0f00000000; @%p21 bra BB0_17; setp.eq.f32 %p22, %f19, 0f00000000; selp.f32 %f114, %f1, %f16, %p22; selp.f32 %f115, %f2, %f17, %p22; mul.f32 %f116, %f2, %f115; fma.rn.f32 %f117, %f1, %f114, %f116; selp.f32 %f118, %f3, %f18, %p22; fma.rn.f32 %f119, %f3, %f118, %f117; abs.f32 %f120, %f119; mov.f32 %f121, 0f3F800000; sub.f32 %f122, %f121, %f120; mul.f32 %f123, %f122, 0f3F000000; sqrt.rn.f32 %f124, %f123; setp.gt.f32 %p23, %f120, 0f3F11EB85; selp.f32 %f125, %f124, %f120, %p23; mul.f32 %f126, %f125, %f125; mov.f32 %f127, 0f3C94D2E9; mov.f32 %f128, 0f3D53F941; fma.rn.f32 %f129, %f128, %f126, %f127; mov.f32 %f130, 0f3D3F841F; fma.rn.f32 %f131, %f129, %f126, %f130; mov.f32 %f132, 0f3D994929; fma.rn.f32 %f133, %f131, %f126, %f132; mov.f32 %f134, 0f3E2AAB94; fma.rn.f32 %f135, %f133, %f126, %f134; mul.f32 %f136, %f126, %f135; fma.rn.f32 %f137, %f136, %f125, %f125; add.f32 %f138, %f137, %f137; mov.f32 %f139, 0f3FC90FDB; sub.f32 %f140, %f139, %f137; selp.f32 %f141, %f138, %f140, %p23; setp.lt.f32 
%p24, %f119, 0f00000000; mov.f32 %f142, 0f40490FDB; sub.f32 %f143, %f142, %f141; selp.f32 %f144, %f143, %f141, %p24; max.f32 %f250, %f250, %f144; BB0_17: add.s32 %r20, %r2, 1; @%p19 bra BB0_19; rem.s32 %r83, %r20, %r33; add.s32 %r84, %r83, %r33; rem.s32 %r125, %r84, %r33; bra.uni BB0_20; BB0_19: add.s32 %r85, %r33, -1; min.s32 %r125, %r20, %r85; BB0_20: add.s32 %r86, %r125, %r4; mad.lo.s32 %r87, %r86, %r32, %r1; cvt.s64.s32 %rd42, %r87; mul.wide.s32 %rd43, %r87, 4; add.s64 %rd44, %rd5, %rd43; add.s64 %rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f22, [%rd44]; ld.global.nc.f32 %f23, [%rd45]; mul.f32 %f145, %f23, %f23; fma.rn.f32 %f146, %f22, %f22, %f145; ld.global.nc.f32 %f24, [%rd46]; fma.rn.f32 %f25, %f24, %f24, %f146; add.s64 %rd47, %rd2, %rd42; ld.global.nc.u8 %rs16, [%rd47]; setp.gt.u16 %p26, %rs16, %rs1; cvt.u32.u16 %r88, %rs16; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p26; selp.b32 %r91, %r89, %r7, %p26; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f147, [%rd49]; setp.eq.f32 %p27, %f147, 0f00000000; @%p27 bra BB0_22; setp.eq.f32 %p28, %f25, 0f00000000; selp.f32 %f148, %f1, %f22, %p28; selp.f32 %f149, %f2, %f23, %p28; mul.f32 %f150, %f2, %f149; fma.rn.f32 %f151, %f1, %f148, %f150; selp.f32 %f152, %f3, %f24, %p28; fma.rn.f32 %f153, %f3, %f152, %f151; abs.f32 %f154, %f153; mov.f32 %f155, 0f3F800000; sub.f32 %f156, %f155, %f154; mul.f32 %f157, %f156, 0f3F000000; sqrt.rn.f32 %f158, %f157; setp.gt.f32 %p29, %f154, 0f3F11EB85; selp.f32 %f159, %f158, %f154, %p29; mul.f32 %f160, %f159, %f159; mov.f32 %f161, 0f3C94D2E9; mov.f32 %f162, 0f3D53F941; fma.rn.f32 %f163, %f162, %f160, %f161; mov.f32 %f164, 0f3D3F841F; fma.rn.f32 %f165, %f163, %f160, %f164; mov.f32 %f166, 0f3D994929; fma.rn.f32 %f167, %f165, %f160, %f166; mov.f32 %f168, 0f3E2AAB94; fma.rn.f32 %f169, %f167, %f160, %f168; mul.f32 %f170, %f160, %f169; 
fma.rn.f32 %f171, %f170, %f159, %f159; add.f32 %f172, %f171, %f171; mov.f32 %f173, 0f3FC90FDB; sub.f32 %f174, %f173, %f171; selp.f32 %f175, %f172, %f174, %p29; setp.lt.f32 %p30, %f153, 0f00000000; mov.f32 %f176, 0f40490FDB; sub.f32 %f177, %f176, %f175; selp.f32 %f178, %f177, %f175, %p30; max.f32 %f250, %f250, %f178; BB0_22: setp.eq.s32 %p31, %r34, 1; @%p31 bra BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 0; add.s32 %r24, %r3, -1; @%p32 bra BB0_25; rem.s32 %r96, %r24, %r34; add.s32 %r97, %r96, %r34; rem.s32 %r126, %r97, %r34; bra.uni BB0_26; BB0_25: mov.u32 %r98, 0; max.s32 %r126, %r24, %r98; BB0_26: mad.lo.s32 %r99, %r126, %r33, %r2; mad.lo.s32 %r100, %r99, %r32, %r1; cvt.s64.s32 %rd50, %r100; mul.wide.s32 %rd51, %r100, 4; add.s64 %rd52, %rd5, %rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; ld.global.nc.f32 %f179, [%rd52]; ld.global.nc.f32 %f180, [%rd53]; mul.f32 %f181, %f180, %f180; fma.rn.f32 %f182, %f179, %f179, %f181; ld.global.nc.f32 %f183, [%rd54]; fma.rn.f32 %f184, %f183, %f183, %f182; setp.eq.f32 %p33, %f184, 0f00000000; selp.f32 %f28, %f1, %f179, %p33; selp.f32 %f29, %f2, %f180, %p33; selp.f32 %f30, %f3, %f183, %p33; add.s64 %rd55, %rd2, %rd50; ld.global.nc.u8 %rs19, [%rd55]; setp.gt.u16 %p34, %rs19, %rs1; cvt.u32.u16 %r101, %rs19; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p34; selp.b32 %r104, %r102, %r7, %p34; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd56, %r108, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f185, [%rd57]; setp.eq.f32 %p35, %f185, 0f00000000; @%p35 bra BB0_28; mul.f32 %f186, %f2, %f29; fma.rn.f32 %f187, %f1, %f28, %f186; fma.rn.f32 %f188, %f3, %f30, %f187; abs.f32 %f189, %f188; mov.f32 %f190, 0f3F800000; sub.f32 %f191, %f190, %f189; mul.f32 %f192, %f191, 0f3F000000; sqrt.rn.f32 %f193, %f192; setp.gt.f32 %p36, %f189, 0f3F11EB85; selp.f32 %f194, %f193, %f189, %p36; mul.f32 %f195, %f194, %f194; mov.f32 %f196, 
0f3C94D2E9; mov.f32 %f197, 0f3D53F941; fma.rn.f32 %f198, %f197, %f195, %f196; mov.f32 %f199, 0f3D3F841F; fma.rn.f32 %f200, %f198, %f195, %f199; mov.f32 %f201, 0f3D994929; fma.rn.f32 %f202, %f200, %f195, %f201; mov.f32 %f203, 0f3E2AAB94; fma.rn.f32 %f204, %f202, %f195, %f203; mul.f32 %f205, %f195, %f204; fma.rn.f32 %f206, %f205, %f194, %f194; add.f32 %f207, %f206, %f206; mov.f32 %f208, 0f3FC90FDB; sub.f32 %f209, %f208, %f206; selp.f32 %f210, %f207, %f209, %p36; setp.lt.f32 %p37, %f188, 0f00000000; mov.f32 %f211, 0f40490FDB; sub.f32 %f212, %f211, %f210; selp.f32 %f213, %f212, %f210, %p37; max.f32 %f250, %f250, %f213; BB0_28: add.s32 %r28, %r3, 1; @%p32 bra BB0_30; rem.s32 %r109, %r28, %r34; add.s32 %r110, %r109, %r34; rem.s32 %r127, %r110, %r34; bra.uni BB0_31; BB0_30: add.s32 %r111, %r34, -1; min.s32 %r127, %r28, %r111; BB0_31: mad.lo.s32 %r112, %r127, %r33, %r2; mad.lo.s32 %r113, %r112, %r32, %r1; cvt.s64.s32 %rd58, %r113; mul.wide.s32 %rd59, %r113, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; ld.global.nc.f32 %f214, [%rd60]; ld.global.nc.f32 %f215, [%rd61]; mul.f32 %f216, %f215, %f215; fma.rn.f32 %f217, %f214, %f214, %f216; ld.global.nc.f32 %f218, [%rd62]; fma.rn.f32 %f219, %f218, %f218, %f217; setp.eq.f32 %p39, %f219, 0f00000000; selp.f32 %f33, %f1, %f214, %p39; selp.f32 %f34, %f2, %f215, %p39; selp.f32 %f35, %f3, %f218, %p39; add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs23, [%rd63]; setp.gt.u16 %p40, %rs23, %rs1; cvt.u32.u16 %r114, %rs23; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p40; selp.b32 %r117, %r115, %r7, %p40; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd64, %r121, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 %f220, [%rd65]; setp.eq.f32 %p41, %f220, 0f00000000; @%p41 bra BB0_33; mul.f32 %f221, %f2, %f34; fma.rn.f32 %f222, %f1, %f33, %f221; fma.rn.f32 %f223, %f3, %f35, %f222; abs.f32 %f224, %f223; mov.f32 %f225, 
0f3F800000; sub.f32 %f226, %f225, %f224; mul.f32 %f227, %f226, 0f3F000000; sqrt.rn.f32 %f228, %f227; setp.gt.f32 %p42, %f224, 0f3F11EB85; selp.f32 %f229, %f228, %f224, %p42; mul.f32 %f230, %f229, %f229; mov.f32 %f231, 0f3C94D2E9; mov.f32 %f232, 0f3D53F941; fma.rn.f32 %f233, %f232, %f230, %f231; mov.f32 %f234, 0f3D3F841F; fma.rn.f32 %f235, %f233, %f230, %f234; mov.f32 %f236, 0f3D994929; fma.rn.f32 %f237, %f235, %f230, %f236; mov.f32 %f238, 0f3E2AAB94; fma.rn.f32 %f239, %f237, %f230, %f238; mul.f32 %f240, %f230, %f239; fma.rn.f32 %f241, %f240, %f229, %f229; add.f32 %f242, %f241, %f241; mov.f32 %f243, 0f3FC90FDB; sub.f32 %f244, %f243, %f241; selp.f32 %f245, %f242, %f244, %p42; setp.lt.f32 %p43, %f223, 0f00000000; mov.f32 %f246, 0f40490FDB; sub.f32 %f247, %f246, %f245; selp.f32 %f248, %f247, %f245, %p43; max.f32 %f250, %f250, %f248; BB0_33: cvta.to.global.u64 %rd66, %rd6; add.s64 %rd68, %rd66, %rd12; st.global.f32 [%rd68], %f250; BB0_34: ret; } ` setmaxangle_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl setmaxangle .visible .entry setmaxangle( .param .u64 setmaxangle_param_0, .param .u64 setmaxangle_param_1, .param .u64 setmaxangle_param_2, .param .u64 setmaxangle_param_3, .param .u64 setmaxangle_param_4, .param .u64 setmaxangle_param_5, .param .u32 setmaxangle_param_6, .param .u32 setmaxangle_param_7, .param .u32 setmaxangle_param_8, .param .u8 setmaxangle_param_9 ) { .reg .pred %p<44>; .reg .b16 %rs<26>; .reg .f32 %f<255>; .reg .b32 %r<128>; .reg .b64 %rd<69>; ld.param.u64 %rd6, [setmaxangle_param_0]; ld.param.u64 %rd7, [setmaxangle_param_1]; ld.param.u64 %rd8, [setmaxangle_param_2]; ld.param.u64 %rd9, [setmaxangle_param_3]; ld.param.u64 %rd10, [setmaxangle_param_4]; ld.param.u64 %rd11, [setmaxangle_param_5]; ld.param.u32 %r32, [setmaxangle_param_6]; ld.param.u32 %r33, [setmaxangle_param_7]; ld.param.u32 %r34, [setmaxangle_param_8]; ld.param.u8 %rs5, [setmaxangle_param_9]; cvta.to.global.u64 %rd1, %rd10; cvta.to.global.u64 %rd2, %rd11; 
cvta.to.global.u64 %rd3, %rd9; cvta.to.global.u64 %rd4, %rd8; cvta.to.global.u64 %rd5, %rd7; mov.u32 %r35, %ntid.x; mov.u32 %r36, %ctaid.x; mov.u32 %r37, %tid.x; mad.lo.s32 %r1, %r35, %r36, %r37; mov.u32 %r38, %ntid.y; mov.u32 %r39, %ctaid.y; mov.u32 %r40, %tid.y; mad.lo.s32 %r2, %r38, %r39, %r40; mov.u32 %r41, %ntid.z; mov.u32 %r42, %ctaid.z; mov.u32 %r43, %tid.z; mad.lo.s32 %r3, %r41, %r42, %r43; setp.ge.s32 %p1, %r2, %r33; setp.ge.s32 %p2, %r1, %r32; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r34; or.pred %p5, %p3, %p4; @%p5 bra BB0_34; mul.lo.s32 %r4, %r3, %r33; add.s32 %r44, %r4, %r2; mul.lo.s32 %r5, %r44, %r32; add.s32 %r6, %r5, %r1; mul.wide.s32 %rd12, %r6, 4; add.s64 %rd13, %rd5, %rd12; add.s64 %rd14, %rd4, %rd12; add.s64 %rd15, %rd3, %rd12; ld.global.nc.f32 %f1, [%rd13]; ld.global.nc.f32 %f2, [%rd14]; mul.f32 %f38, %f2, %f2; fma.rn.f32 %f39, %f1, %f1, %f38; ld.global.nc.f32 %f3, [%rd15]; fma.rn.f32 %f40, %f3, %f3, %f39; setp.eq.f32 %p6, %f40, 0f00000000; @%p6 bra BB0_34; cvt.s64.s32 %rd16, %r6; add.s64 %rd17, %rd2, %rd16; ld.global.nc.u8 %rs1, [%rd17]; cvt.u32.u16 %r45, %rs1; and.b32 %r7, %r45, 255; and.b16 %rs2, %rs5, 1; setp.eq.s16 %p7, %rs2, 0; add.s32 %r8, %r1, -1; @%p7 bra BB0_4; rem.s32 %r46, %r8, %r32; add.s32 %r47, %r46, %r32; rem.s32 %r122, %r47, %r32; bra.uni BB0_5; BB0_4: mov.u32 %r48, 0; max.s32 %r122, %r8, %r48; BB0_5: add.s32 %r49, %r122, %r5; cvt.s64.s32 %rd18, %r49; mul.wide.s32 %rd19, %r49, 4; add.s64 %rd20, %rd5, %rd19; add.s64 %rd21, %rd4, %rd19; add.s64 %rd22, %rd3, %rd19; ld.global.nc.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd21]; mul.f32 %f42, %f5, %f5; fma.rn.f32 %f43, %f4, %f4, %f42; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f43; add.s64 %rd23, %rd2, %rd18; ld.global.nc.u8 %rs6, [%rd23]; setp.gt.u16 %p8, %rs6, %rs1; cvt.u32.u16 %r50, %rs6; and.b32 %r51, %r50, 255; selp.b32 %r52, %r7, %r51, %p8; selp.b32 %r53, %r51, %r7, %p8; add.s32 %r54, %r53, 1; mul.lo.s32 %r55, %r54, %r53; shr.u32 %r56, %r55, 1; add.s32 %r57, 
%r56, %r52; mul.wide.s32 %rd24, %r57, 4; add.s64 %rd25, %rd1, %rd24; ld.global.nc.f32 %f44, [%rd25]; mov.f32 %f250, 0f00000000; setp.eq.f32 %p9, %f44, 0f00000000; @%p9 bra BB0_7; setp.eq.f32 %p10, %f7, 0f00000000; selp.f32 %f45, %f1, %f4, %p10; selp.f32 %f46, %f2, %f5, %p10; mul.f32 %f47, %f2, %f46; fma.rn.f32 %f48, %f1, %f45, %f47; selp.f32 %f49, %f3, %f6, %p10; fma.rn.f32 %f50, %f3, %f49, %f48; abs.f32 %f51, %f50; mov.f32 %f52, 0f3F800000; sub.f32 %f53, %f52, %f51; mul.f32 %f54, %f53, 0f3F000000; sqrt.rn.f32 %f55, %f54; setp.gt.f32 %p11, %f51, 0f3F11EB85; selp.f32 %f56, %f55, %f51, %p11; mul.f32 %f57, %f56, %f56; mov.f32 %f58, 0f3C94D2E9; mov.f32 %f59, 0f3D53F941; fma.rn.f32 %f60, %f59, %f57, %f58; mov.f32 %f61, 0f3D3F841F; fma.rn.f32 %f62, %f60, %f57, %f61; mov.f32 %f63, 0f3D994929; fma.rn.f32 %f64, %f62, %f57, %f63; mov.f32 %f65, 0f3E2AAB94; fma.rn.f32 %f66, %f64, %f57, %f65; mul.f32 %f67, %f57, %f66; fma.rn.f32 %f68, %f67, %f56, %f56; add.f32 %f69, %f68, %f68; mov.f32 %f70, 0f3FC90FDB; sub.f32 %f71, %f70, %f68; selp.f32 %f72, %f69, %f71, %p11; setp.lt.f32 %p12, %f50, 0f00000000; mov.f32 %f73, 0f40490FDB; sub.f32 %f74, %f73, %f72; selp.f32 %f75, %f74, %f72, %p12; mov.f32 %f76, 0f00000000; max.f32 %f250, %f76, %f75; BB0_7: add.s32 %r12, %r1, 1; @%p7 bra BB0_9; rem.s32 %r58, %r12, %r32; add.s32 %r59, %r58, %r32; rem.s32 %r123, %r59, %r32; bra.uni BB0_10; BB0_9: add.s32 %r60, %r32, -1; min.s32 %r123, %r12, %r60; BB0_10: add.s32 %r61, %r123, %r5; cvt.s64.s32 %rd26, %r61; mul.wide.s32 %rd27, %r61, 4; add.s64 %rd28, %rd5, %rd27; add.s64 %rd29, %rd4, %rd27; add.s64 %rd30, %rd3, %rd27; ld.global.nc.f32 %f10, [%rd28]; ld.global.nc.f32 %f11, [%rd29]; mul.f32 %f77, %f11, %f11; fma.rn.f32 %f78, %f10, %f10, %f77; ld.global.nc.f32 %f12, [%rd30]; fma.rn.f32 %f13, %f12, %f12, %f78; add.s64 %rd31, %rd2, %rd26; ld.global.nc.u8 %rs9, [%rd31]; setp.gt.u16 %p14, %rs9, %rs1; cvt.u32.u16 %r62, %rs9; and.b32 %r63, %r62, 255; selp.b32 %r64, %r7, %r63, %p14; selp.b32 %r65, %r63, %r7, 
%p14; add.s32 %r66, %r65, 1; mul.lo.s32 %r67, %r66, %r65; shr.u32 %r68, %r67, 1; add.s32 %r69, %r68, %r64; mul.wide.s32 %rd32, %r69, 4; add.s64 %rd33, %rd1, %rd32; ld.global.nc.f32 %f79, [%rd33]; setp.eq.f32 %p15, %f79, 0f00000000; @%p15 bra BB0_12; setp.eq.f32 %p16, %f13, 0f00000000; selp.f32 %f80, %f1, %f10, %p16; selp.f32 %f81, %f2, %f11, %p16; mul.f32 %f82, %f2, %f81; fma.rn.f32 %f83, %f1, %f80, %f82; selp.f32 %f84, %f3, %f12, %p16; fma.rn.f32 %f85, %f3, %f84, %f83; abs.f32 %f86, %f85; mov.f32 %f87, 0f3F800000; sub.f32 %f88, %f87, %f86; mul.f32 %f89, %f88, 0f3F000000; sqrt.rn.f32 %f90, %f89; setp.gt.f32 %p17, %f86, 0f3F11EB85; selp.f32 %f91, %f90, %f86, %p17; mul.f32 %f92, %f91, %f91; mov.f32 %f93, 0f3C94D2E9; mov.f32 %f94, 0f3D53F941; fma.rn.f32 %f95, %f94, %f92, %f93; mov.f32 %f96, 0f3D3F841F; fma.rn.f32 %f97, %f95, %f92, %f96; mov.f32 %f98, 0f3D994929; fma.rn.f32 %f99, %f97, %f92, %f98; mov.f32 %f100, 0f3E2AAB94; fma.rn.f32 %f101, %f99, %f92, %f100; mul.f32 %f102, %f92, %f101; fma.rn.f32 %f103, %f102, %f91, %f91; add.f32 %f104, %f103, %f103; mov.f32 %f105, 0f3FC90FDB; sub.f32 %f106, %f105, %f103; selp.f32 %f107, %f104, %f106, %p17; setp.lt.f32 %p18, %f85, 0f00000000; mov.f32 %f108, 0f40490FDB; sub.f32 %f109, %f108, %f107; selp.f32 %f110, %f109, %f107, %p18; max.f32 %f250, %f250, %f110; BB0_12: and.b16 %rs3, %rs5, 2; setp.eq.s16 %p19, %rs3, 0; add.s32 %r16, %r2, -1; @%p19 bra BB0_14; rem.s32 %r70, %r16, %r33; add.s32 %r71, %r70, %r33; rem.s32 %r124, %r71, %r33; bra.uni BB0_15; BB0_14: mov.u32 %r72, 0; max.s32 %r124, %r16, %r72; BB0_15: add.s32 %r73, %r124, %r4; mad.lo.s32 %r74, %r73, %r32, %r1; cvt.s64.s32 %rd34, %r74; mul.wide.s32 %rd35, %r74, 4; add.s64 %rd36, %rd5, %rd35; add.s64 %rd37, %rd4, %rd35; add.s64 %rd38, %rd3, %rd35; ld.global.nc.f32 %f16, [%rd36]; ld.global.nc.f32 %f17, [%rd37]; mul.f32 %f111, %f17, %f17; fma.rn.f32 %f112, %f16, %f16, %f111; ld.global.nc.f32 %f18, [%rd38]; fma.rn.f32 %f19, %f18, %f18, %f112; add.s64 %rd39, %rd2, %rd34; 
ld.global.nc.u8 %rs12, [%rd39]; setp.gt.u16 %p20, %rs12, %rs1; cvt.u32.u16 %r75, %rs12; and.b32 %r76, %r75, 255; selp.b32 %r77, %r7, %r76, %p20; selp.b32 %r78, %r76, %r7, %p20; add.s32 %r79, %r78, 1; mul.lo.s32 %r80, %r79, %r78; shr.u32 %r81, %r80, 1; add.s32 %r82, %r81, %r77; mul.wide.s32 %rd40, %r82, 4; add.s64 %rd41, %rd1, %rd40; ld.global.nc.f32 %f113, [%rd41]; setp.eq.f32 %p21, %f113, 0f00000000; @%p21 bra BB0_17; setp.eq.f32 %p22, %f19, 0f00000000; selp.f32 %f114, %f1, %f16, %p22; selp.f32 %f115, %f2, %f17, %p22; mul.f32 %f116, %f2, %f115; fma.rn.f32 %f117, %f1, %f114, %f116; selp.f32 %f118, %f3, %f18, %p22; fma.rn.f32 %f119, %f3, %f118, %f117; abs.f32 %f120, %f119; mov.f32 %f121, 0f3F800000; sub.f32 %f122, %f121, %f120; mul.f32 %f123, %f122, 0f3F000000; sqrt.rn.f32 %f124, %f123; setp.gt.f32 %p23, %f120, 0f3F11EB85; selp.f32 %f125, %f124, %f120, %p23; mul.f32 %f126, %f125, %f125; mov.f32 %f127, 0f3C94D2E9; mov.f32 %f128, 0f3D53F941; fma.rn.f32 %f129, %f128, %f126, %f127; mov.f32 %f130, 0f3D3F841F; fma.rn.f32 %f131, %f129, %f126, %f130; mov.f32 %f132, 0f3D994929; fma.rn.f32 %f133, %f131, %f126, %f132; mov.f32 %f134, 0f3E2AAB94; fma.rn.f32 %f135, %f133, %f126, %f134; mul.f32 %f136, %f126, %f135; fma.rn.f32 %f137, %f136, %f125, %f125; add.f32 %f138, %f137, %f137; mov.f32 %f139, 0f3FC90FDB; sub.f32 %f140, %f139, %f137; selp.f32 %f141, %f138, %f140, %p23; setp.lt.f32 %p24, %f119, 0f00000000; mov.f32 %f142, 0f40490FDB; sub.f32 %f143, %f142, %f141; selp.f32 %f144, %f143, %f141, %p24; max.f32 %f250, %f250, %f144; BB0_17: add.s32 %r20, %r2, 1; @%p19 bra BB0_19; rem.s32 %r83, %r20, %r33; add.s32 %r84, %r83, %r33; rem.s32 %r125, %r84, %r33; bra.uni BB0_20; BB0_19: add.s32 %r85, %r33, -1; min.s32 %r125, %r20, %r85; BB0_20: add.s32 %r86, %r125, %r4; mad.lo.s32 %r87, %r86, %r32, %r1; cvt.s64.s32 %rd42, %r87; mul.wide.s32 %rd43, %r87, 4; add.s64 %rd44, %rd5, %rd43; add.s64 %rd45, %rd4, %rd43; add.s64 %rd46, %rd3, %rd43; ld.global.nc.f32 %f22, [%rd44]; ld.global.nc.f32 %f23, 
[%rd45]; mul.f32 %f145, %f23, %f23; fma.rn.f32 %f146, %f22, %f22, %f145; ld.global.nc.f32 %f24, [%rd46]; fma.rn.f32 %f25, %f24, %f24, %f146; add.s64 %rd47, %rd2, %rd42; ld.global.nc.u8 %rs16, [%rd47]; setp.gt.u16 %p26, %rs16, %rs1; cvt.u32.u16 %r88, %rs16; and.b32 %r89, %r88, 255; selp.b32 %r90, %r7, %r89, %p26; selp.b32 %r91, %r89, %r7, %p26; add.s32 %r92, %r91, 1; mul.lo.s32 %r93, %r92, %r91; shr.u32 %r94, %r93, 1; add.s32 %r95, %r94, %r90; mul.wide.s32 %rd48, %r95, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f147, [%rd49]; setp.eq.f32 %p27, %f147, 0f00000000; @%p27 bra BB0_22; setp.eq.f32 %p28, %f25, 0f00000000; selp.f32 %f148, %f1, %f22, %p28; selp.f32 %f149, %f2, %f23, %p28; mul.f32 %f150, %f2, %f149; fma.rn.f32 %f151, %f1, %f148, %f150; selp.f32 %f152, %f3, %f24, %p28; fma.rn.f32 %f153, %f3, %f152, %f151; abs.f32 %f154, %f153; mov.f32 %f155, 0f3F800000; sub.f32 %f156, %f155, %f154; mul.f32 %f157, %f156, 0f3F000000; sqrt.rn.f32 %f158, %f157; setp.gt.f32 %p29, %f154, 0f3F11EB85; selp.f32 %f159, %f158, %f154, %p29; mul.f32 %f160, %f159, %f159; mov.f32 %f161, 0f3C94D2E9; mov.f32 %f162, 0f3D53F941; fma.rn.f32 %f163, %f162, %f160, %f161; mov.f32 %f164, 0f3D3F841F; fma.rn.f32 %f165, %f163, %f160, %f164; mov.f32 %f166, 0f3D994929; fma.rn.f32 %f167, %f165, %f160, %f166; mov.f32 %f168, 0f3E2AAB94; fma.rn.f32 %f169, %f167, %f160, %f168; mul.f32 %f170, %f160, %f169; fma.rn.f32 %f171, %f170, %f159, %f159; add.f32 %f172, %f171, %f171; mov.f32 %f173, 0f3FC90FDB; sub.f32 %f174, %f173, %f171; selp.f32 %f175, %f172, %f174, %p29; setp.lt.f32 %p30, %f153, 0f00000000; mov.f32 %f176, 0f40490FDB; sub.f32 %f177, %f176, %f175; selp.f32 %f178, %f177, %f175, %p30; max.f32 %f250, %f250, %f178; BB0_22: setp.eq.s32 %p31, %r34, 1; @%p31 bra BB0_33; and.b16 %rs4, %rs5, 4; setp.eq.s16 %p32, %rs4, 0; add.s32 %r24, %r3, -1; @%p32 bra BB0_25; rem.s32 %r96, %r24, %r34; add.s32 %r97, %r96, %r34; rem.s32 %r126, %r97, %r34; bra.uni BB0_26; BB0_25: mov.u32 %r98, 0; max.s32 %r126, %r24, %r98; 
BB0_26: mad.lo.s32 %r99, %r126, %r33, %r2; mad.lo.s32 %r100, %r99, %r32, %r1; cvt.s64.s32 %rd50, %r100; mul.wide.s32 %rd51, %r100, 4; add.s64 %rd52, %rd5, %rd51; add.s64 %rd53, %rd4, %rd51; add.s64 %rd54, %rd3, %rd51; ld.global.nc.f32 %f179, [%rd52]; ld.global.nc.f32 %f180, [%rd53]; mul.f32 %f181, %f180, %f180; fma.rn.f32 %f182, %f179, %f179, %f181; ld.global.nc.f32 %f183, [%rd54]; fma.rn.f32 %f184, %f183, %f183, %f182; setp.eq.f32 %p33, %f184, 0f00000000; selp.f32 %f28, %f1, %f179, %p33; selp.f32 %f29, %f2, %f180, %p33; selp.f32 %f30, %f3, %f183, %p33; add.s64 %rd55, %rd2, %rd50; ld.global.nc.u8 %rs19, [%rd55]; setp.gt.u16 %p34, %rs19, %rs1; cvt.u32.u16 %r101, %rs19; and.b32 %r102, %r101, 255; selp.b32 %r103, %r7, %r102, %p34; selp.b32 %r104, %r102, %r7, %p34; add.s32 %r105, %r104, 1; mul.lo.s32 %r106, %r105, %r104; shr.u32 %r107, %r106, 1; add.s32 %r108, %r107, %r103; mul.wide.s32 %rd56, %r108, 4; add.s64 %rd57, %rd1, %rd56; ld.global.nc.f32 %f185, [%rd57]; setp.eq.f32 %p35, %f185, 0f00000000; @%p35 bra BB0_28; mul.f32 %f186, %f2, %f29; fma.rn.f32 %f187, %f1, %f28, %f186; fma.rn.f32 %f188, %f3, %f30, %f187; abs.f32 %f189, %f188; mov.f32 %f190, 0f3F800000; sub.f32 %f191, %f190, %f189; mul.f32 %f192, %f191, 0f3F000000; sqrt.rn.f32 %f193, %f192; setp.gt.f32 %p36, %f189, 0f3F11EB85; selp.f32 %f194, %f193, %f189, %p36; mul.f32 %f195, %f194, %f194; mov.f32 %f196, 0f3C94D2E9; mov.f32 %f197, 0f3D53F941; fma.rn.f32 %f198, %f197, %f195, %f196; mov.f32 %f199, 0f3D3F841F; fma.rn.f32 %f200, %f198, %f195, %f199; mov.f32 %f201, 0f3D994929; fma.rn.f32 %f202, %f200, %f195, %f201; mov.f32 %f203, 0f3E2AAB94; fma.rn.f32 %f204, %f202, %f195, %f203; mul.f32 %f205, %f195, %f204; fma.rn.f32 %f206, %f205, %f194, %f194; add.f32 %f207, %f206, %f206; mov.f32 %f208, 0f3FC90FDB; sub.f32 %f209, %f208, %f206; selp.f32 %f210, %f207, %f209, %p36; setp.lt.f32 %p37, %f188, 0f00000000; mov.f32 %f211, 0f40490FDB; sub.f32 %f212, %f211, %f210; selp.f32 %f213, %f212, %f210, %p37; max.f32 %f250, %f250, 
%f213; BB0_28: add.s32 %r28, %r3, 1; @%p32 bra BB0_30; rem.s32 %r109, %r28, %r34; add.s32 %r110, %r109, %r34; rem.s32 %r127, %r110, %r34; bra.uni BB0_31; BB0_30: add.s32 %r111, %r34, -1; min.s32 %r127, %r28, %r111; BB0_31: mad.lo.s32 %r112, %r127, %r33, %r2; mad.lo.s32 %r113, %r112, %r32, %r1; cvt.s64.s32 %rd58, %r113; mul.wide.s32 %rd59, %r113, 4; add.s64 %rd60, %rd5, %rd59; add.s64 %rd61, %rd4, %rd59; add.s64 %rd62, %rd3, %rd59; ld.global.nc.f32 %f214, [%rd60]; ld.global.nc.f32 %f215, [%rd61]; mul.f32 %f216, %f215, %f215; fma.rn.f32 %f217, %f214, %f214, %f216; ld.global.nc.f32 %f218, [%rd62]; fma.rn.f32 %f219, %f218, %f218, %f217; setp.eq.f32 %p39, %f219, 0f00000000; selp.f32 %f33, %f1, %f214, %p39; selp.f32 %f34, %f2, %f215, %p39; selp.f32 %f35, %f3, %f218, %p39; add.s64 %rd63, %rd2, %rd58; ld.global.nc.u8 %rs23, [%rd63]; setp.gt.u16 %p40, %rs23, %rs1; cvt.u32.u16 %r114, %rs23; and.b32 %r115, %r114, 255; selp.b32 %r116, %r7, %r115, %p40; selp.b32 %r117, %r115, %r7, %p40; add.s32 %r118, %r117, 1; mul.lo.s32 %r119, %r118, %r117; shr.u32 %r120, %r119, 1; add.s32 %r121, %r120, %r116; mul.wide.s32 %rd64, %r121, 4; add.s64 %rd65, %rd1, %rd64; ld.global.nc.f32 %f220, [%rd65]; setp.eq.f32 %p41, %f220, 0f00000000; @%p41 bra BB0_33; mul.f32 %f221, %f2, %f34; fma.rn.f32 %f222, %f1, %f33, %f221; fma.rn.f32 %f223, %f3, %f35, %f222; abs.f32 %f224, %f223; mov.f32 %f225, 0f3F800000; sub.f32 %f226, %f225, %f224; mul.f32 %f227, %f226, 0f3F000000; sqrt.rn.f32 %f228, %f227; setp.gt.f32 %p42, %f224, 0f3F11EB85; selp.f32 %f229, %f228, %f224, %p42; mul.f32 %f230, %f229, %f229; mov.f32 %f231, 0f3C94D2E9; mov.f32 %f232, 0f3D53F941; fma.rn.f32 %f233, %f232, %f230, %f231; mov.f32 %f234, 0f3D3F841F; fma.rn.f32 %f235, %f233, %f230, %f234; mov.f32 %f236, 0f3D994929; fma.rn.f32 %f237, %f235, %f230, %f236; mov.f32 %f238, 0f3E2AAB94; fma.rn.f32 %f239, %f237, %f230, %f238; mul.f32 %f240, %f230, %f239; fma.rn.f32 %f241, %f240, %f229, %f229; add.f32 %f242, %f241, %f241; mov.f32 %f243, 0f3FC90FDB; 
sub.f32 %f244, %f243, %f241; selp.f32 %f245, %f242, %f244, %p42; setp.lt.f32 %p43, %f223, 0f00000000; mov.f32 %f246, 0f40490FDB; sub.f32 %f247, %f246, %f245; selp.f32 %f248, %f247, %f245, %p43; max.f32 %f250, %f250, %f248; BB0_33: cvta.to.global.u64 %rd66, %rd6; add.s64 %rd68, %rd66, %rd12; st.global.f32 [%rd68], %f250; BB0_34: ret; } ` ) mumax3-3.10/cuda/minimize.cu000066400000000000000000000014531371432437400157500ustar00rootroot00000000000000#include #include "float3.h" // Steepest descent energy minimizer extern "C" __global__ void minimize(float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz, float* __restrict__ m0x, float* __restrict__ m0y, float* __restrict__ m0z, float* __restrict__ tx, float* __restrict__ ty, float* __restrict__ tz, float dt, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { float3 m0 = {m0x[i], m0y[i], m0z[i]}; float3 t = {tx[i], ty[i], tz[i]}; float t2 = dt*dt*dot(t, t); float3 result = (4 - t2) * m0 + 4 * dt * t; float divisor = 4 + t2; mx[i] = result.x / divisor; my[i] = result.y / divisor; mz[i] = result.z / divisor; } } mumax3-3.10/cuda/minimize.go000066400000000000000000000006711371432437400157470ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/data" ) // m = 1 / (4 + τ²(m x H)²) [{4 - τ²(m x H)²} m - 4τ(m x m x H)] // note: torque from LLNoPrecess has negative sign func Minimize(m, m0, torque *data.Slice, dt float32) { N := m.Len() cfg := make1DConf(N) k_minimize_async(m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z), m0.DevPtr(X), m0.DevPtr(Y), m0.DevPtr(Z), torque.DevPtr(X), torque.DevPtr(Y), torque.DevPtr(Z), dt, N, cfg) } mumax3-3.10/cuda/minimize_wrapper.go000066400000000000000000001123031371432437400175030ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for minimize kernel var minimize_code cu.Function // Stores the arguments for minimize kernel invocation type minimize_args_t struct { arg_mx unsafe.Pointer arg_my unsafe.Pointer arg_mz unsafe.Pointer arg_m0x unsafe.Pointer arg_m0y unsafe.Pointer arg_m0z unsafe.Pointer arg_tx unsafe.Pointer arg_ty unsafe.Pointer arg_tz unsafe.Pointer arg_dt float32 arg_N int argptr [11]unsafe.Pointer sync.Mutex } // Stores the arguments for minimize kernel invocation var minimize_args minimize_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. minimize_args.argptr[0] = unsafe.Pointer(&minimize_args.arg_mx) minimize_args.argptr[1] = unsafe.Pointer(&minimize_args.arg_my) minimize_args.argptr[2] = unsafe.Pointer(&minimize_args.arg_mz) minimize_args.argptr[3] = unsafe.Pointer(&minimize_args.arg_m0x) minimize_args.argptr[4] = unsafe.Pointer(&minimize_args.arg_m0y) minimize_args.argptr[5] = unsafe.Pointer(&minimize_args.arg_m0z) minimize_args.argptr[6] = unsafe.Pointer(&minimize_args.arg_tx) minimize_args.argptr[7] = unsafe.Pointer(&minimize_args.arg_ty) minimize_args.argptr[8] = unsafe.Pointer(&minimize_args.arg_tz) minimize_args.argptr[9] = unsafe.Pointer(&minimize_args.arg_dt) minimize_args.argptr[10] = unsafe.Pointer(&minimize_args.arg_N) } // Wrapper for minimize CUDA kernel, asynchronous. 
func k_minimize_async(mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, m0x unsafe.Pointer, m0y unsafe.Pointer, m0z unsafe.Pointer, tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, dt float32, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("minimize") } minimize_args.Lock() defer minimize_args.Unlock() if minimize_code == 0 { minimize_code = fatbinLoad(minimize_map, "minimize") } minimize_args.arg_mx = mx minimize_args.arg_my = my minimize_args.arg_mz = mz minimize_args.arg_m0x = m0x minimize_args.arg_m0y = m0y minimize_args.arg_m0z = m0z minimize_args.arg_tx = tx minimize_args.arg_ty = ty minimize_args.arg_tz = tz minimize_args.arg_dt = dt minimize_args.arg_N = N args := minimize_args.argptr[:] cu.LaunchKernel(minimize_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("minimize") } } // maps compute capability on PTX code for minimize kernel. var minimize_map = map[int]string{0: "", 30: minimize_ptx_30, 32: minimize_ptx_32, 35: minimize_ptx_35, 37: minimize_ptx_37, 50: minimize_ptx_50, 52: minimize_ptx_52, 53: minimize_ptx_53, 60: minimize_ptx_60, 61: minimize_ptx_61, 62: minimize_ptx_62, 70: minimize_ptx_70, 72: minimize_ptx_72, 75: minimize_ptx_75} // minimize PTX code for various compute capabilities. 
const ( minimize_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.f32 %f2, [%rd18]; ld.global.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.f32 %f16, [%rd16]; mul.f32 %f17, %f16, %f11; 
mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; BB0_2: ret; } ` minimize_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, 
%rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 %f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; BB0_2: ret; } ` minimize_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 
%r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 %f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; BB0_2: ret; } ` minimize_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 
minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 %f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 
%rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; BB0_2: ret; } ` minimize_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 
%f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 %f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; BB0_2: ret; } ` minimize_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 
%rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 %f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; BB0_2: ret; } ` minimize_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 
%rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 %f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; BB0_2: ret; } ` minimize_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 
minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 %f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 
%f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; BB0_2: ret; } ` minimize_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, 
%f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 %f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; BB0_2: ret; } ` minimize_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; 
mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 %f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; BB0_2: ret; } ` minimize_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, 
[minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 %f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; BB0_2: 
ret; } ` minimize_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 
%f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; BB0_2: ret; } ` minimize_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl minimize .visible .entry minimize( .param .u64 minimize_param_0, .param .u64 minimize_param_1, .param .u64 minimize_param_2, .param .u64 minimize_param_3, .param .u64 minimize_param_4, .param .u64 minimize_param_5, .param .u64 minimize_param_6, .param .u64 minimize_param_7, .param .u64 minimize_param_8, .param .f32 minimize_param_9, .param .u32 minimize_param_10 ) { .reg .pred %p<2>; .reg .f32 %f<26>; .reg .b32 %r<9>; .reg .b64 %rd<29>; ld.param.u64 %rd1, [minimize_param_0]; ld.param.u64 %rd2, [minimize_param_1]; ld.param.u64 %rd3, [minimize_param_2]; ld.param.u64 %rd4, [minimize_param_3]; ld.param.u64 %rd5, [minimize_param_4]; ld.param.u64 %rd6, [minimize_param_5]; ld.param.u64 %rd7, [minimize_param_6]; ld.param.u64 %rd8, [minimize_param_7]; ld.param.u64 %rd9, [minimize_param_8]; ld.param.f32 %f1, [minimize_param_9]; ld.param.u32 %r2, [minimize_param_10]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd10, %rd4; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; cvta.to.global.u64 %rd13, %rd5; add.s64 %rd14, %rd13, %rd11; cvta.to.global.u64 %rd15, %rd6; add.s64 %rd16, %rd15, %rd11; cvta.to.global.u64 %rd17, %rd7; add.s64 %rd18, %rd17, %rd11; 
cvta.to.global.u64 %rd19, %rd8; add.s64 %rd20, %rd19, %rd11; cvta.to.global.u64 %rd21, %rd9; add.s64 %rd22, %rd21, %rd11; ld.global.nc.f32 %f2, [%rd18]; ld.global.nc.f32 %f3, [%rd20]; mul.f32 %f4, %f3, %f3; fma.rn.f32 %f5, %f2, %f2, %f4; ld.global.nc.f32 %f6, [%rd22]; fma.rn.f32 %f7, %f6, %f6, %f5; mul.f32 %f8, %f1, %f1; mul.f32 %f9, %f8, %f7; mov.f32 %f10, 0f40800000; sub.f32 %f11, %f10, %f9; ld.global.nc.f32 %f12, [%rd12]; mul.f32 %f13, %f12, %f11; ld.global.nc.f32 %f14, [%rd14]; mul.f32 %f15, %f14, %f11; ld.global.nc.f32 %f16, [%rd16]; mul.f32 %f17, %f16, %f11; mul.f32 %f18, %f1, 0f40800000; fma.rn.f32 %f19, %f18, %f2, %f13; fma.rn.f32 %f20, %f18, %f3, %f15; fma.rn.f32 %f21, %f18, %f6, %f17; add.f32 %f22, %f9, 0f40800000; div.rn.f32 %f23, %f19, %f22; cvta.to.global.u64 %rd23, %rd1; add.s64 %rd24, %rd23, %rd11; st.global.f32 [%rd24], %f23; div.rn.f32 %f24, %f20, %f22; cvta.to.global.u64 %rd25, %rd2; add.s64 %rd26, %rd25, %rd11; st.global.f32 [%rd26], %f24; div.rn.f32 %f25, %f21, %f22; cvta.to.global.u64 %rd27, %rd3; add.s64 %rd28, %rd27, %rd11; st.global.f32 [%rd28], %f25; BB0_2: ret; } ` ) mumax3-3.10/cuda/mslice.go000066400000000000000000000014621371432437400154010ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/data" "unsafe" ) // Slice + scalar multiplier. 
type MSlice struct { arr *data.Slice mul []float64 } func ToMSlice(s *data.Slice) MSlice { return MSlice{ arr: s, mul: ones(s.NComp()), } } func MakeMSlice(arr *data.Slice, mul []float64) MSlice { return MSlice{arr, mul} } func (m MSlice) Size() [3]int { return m.arr.Size() } func (m MSlice) Len() int { return m.arr.Len() } func (m MSlice) DevPtr(c int) unsafe.Pointer { return m.arr.DevPtr(c) } func (m MSlice) Mul(c int) float32 { return float32(m.mul[c]) } func (m MSlice) SetMul(c int, mul float32) { m.mul[c] = float64(mul) } func (m MSlice) Recycle() { if m.arr != nil { Recycle(m.arr) m.arr = nil } } var _ones = [4]float64{1, 1, 1, 1} func ones(n int) []float64 { return _ones[:n] } mumax3-3.10/cuda/mul.cu000066400000000000000000000004211371432437400147160ustar00rootroot00000000000000// dst[i] = a[i] * b[i] extern "C" __global__ void mul(float* __restrict__ dst, float* __restrict__ a, float* __restrict__ b, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if(i < N) { dst[i] = a[i] * b[i]; } } mumax3-3.10/cuda/mul_wrapper.go000066400000000000000000000352361371432437400164700ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for mul kernel var mul_code cu.Function // Stores the arguments for mul kernel invocation type mul_args_t struct { arg_dst unsafe.Pointer arg_a unsafe.Pointer arg_b unsafe.Pointer arg_N int argptr [4]unsafe.Pointer sync.Mutex } // Stores the arguments for mul kernel invocation var mul_args mul_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. mul_args.argptr[0] = unsafe.Pointer(&mul_args.arg_dst) mul_args.argptr[1] = unsafe.Pointer(&mul_args.arg_a) mul_args.argptr[2] = unsafe.Pointer(&mul_args.arg_b) mul_args.argptr[3] = unsafe.Pointer(&mul_args.arg_N) } // Wrapper for mul CUDA kernel, asynchronous. 
func k_mul_async(dst unsafe.Pointer, a unsafe.Pointer, b unsafe.Pointer, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("mul") } mul_args.Lock() defer mul_args.Unlock() if mul_code == 0 { mul_code = fatbinLoad(mul_map, "mul") } mul_args.arg_dst = dst mul_args.arg_a = a mul_args.arg_b = b mul_args.arg_N = N args := mul_args.argptr[:] cu.LaunchKernel(mul_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("mul") } } // maps compute capability on PTX code for mul kernel. var mul_map = map[int]string{0: "", 30: mul_ptx_30, 32: mul_ptx_32, 35: mul_ptx_35, 37: mul_ptx_37, 50: mul_ptx_50, 52: mul_ptx_52, 53: mul_ptx_53, 60: mul_ptx_60, 61: mul_ptx_61, 62: mul_ptx_62, 70: mul_ptx_70, 72: mul_ptx_72, 75: mul_ptx_75} // mul PTX code for various compute capabilities. const ( mul_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.f32 %f1, [%rd8]; ld.global.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; BB0_2: ret; } ` mul_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, 
.param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; BB0_2: ret; } ` mul_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; BB0_2: ret; } ` mul_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; 
ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; BB0_2: ret; } ` mul_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; BB0_2: ret; } ` mul_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; 
ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; BB0_2: ret; } ` mul_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; BB0_2: ret; } ` mul_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; 
mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; BB0_2: ret; } ` mul_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; BB0_2: ret; } ` mul_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; 
setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; BB0_2: ret; } ` mul_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; BB0_2: ret; } ` mul_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; 
add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; BB0_2: ret; } ` mul_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl mul .visible .entry mul( .param .u64 mul_param_0, .param .u64 mul_param_1, .param .u64 mul_param_2, .param .u32 mul_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<9>; .reg .b64 %rd<11>; ld.param.u64 %rd1, [mul_param_0]; ld.param.u64 %rd2, [mul_param_1]; ld.param.u64 %rd3, [mul_param_2]; ld.param.u32 %r2, [mul_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd2; mul.wide.s32 %rd5, %r1, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd3; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f1, [%rd8]; ld.global.nc.f32 %f2, [%rd6]; mul.f32 %f3, %f2, %f1; cvta.to.global.u64 %rd9, %rd1; add.s64 %rd10, %rd9, %rd5; st.global.f32 [%rd10], %f3; BB0_2: ret; } ` ) mumax3-3.10/cuda/normalize.cu000066400000000000000000000010301371432437400161160ustar00rootroot00000000000000#include "float3.h" // normalize vector {vx, vy, vz} to unit length, unless length or vol are zero. extern "C" __global__ void normalize(float* __restrict__ vx, float* __restrict__ vy, float* __restrict__ vz, float* __restrict__ vol, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { float v = (vol == NULL? 
1.0f: vol[i]); float3 V = {v*vx[i], v*vy[i], v*vz[i]}; V = normalized(V); vx[i] = V.x; vy[i] = V.y; vz[i] = V.z; } } mumax3-3.10/cuda/normalize.go000066400000000000000000000005451371432437400161260ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/data" "github.com/mumax/3/util" ) // Normalize vec to unit length, unless length or vol are zero. func Normalize(vec, vol *data.Slice) { util.Argument(vol == nil || vol.NComp() == 1) N := vec.Len() cfg := make1DConf(N) k_normalize_async(vec.DevPtr(X), vec.DevPtr(Y), vec.DevPtr(Z), vol.DevPtr(0), N, cfg) } mumax3-3.10/cuda/normalize_wrapper.go000066400000000000000000000626601371432437400176740ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for normalize kernel var normalize_code cu.Function // Stores the arguments for normalize kernel invocation type normalize_args_t struct { arg_vx unsafe.Pointer arg_vy unsafe.Pointer arg_vz unsafe.Pointer arg_vol unsafe.Pointer arg_N int argptr [5]unsafe.Pointer sync.Mutex } // Stores the arguments for normalize kernel invocation var normalize_args normalize_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. normalize_args.argptr[0] = unsafe.Pointer(&normalize_args.arg_vx) normalize_args.argptr[1] = unsafe.Pointer(&normalize_args.arg_vy) normalize_args.argptr[2] = unsafe.Pointer(&normalize_args.arg_vz) normalize_args.argptr[3] = unsafe.Pointer(&normalize_args.arg_vol) normalize_args.argptr[4] = unsafe.Pointer(&normalize_args.arg_N) } // Wrapper for normalize CUDA kernel, asynchronous. 
func k_normalize_async(vx unsafe.Pointer, vy unsafe.Pointer, vz unsafe.Pointer, vol unsafe.Pointer, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("normalize") } normalize_args.Lock() defer normalize_args.Unlock() if normalize_code == 0 { normalize_code = fatbinLoad(normalize_map, "normalize") } normalize_args.arg_vx = vx normalize_args.arg_vy = vy normalize_args.arg_vz = vz normalize_args.arg_vol = vol normalize_args.arg_N = N args := normalize_args.argptr[:] cu.LaunchKernel(normalize_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("normalize") } } // maps compute capability on PTX code for normalize kernel. var normalize_map = map[int]string{0: "", 30: normalize_ptx_30, 32: normalize_ptx_32, 35: normalize_ptx_35, 37: normalize_ptx_37, 50: normalize_ptx_50, 52: normalize_ptx_52, 53: normalize_ptx_53, 60: normalize_ptx_60, 61: normalize_ptx_61, 62: normalize_ptx_62, 70: normalize_ptx_70, 72: normalize_ptx_72, 75: normalize_ptx_75} // normalize PTX code for various compute capabilities. 
const ( normalize_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred %p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_6; setp.eq.s64 %p2, %rd7, 0; mov.f32 %f20, 0f3F800000; @%p2 bra BB0_3; cvta.to.global.u64 %rd8, %rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.f32 %f20, [%rd10]; BB0_3: cvta.to.global.u64 %rd11, %rd6; cvta.to.global.u64 %rd12, %rd5; cvta.to.global.u64 %rd13, %rd4; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd1, %rd13, %rd14; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, %f20, %f11; add.s64 %rd2, %rd12, %rd14; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; add.s64 %rd3, %rd11, %rd14; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, %f16; mov.f32 %f21, 0f00000000; setp.eq.f32 %p3, %f6, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f21, %f6; BB0_5: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; BB0_6: ret; } ` normalize_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred 
%p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_6; setp.eq.s64 %p2, %rd7, 0; mov.f32 %f20, 0f3F800000; @%p2 bra BB0_3; cvta.to.global.u64 %rd8, %rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; BB0_3: cvta.to.global.u64 %rd11, %rd6; cvta.to.global.u64 %rd12, %rd5; cvta.to.global.u64 %rd13, %rd4; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd1, %rd13, %rd14; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, %f20, %f11; add.s64 %rd2, %rd12, %rd14; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; add.s64 %rd3, %rd11, %rd14; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, %f16; mov.f32 %f21, 0f00000000; setp.eq.f32 %p3, %f6, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f21, %f6; BB0_5: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; BB0_6: ret; } ` normalize_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred %p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; 
mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_6; setp.eq.s64 %p2, %rd7, 0; mov.f32 %f20, 0f3F800000; @%p2 bra BB0_3; cvta.to.global.u64 %rd8, %rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; BB0_3: cvta.to.global.u64 %rd11, %rd6; cvta.to.global.u64 %rd12, %rd5; cvta.to.global.u64 %rd13, %rd4; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd1, %rd13, %rd14; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, %f20, %f11; add.s64 %rd2, %rd12, %rd14; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; add.s64 %rd3, %rd11, %rd14; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, %f16; mov.f32 %f21, 0f00000000; setp.eq.f32 %p3, %f6, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f21, %f6; BB0_5: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; BB0_6: ret; } ` normalize_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred %p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_6; setp.eq.s64 %p2, %rd7, 0; mov.f32 %f20, 0f3F800000; @%p2 bra BB0_3; cvta.to.global.u64 %rd8, 
%rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; BB0_3: cvta.to.global.u64 %rd11, %rd6; cvta.to.global.u64 %rd12, %rd5; cvta.to.global.u64 %rd13, %rd4; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd1, %rd13, %rd14; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, %f20, %f11; add.s64 %rd2, %rd12, %rd14; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; add.s64 %rd3, %rd11, %rd14; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, %f16; mov.f32 %f21, 0f00000000; setp.eq.f32 %p3, %f6, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f21, %f6; BB0_5: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; BB0_6: ret; } ` normalize_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred %p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_6; setp.eq.s64 %p2, %rd7, 0; mov.f32 %f20, 0f3F800000; @%p2 bra BB0_3; cvta.to.global.u64 %rd8, %rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; BB0_3: cvta.to.global.u64 %rd11, %rd6; cvta.to.global.u64 %rd12, %rd5; cvta.to.global.u64 %rd13, %rd4; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd1, %rd13, %rd14; ld.global.f32 %f11, [%rd1]; mul.f32 
%f3, %f20, %f11; add.s64 %rd2, %rd12, %rd14; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; add.s64 %rd3, %rd11, %rd14; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, %f16; mov.f32 %f21, 0f00000000; setp.eq.f32 %p3, %f6, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f21, %f6; BB0_5: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; BB0_6: ret; } ` normalize_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred %p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_6; setp.eq.s64 %p2, %rd7, 0; mov.f32 %f20, 0f3F800000; @%p2 bra BB0_3; cvta.to.global.u64 %rd8, %rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; BB0_3: cvta.to.global.u64 %rd11, %rd6; cvta.to.global.u64 %rd12, %rd5; cvta.to.global.u64 %rd13, %rd4; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd1, %rd13, %rd14; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, %f20, %f11; add.s64 %rd2, %rd12, %rd14; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; add.s64 %rd3, %rd11, %rd14; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, 
%f16; mov.f32 %f21, 0f00000000; setp.eq.f32 %p3, %f6, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f21, %f6; BB0_5: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; BB0_6: ret; } ` normalize_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred %p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_6; setp.eq.s64 %p2, %rd7, 0; mov.f32 %f20, 0f3F800000; @%p2 bra BB0_3; cvta.to.global.u64 %rd8, %rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; BB0_3: cvta.to.global.u64 %rd11, %rd6; cvta.to.global.u64 %rd12, %rd5; cvta.to.global.u64 %rd13, %rd4; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd1, %rd13, %rd14; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, %f20, %f11; add.s64 %rd2, %rd12, %rd14; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; add.s64 %rd3, %rd11, %rd14; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, %f16; mov.f32 %f21, 0f00000000; setp.eq.f32 %p3, %f6, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f21, %f6; BB0_5: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; BB0_6: ret; } ` 
normalize_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred %p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_6; setp.eq.s64 %p2, %rd7, 0; mov.f32 %f20, 0f3F800000; @%p2 bra BB0_3; cvta.to.global.u64 %rd8, %rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; BB0_3: cvta.to.global.u64 %rd11, %rd6; cvta.to.global.u64 %rd12, %rd5; cvta.to.global.u64 %rd13, %rd4; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd1, %rd13, %rd14; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, %f20, %f11; add.s64 %rd2, %rd12, %rd14; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; add.s64 %rd3, %rd11, %rd14; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, %f16; mov.f32 %f21, 0f00000000; setp.eq.f32 %p3, %f6, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f21, %f6; BB0_5: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; BB0_6: ret; } ` normalize_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred %p<4>; 
.reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_6; setp.eq.s64 %p2, %rd7, 0; mov.f32 %f20, 0f3F800000; @%p2 bra BB0_3; cvta.to.global.u64 %rd8, %rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; BB0_3: cvta.to.global.u64 %rd11, %rd6; cvta.to.global.u64 %rd12, %rd5; cvta.to.global.u64 %rd13, %rd4; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd1, %rd13, %rd14; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, %f20, %f11; add.s64 %rd2, %rd12, %rd14; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; add.s64 %rd3, %rd11, %rd14; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, %f16; mov.f32 %f21, 0f00000000; setp.eq.f32 %p3, %f6, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f21, %f6; BB0_5: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; BB0_6: ret; } ` normalize_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred %p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 
%r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_6; setp.eq.s64 %p2, %rd7, 0; mov.f32 %f20, 0f3F800000; @%p2 bra BB0_3; cvta.to.global.u64 %rd8, %rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; BB0_3: cvta.to.global.u64 %rd11, %rd6; cvta.to.global.u64 %rd12, %rd5; cvta.to.global.u64 %rd13, %rd4; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd1, %rd13, %rd14; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, %f20, %f11; add.s64 %rd2, %rd12, %rd14; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; add.s64 %rd3, %rd11, %rd14; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, %f16; mov.f32 %f21, 0f00000000; setp.eq.f32 %p3, %f6, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f21, %f6; BB0_5: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; BB0_6: ret; } ` normalize_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred %p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_6; setp.eq.s64 %p2, %rd7, 0; mov.f32 %f20, 0f3F800000; @%p2 bra BB0_3; cvta.to.global.u64 %rd8, %rd7; 
mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; BB0_3: cvta.to.global.u64 %rd11, %rd6; cvta.to.global.u64 %rd12, %rd5; cvta.to.global.u64 %rd13, %rd4; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd1, %rd13, %rd14; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, %f20, %f11; add.s64 %rd2, %rd12, %rd14; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; add.s64 %rd3, %rd11, %rd14; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, %f16; mov.f32 %f21, 0f00000000; setp.eq.f32 %p3, %f6, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f21, %f6; BB0_5: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; BB0_6: ret; } ` normalize_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred %p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_6; setp.eq.s64 %p2, %rd7, 0; mov.f32 %f20, 0f3F800000; @%p2 bra BB0_3; cvta.to.global.u64 %rd8, %rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; BB0_3: cvta.to.global.u64 %rd11, %rd6; cvta.to.global.u64 %rd12, %rd5; cvta.to.global.u64 %rd13, %rd4; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd1, %rd13, %rd14; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, 
%f20, %f11; add.s64 %rd2, %rd12, %rd14; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; add.s64 %rd3, %rd11, %rd14; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, %f16; mov.f32 %f21, 0f00000000; setp.eq.f32 %p3, %f6, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f21, %f6; BB0_5: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; BB0_6: ret; } ` normalize_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl normalize .visible .entry normalize( .param .u64 normalize_param_0, .param .u64 normalize_param_1, .param .u64 normalize_param_2, .param .u64 normalize_param_3, .param .u32 normalize_param_4 ) { .reg .pred %p<4>; .reg .f32 %f<22>; .reg .b32 %r<9>; .reg .b64 %rd<15>; ld.param.u64 %rd4, [normalize_param_0]; ld.param.u64 %rd5, [normalize_param_1]; ld.param.u64 %rd6, [normalize_param_2]; ld.param.u64 %rd7, [normalize_param_3]; ld.param.u32 %r2, [normalize_param_4]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_6; setp.eq.s64 %p2, %rd7, 0; mov.f32 %f20, 0f3F800000; @%p2 bra BB0_3; cvta.to.global.u64 %rd8, %rd7; mul.wide.s32 %rd9, %r1, 4; add.s64 %rd10, %rd8, %rd9; ld.global.nc.f32 %f20, [%rd10]; BB0_3: cvta.to.global.u64 %rd11, %rd6; cvta.to.global.u64 %rd12, %rd5; cvta.to.global.u64 %rd13, %rd4; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd1, %rd13, %rd14; ld.global.f32 %f11, [%rd1]; mul.f32 %f3, %f20, %f11; add.s64 %rd2, %rd12, %rd14; ld.global.f32 %f12, [%rd2]; mul.f32 %f4, %f20, %f12; add.s64 %rd3, %rd11, %rd14; ld.global.f32 %f13, [%rd3]; mul.f32 %f5, %f20, %f13; mul.f32 %f14, %f4, %f4; fma.rn.f32 %f15, %f3, %f3, %f14; fma.rn.f32 %f16, %f5, %f5, %f15; sqrt.rn.f32 %f6, %f16; 
mov.f32 %f21, 0f00000000; setp.eq.f32 %p3, %f6, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f21, %f6; BB0_5: mul.f32 %f17, %f3, %f21; st.global.f32 [%rd1], %f17; mul.f32 %f18, %f4, %f21; st.global.f32 [%rd2], %f18; mul.f32 %f19, %f5, %f21; st.global.f32 [%rd3], %f19; BB0_6: ret; } ` ) mumax3-3.10/cuda/phi.cu000066400000000000000000000007541371432437400147120ustar00rootroot00000000000000#include "stencil.h" extern "C" __global__ void setPhi(float* __restrict__ phi, float* __restrict__ mx, float* __restrict__ my, int Nx, int Ny, int Nz) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int iz = blockIdx.z * blockDim.z + threadIdx.z; if (ix >= Nx || iy >= Ny || iz >= Nz) { return; } int I = idx(ix, iy, iz); // central cell index phi[I] = atan2f(my[I], mx[I]); }mumax3-3.10/cuda/phi_wrapper.go000066400000000000000000001242221371432437400164450ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for setPhi kernel var setPhi_code cu.Function // Stores the arguments for setPhi kernel invocation type setPhi_args_t struct { arg_phi unsafe.Pointer arg_mx unsafe.Pointer arg_my unsafe.Pointer arg_Nx int arg_Ny int arg_Nz int argptr [6]unsafe.Pointer sync.Mutex } // Stores the arguments for setPhi kernel invocation var setPhi_args setPhi_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. setPhi_args.argptr[0] = unsafe.Pointer(&setPhi_args.arg_phi) setPhi_args.argptr[1] = unsafe.Pointer(&setPhi_args.arg_mx) setPhi_args.argptr[2] = unsafe.Pointer(&setPhi_args.arg_my) setPhi_args.argptr[3] = unsafe.Pointer(&setPhi_args.arg_Nx) setPhi_args.argptr[4] = unsafe.Pointer(&setPhi_args.arg_Ny) setPhi_args.argptr[5] = unsafe.Pointer(&setPhi_args.arg_Nz) } // Wrapper for setPhi CUDA kernel, asynchronous. 
func k_setPhi_async(phi unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, Nx int, Ny int, Nz int, cfg *config) { if Synchronous { // debug Sync() timer.Start("setPhi") } setPhi_args.Lock() defer setPhi_args.Unlock() if setPhi_code == 0 { setPhi_code = fatbinLoad(setPhi_map, "setPhi") } setPhi_args.arg_phi = phi setPhi_args.arg_mx = mx setPhi_args.arg_my = my setPhi_args.arg_Nx = Nx setPhi_args.arg_Ny = Ny setPhi_args.arg_Nz = Nz args := setPhi_args.argptr[:] cu.LaunchKernel(setPhi_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("setPhi") } } // maps compute capability on PTX code for setPhi kernel. var setPhi_map = map[int]string{0: "", 30: setPhi_ptx_30, 32: setPhi_ptx_32, 35: setPhi_ptx_35, 37: setPhi_ptx_37, 50: setPhi_ptx_50, 52: setPhi_ptx_52, 53: setPhi_ptx_53, 60: setPhi_ptx_60, 61: setPhi_ptx_61, 62: setPhi_ptx_62, 70: setPhi_ptx_70, 72: setPhi_ptx_72, 75: setPhi_ptx_75} // setPhi PTX code for various compute capabilities. 
const ( setPhi_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<15>; .reg .f32 %f<36>; .reg .b32 %r<30>; .reg .b64 %rd<12>; ld.param.u64 %rd1, [setPhi_param_0]; ld.param.u64 %rd2, [setPhi_param_1]; ld.param.u64 %rd3, [setPhi_param_2]; ld.param.u32 %r7, [setPhi_param_3]; ld.param.u32 %r8, [setPhi_param_4]; ld.param.u32 %r9, [setPhi_param_5]; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %ntid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r11, %r10, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r2, %r8; setp.ge.s32 %p2, %r1, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_7; cvta.to.global.u64 %rd4, %rd3; mad.lo.s32 %r19, %r3, %r8, %r2; mad.lo.s32 %r4, %r19, %r7, %r1; mul.wide.s32 %rd5, %r4, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd2; add.s64 %rd8, %rd7, %rd5; ld.global.f32 %f7, [%rd8]; abs.f32 %f1, %f7; ld.global.f32 %f8, [%rd6]; abs.f32 %f2, %f8; setp.eq.f32 %p6, %f1, 0f00000000; setp.eq.f32 %p7, %f2, 0f00000000; and.pred %p8, %p6, %p7; mov.b32 %r5, %f7; mov.b32 %r20, %f8; and.b32 %r6, %r20, -2147483648; @%p8 bra BB0_5; bra.uni BB0_2; BB0_5: shr.s32 %r27, %r5, 31; and.b32 %r28, %r27, 1078530011; or.b32 %r29, %r28, %r6; mov.b32 %f35, %r29; bra.uni BB0_6; BB0_2: setp.eq.f32 %p9, %f1, 0f7F800000; setp.eq.f32 %p10, %f2, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra BB0_4; bra.uni BB0_3; BB0_4: shr.s32 %r23, %r5, 31; and.b32 %r24, %r23, 13483017; add.s32 %r25, %r24, 1061752795; or.b32 %r26, %r25, %r6; mov.b32 %f35, %r26; bra.uni BB0_6; BB0_3: max.f32 %f9, %f2, %f1; min.f32 %f10, %f2, %f1; div.rn.f32 %f11, %f10, 
%f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f2, %f1; selp.f32 %f29, %f28, %f26, %p12; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; setp.lt.s32 %p13, %r5, 0; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r21, %f32; or.b32 %r22, %r21, %r6; mov.b32 %f33, %r22; add.f32 %f34, %f1, %f2; setp.gtu.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f34, %f33, %p14; BB0_6: cvta.to.global.u64 %rd9, %rd1; add.s64 %rd11, %rd9, %rd5; st.global.f32 [%rd11], %f35; BB0_7: ret; } ` setPhi_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<15>; .reg .f32 %f<36>; .reg .b32 %r<44>; .reg .b64 %rd<12>; ld.param.u64 %rd1, [setPhi_param_0]; ld.param.u64 %rd2, [setPhi_param_1]; ld.param.u64 %rd3, [setPhi_param_2]; ld.param.u32 %r6, [setPhi_param_3]; ld.param.u32 %r7, [setPhi_param_4]; ld.param.u32 %r8, [setPhi_param_5]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r2, %r7; setp.ge.s32 %p2, %r1, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_7; cvta.to.global.u64 %rd4, %rd3; mad.lo.s32 
%r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd5, %r19, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd2; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f7, [%rd8]; abs.f32 %f1, %f7; ld.global.nc.f32 %f8, [%rd6]; abs.f32 %f2, %f8; setp.eq.f32 %p6, %f1, 0f00000000; setp.eq.f32 %p7, %f2, 0f00000000; and.pred %p8, %p6, %p7; mov.b32 %r4, %f7; mov.b32 %r20, %f8; and.b32 %r5, %r20, -2147483648; @%p8 bra BB0_5; bra.uni BB0_2; BB0_5: shr.s32 %r27, %r4, 31; and.b32 %r28, %r27, 1078530011; or.b32 %r29, %r28, %r5; mov.b32 %f35, %r29; bra.uni BB0_6; BB0_2: setp.eq.f32 %p9, %f1, 0f7F800000; setp.eq.f32 %p10, %f2, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra BB0_4; bra.uni BB0_3; BB0_4: shr.s32 %r23, %r4, 31; and.b32 %r24, %r23, 13483017; add.s32 %r25, %r24, 1061752795; or.b32 %r26, %r25, %r5; mov.b32 %f35, %r26; bra.uni BB0_6; BB0_3: max.f32 %f9, %f2, %f1; min.f32 %f10, %f2, %f1; div.rn.f32 %f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f2, %f1; selp.f32 %f29, %f28, %f26, %p12; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; setp.lt.s32 %p13, %r4, 0; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r21, %f32; or.b32 %r22, %r21, %r5; mov.b32 %f33, %r22; add.f32 %f34, %f1, %f2; setp.gtu.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f34, %f33, %p14; BB0_6: cvta.to.global.u64 %rd9, %rd1; add.s64 %rd11, %rd9, %rd5; st.global.f32 [%rd11], %f35; BB0_7: ret; } ` setPhi_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 
setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<15>; .reg .f32 %f<36>; .reg .b32 %r<44>; .reg .b64 %rd<12>; ld.param.u64 %rd1, [setPhi_param_0]; ld.param.u64 %rd2, [setPhi_param_1]; ld.param.u64 %rd3, [setPhi_param_2]; ld.param.u32 %r6, [setPhi_param_3]; ld.param.u32 %r7, [setPhi_param_4]; ld.param.u32 %r8, [setPhi_param_5]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r2, %r7; setp.ge.s32 %p2, %r1, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_7; cvta.to.global.u64 %rd4, %rd3; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd5, %r19, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd2; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f7, [%rd8]; abs.f32 %f1, %f7; ld.global.nc.f32 %f8, [%rd6]; abs.f32 %f2, %f8; setp.eq.f32 %p6, %f1, 0f00000000; setp.eq.f32 %p7, %f2, 0f00000000; and.pred %p8, %p6, %p7; mov.b32 %r4, %f7; mov.b32 %r20, %f8; and.b32 %r5, %r20, -2147483648; @%p8 bra BB0_5; bra.uni BB0_2; BB0_5: shr.s32 %r27, %r4, 31; and.b32 %r28, %r27, 1078530011; or.b32 %r29, %r28, %r5; mov.b32 %f35, %r29; bra.uni BB0_6; BB0_2: setp.eq.f32 %p9, %f1, 0f7F800000; setp.eq.f32 %p10, %f2, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra BB0_4; bra.uni BB0_3; BB0_4: shr.s32 %r23, %r4, 31; and.b32 %r24, %r23, 13483017; add.s32 %r25, %r24, 1061752795; or.b32 %r26, %r25, %r5; mov.b32 %f35, %r26; bra.uni BB0_6; BB0_3: max.f32 %f9, %f2, %f1; min.f32 %f10, %f2, %f1; div.rn.f32 %f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, 
%f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f2, %f1; selp.f32 %f29, %f28, %f26, %p12; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; setp.lt.s32 %p13, %r4, 0; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r21, %f32; or.b32 %r22, %r21, %r5; mov.b32 %f33, %r22; add.f32 %f34, %f1, %f2; setp.gtu.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f34, %f33, %p14; BB0_6: cvta.to.global.u64 %rd9, %rd1; add.s64 %rd11, %rd9, %rd5; st.global.f32 [%rd11], %f35; BB0_7: ret; } ` setPhi_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<15>; .reg .f32 %f<36>; .reg .b32 %r<44>; .reg .b64 %rd<12>; ld.param.u64 %rd1, [setPhi_param_0]; ld.param.u64 %rd2, [setPhi_param_1]; ld.param.u64 %rd3, [setPhi_param_2]; ld.param.u32 %r6, [setPhi_param_3]; ld.param.u32 %r7, [setPhi_param_4]; ld.param.u32 %r8, [setPhi_param_5]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r2, %r7; setp.ge.s32 %p2, %r1, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_7; cvta.to.global.u64 %rd4, %rd3; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd5, %r19, 4; add.s64 %rd6, %rd4, %rd5; 
cvta.to.global.u64 %rd7, %rd2; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f7, [%rd8]; abs.f32 %f1, %f7; ld.global.nc.f32 %f8, [%rd6]; abs.f32 %f2, %f8; setp.eq.f32 %p6, %f1, 0f00000000; setp.eq.f32 %p7, %f2, 0f00000000; and.pred %p8, %p6, %p7; mov.b32 %r4, %f7; mov.b32 %r20, %f8; and.b32 %r5, %r20, -2147483648; @%p8 bra BB0_5; bra.uni BB0_2; BB0_5: shr.s32 %r27, %r4, 31; and.b32 %r28, %r27, 1078530011; or.b32 %r29, %r28, %r5; mov.b32 %f35, %r29; bra.uni BB0_6; BB0_2: setp.eq.f32 %p9, %f1, 0f7F800000; setp.eq.f32 %p10, %f2, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra BB0_4; bra.uni BB0_3; BB0_4: shr.s32 %r23, %r4, 31; and.b32 %r24, %r23, 13483017; add.s32 %r25, %r24, 1061752795; or.b32 %r26, %r25, %r5; mov.b32 %f35, %r26; bra.uni BB0_6; BB0_3: max.f32 %f9, %f2, %f1; min.f32 %f10, %f2, %f1; div.rn.f32 %f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f2, %f1; selp.f32 %f29, %f28, %f26, %p12; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; setp.lt.s32 %p13, %r4, 0; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r21, %f32; or.b32 %r22, %r21, %r5; mov.b32 %f33, %r22; add.f32 %f34, %f1, %f2; setp.gtu.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f34, %f33, %p14; BB0_6: cvta.to.global.u64 %rd9, %rd1; add.s64 %rd11, %rd9, %rd5; st.global.f32 [%rd11], %f35; BB0_7: ret; } ` setPhi_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 
setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<15>; .reg .f32 %f<36>; .reg .b32 %r<44>; .reg .b64 %rd<12>; ld.param.u64 %rd1, [setPhi_param_0]; ld.param.u64 %rd2, [setPhi_param_1]; ld.param.u64 %rd3, [setPhi_param_2]; ld.param.u32 %r6, [setPhi_param_3]; ld.param.u32 %r7, [setPhi_param_4]; ld.param.u32 %r8, [setPhi_param_5]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r2, %r7; setp.ge.s32 %p2, %r1, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_7; cvta.to.global.u64 %rd4, %rd3; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd5, %r19, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd2; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f7, [%rd8]; abs.f32 %f1, %f7; ld.global.nc.f32 %f8, [%rd6]; abs.f32 %f2, %f8; setp.eq.f32 %p6, %f1, 0f00000000; setp.eq.f32 %p7, %f2, 0f00000000; and.pred %p8, %p6, %p7; mov.b32 %r4, %f7; mov.b32 %r20, %f8; and.b32 %r5, %r20, -2147483648; @%p8 bra BB0_5; bra.uni BB0_2; BB0_5: shr.s32 %r27, %r4, 31; and.b32 %r28, %r27, 1078530011; or.b32 %r29, %r28, %r5; mov.b32 %f35, %r29; bra.uni BB0_6; BB0_2: setp.eq.f32 %p9, %f1, 0f7F800000; setp.eq.f32 %p10, %f2, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra BB0_4; bra.uni BB0_3; BB0_4: shr.s32 %r23, %r4, 31; and.b32 %r24, %r23, 13483017; add.s32 %r25, %r24, 1061752795; or.b32 %r26, %r25, %r5; mov.b32 %f35, %r26; bra.uni BB0_6; BB0_3: max.f32 %f9, %f2, %f1; min.f32 %f10, %f2, %f1; div.rn.f32 %f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, 
%f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f2, %f1; selp.f32 %f29, %f28, %f26, %p12; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; setp.lt.s32 %p13, %r4, 0; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r21, %f32; or.b32 %r22, %r21, %r5; mov.b32 %f33, %r22; add.f32 %f34, %f1, %f2; setp.gtu.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f34, %f33, %p14; BB0_6: cvta.to.global.u64 %rd9, %rd1; add.s64 %rd11, %rd9, %rd5; st.global.f32 [%rd11], %f35; BB0_7: ret; } ` setPhi_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<15>; .reg .f32 %f<36>; .reg .b32 %r<44>; .reg .b64 %rd<12>; ld.param.u64 %rd1, [setPhi_param_0]; ld.param.u64 %rd2, [setPhi_param_1]; ld.param.u64 %rd3, [setPhi_param_2]; ld.param.u32 %r6, [setPhi_param_3]; ld.param.u32 %r7, [setPhi_param_4]; ld.param.u32 %r8, [setPhi_param_5]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r2, %r7; setp.ge.s32 %p2, %r1, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_7; cvta.to.global.u64 %rd4, %rd3; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd5, %r19, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd2; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f7, [%rd8]; abs.f32 %f1, %f7; ld.global.nc.f32 
%f8, [%rd6]; abs.f32 %f2, %f8; setp.eq.f32 %p6, %f1, 0f00000000; setp.eq.f32 %p7, %f2, 0f00000000; and.pred %p8, %p6, %p7; mov.b32 %r4, %f7; mov.b32 %r20, %f8; and.b32 %r5, %r20, -2147483648; @%p8 bra BB0_5; bra.uni BB0_2; BB0_5: shr.s32 %r27, %r4, 31; and.b32 %r28, %r27, 1078530011; or.b32 %r29, %r28, %r5; mov.b32 %f35, %r29; bra.uni BB0_6; BB0_2: setp.eq.f32 %p9, %f1, 0f7F800000; setp.eq.f32 %p10, %f2, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra BB0_4; bra.uni BB0_3; BB0_4: shr.s32 %r23, %r4, 31; and.b32 %r24, %r23, 13483017; add.s32 %r25, %r24, 1061752795; or.b32 %r26, %r25, %r5; mov.b32 %f35, %r26; bra.uni BB0_6; BB0_3: max.f32 %f9, %f2, %f1; min.f32 %f10, %f2, %f1; div.rn.f32 %f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f2, %f1; selp.f32 %f29, %f28, %f26, %p12; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; setp.lt.s32 %p13, %r4, 0; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r21, %f32; or.b32 %r22, %r21, %r5; mov.b32 %f33, %r22; add.f32 %f34, %f1, %f2; setp.gtu.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f34, %f33, %p14; BB0_6: cvta.to.global.u64 %rd9, %rd1; add.s64 %rd11, %rd9, %rd5; st.global.f32 [%rd11], %f35; BB0_7: ret; } ` setPhi_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<15>; .reg .f32 %f<36>; .reg .b32 %r<44>; .reg .b64 %rd<12>; 
ld.param.u64 %rd1, [setPhi_param_0]; ld.param.u64 %rd2, [setPhi_param_1]; ld.param.u64 %rd3, [setPhi_param_2]; ld.param.u32 %r6, [setPhi_param_3]; ld.param.u32 %r7, [setPhi_param_4]; ld.param.u32 %r8, [setPhi_param_5]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r2, %r7; setp.ge.s32 %p2, %r1, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_7; cvta.to.global.u64 %rd4, %rd3; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd5, %r19, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd2; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f7, [%rd8]; abs.f32 %f1, %f7; ld.global.nc.f32 %f8, [%rd6]; abs.f32 %f2, %f8; setp.eq.f32 %p6, %f1, 0f00000000; setp.eq.f32 %p7, %f2, 0f00000000; and.pred %p8, %p6, %p7; mov.b32 %r4, %f7; mov.b32 %r20, %f8; and.b32 %r5, %r20, -2147483648; @%p8 bra BB0_5; bra.uni BB0_2; BB0_5: shr.s32 %r27, %r4, 31; and.b32 %r28, %r27, 1078530011; or.b32 %r29, %r28, %r5; mov.b32 %f35, %r29; bra.uni BB0_6; BB0_2: setp.eq.f32 %p9, %f1, 0f7F800000; setp.eq.f32 %p10, %f2, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra BB0_4; bra.uni BB0_3; BB0_4: shr.s32 %r23, %r4, 31; and.b32 %r24, %r23, 13483017; add.s32 %r25, %r24, 1061752795; or.b32 %r26, %r25, %r5; mov.b32 %f35, %r26; bra.uni BB0_6; BB0_3: max.f32 %f9, %f2, %f1; min.f32 %f10, %f2, %f1; div.rn.f32 %f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; 
fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f2, %f1; selp.f32 %f29, %f28, %f26, %p12; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; setp.lt.s32 %p13, %r4, 0; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r21, %f32; or.b32 %r22, %r21, %r5; mov.b32 %f33, %r22; add.f32 %f34, %f1, %f2; setp.gtu.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f34, %f33, %p14; BB0_6: cvta.to.global.u64 %rd9, %rd1; add.s64 %rd11, %rd9, %rd5; st.global.f32 [%rd11], %f35; BB0_7: ret; } ` setPhi_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<15>; .reg .f32 %f<36>; .reg .b32 %r<44>; .reg .b64 %rd<12>; ld.param.u64 %rd1, [setPhi_param_0]; ld.param.u64 %rd2, [setPhi_param_1]; ld.param.u64 %rd3, [setPhi_param_2]; ld.param.u32 %r6, [setPhi_param_3]; ld.param.u32 %r7, [setPhi_param_4]; ld.param.u32 %r8, [setPhi_param_5]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r2, %r7; setp.ge.s32 %p2, %r1, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_7; cvta.to.global.u64 %rd4, %rd3; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd5, %r19, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd2; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f7, [%rd8]; abs.f32 %f1, %f7; ld.global.nc.f32 %f8, [%rd6]; abs.f32 %f2, %f8; setp.eq.f32 %p6, %f1, 0f00000000; setp.eq.f32 %p7, %f2, 0f00000000; and.pred %p8, %p6, %p7; 
mov.b32 %r4, %f7; mov.b32 %r20, %f8; and.b32 %r5, %r20, -2147483648; @%p8 bra BB0_5; bra.uni BB0_2; BB0_5: shr.s32 %r27, %r4, 31; and.b32 %r28, %r27, 1078530011; or.b32 %r29, %r28, %r5; mov.b32 %f35, %r29; bra.uni BB0_6; BB0_2: setp.eq.f32 %p9, %f1, 0f7F800000; setp.eq.f32 %p10, %f2, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra BB0_4; bra.uni BB0_3; BB0_4: shr.s32 %r23, %r4, 31; and.b32 %r24, %r23, 13483017; add.s32 %r25, %r24, 1061752795; or.b32 %r26, %r25, %r5; mov.b32 %f35, %r26; bra.uni BB0_6; BB0_3: max.f32 %f9, %f2, %f1; min.f32 %f10, %f2, %f1; div.rn.f32 %f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f2, %f1; selp.f32 %f29, %f28, %f26, %p12; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; setp.lt.s32 %p13, %r4, 0; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r21, %f32; or.b32 %r22, %r21, %r5; mov.b32 %f33, %r22; add.f32 %f34, %f1, %f2; setp.gtu.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f34, %f33, %p14; BB0_6: cvta.to.global.u64 %rd9, %rd1; add.s64 %rd11, %rd9, %rd5; st.global.f32 [%rd11], %f35; BB0_7: ret; } ` setPhi_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<15>; .reg .f32 %f<36>; .reg .b32 %r<44>; .reg .b64 %rd<12>; ld.param.u64 %rd1, [setPhi_param_0]; ld.param.u64 %rd2, [setPhi_param_1]; ld.param.u64 %rd3, [setPhi_param_2]; ld.param.u32 %r6, 
[setPhi_param_3]; ld.param.u32 %r7, [setPhi_param_4]; ld.param.u32 %r8, [setPhi_param_5]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r2, %r7; setp.ge.s32 %p2, %r1, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_7; cvta.to.global.u64 %rd4, %rd3; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd5, %r19, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd2; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f7, [%rd8]; abs.f32 %f1, %f7; ld.global.nc.f32 %f8, [%rd6]; abs.f32 %f2, %f8; setp.eq.f32 %p6, %f1, 0f00000000; setp.eq.f32 %p7, %f2, 0f00000000; and.pred %p8, %p6, %p7; mov.b32 %r4, %f7; mov.b32 %r20, %f8; and.b32 %r5, %r20, -2147483648; @%p8 bra BB0_5; bra.uni BB0_2; BB0_5: shr.s32 %r27, %r4, 31; and.b32 %r28, %r27, 1078530011; or.b32 %r29, %r28, %r5; mov.b32 %f35, %r29; bra.uni BB0_6; BB0_2: setp.eq.f32 %p9, %f1, 0f7F800000; setp.eq.f32 %p10, %f2, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra BB0_4; bra.uni BB0_3; BB0_4: shr.s32 %r23, %r4, 31; and.b32 %r24, %r23, 13483017; add.s32 %r25, %r24, 1061752795; or.b32 %r26, %r25, %r5; mov.b32 %f35, %r26; bra.uni BB0_6; BB0_3: max.f32 %f9, %f2, %f1; min.f32 %f10, %f2, %f1; div.rn.f32 %f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 
%f28, %f27, %f26; setp.gt.f32 %p12, %f2, %f1; selp.f32 %f29, %f28, %f26, %p12; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; setp.lt.s32 %p13, %r4, 0; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r21, %f32; or.b32 %r22, %r21, %r5; mov.b32 %f33, %r22; add.f32 %f34, %f1, %f2; setp.gtu.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f34, %f33, %p14; BB0_6: cvta.to.global.u64 %rd9, %rd1; add.s64 %rd11, %rd9, %rd5; st.global.f32 [%rd11], %f35; BB0_7: ret; } ` setPhi_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<15>; .reg .f32 %f<36>; .reg .b32 %r<44>; .reg .b64 %rd<12>; ld.param.u64 %rd1, [setPhi_param_0]; ld.param.u64 %rd2, [setPhi_param_1]; ld.param.u64 %rd3, [setPhi_param_2]; ld.param.u32 %r6, [setPhi_param_3]; ld.param.u32 %r7, [setPhi_param_4]; ld.param.u32 %r8, [setPhi_param_5]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r2, %r7; setp.ge.s32 %p2, %r1, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_7; cvta.to.global.u64 %rd4, %rd3; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd5, %r19, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd2; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f7, [%rd8]; abs.f32 %f1, %f7; ld.global.nc.f32 %f8, [%rd6]; abs.f32 %f2, %f8; setp.eq.f32 %p6, %f1, 0f00000000; setp.eq.f32 %p7, %f2, 0f00000000; and.pred %p8, %p6, %p7; mov.b32 %r4, %f7; mov.b32 %r20, %f8; and.b32 %r5, %r20, -2147483648; @%p8 bra BB0_5; bra.uni BB0_2; BB0_5: shr.s32 %r27, %r4, 31; 
and.b32 %r28, %r27, 1078530011; or.b32 %r29, %r28, %r5; mov.b32 %f35, %r29; bra.uni BB0_6; BB0_2: setp.eq.f32 %p9, %f1, 0f7F800000; setp.eq.f32 %p10, %f2, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra BB0_4; bra.uni BB0_3; BB0_4: shr.s32 %r23, %r4, 31; and.b32 %r24, %r23, 13483017; add.s32 %r25, %r24, 1061752795; or.b32 %r26, %r25, %r5; mov.b32 %f35, %r26; bra.uni BB0_6; BB0_3: max.f32 %f9, %f2, %f1; min.f32 %f10, %f2, %f1; div.rn.f32 %f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f2, %f1; selp.f32 %f29, %f28, %f26, %p12; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; setp.lt.s32 %p13, %r4, 0; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r21, %f32; or.b32 %r22, %r21, %r5; mov.b32 %f33, %r22; add.f32 %f34, %f1, %f2; setp.gtu.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f34, %f33, %p14; BB0_6: cvta.to.global.u64 %rd9, %rd1; add.s64 %rd11, %rd9, %rd5; st.global.f32 [%rd11], %f35; BB0_7: ret; } ` setPhi_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<15>; .reg .f32 %f<36>; .reg .b32 %r<44>; .reg .b64 %rd<12>; ld.param.u64 %rd1, [setPhi_param_0]; ld.param.u64 %rd2, [setPhi_param_1]; ld.param.u64 %rd3, [setPhi_param_2]; ld.param.u32 %r6, [setPhi_param_3]; ld.param.u32 %r7, [setPhi_param_4]; ld.param.u32 %r8, [setPhi_param_5]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, 
%ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r2, %r7; setp.ge.s32 %p2, %r1, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_7; cvta.to.global.u64 %rd4, %rd3; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd5, %r19, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd2; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f7, [%rd8]; abs.f32 %f1, %f7; ld.global.nc.f32 %f8, [%rd6]; abs.f32 %f2, %f8; setp.eq.f32 %p6, %f1, 0f00000000; setp.eq.f32 %p7, %f2, 0f00000000; and.pred %p8, %p6, %p7; mov.b32 %r4, %f7; mov.b32 %r20, %f8; and.b32 %r5, %r20, -2147483648; @%p8 bra BB0_5; bra.uni BB0_2; BB0_5: shr.s32 %r27, %r4, 31; and.b32 %r28, %r27, 1078530011; or.b32 %r29, %r28, %r5; mov.b32 %f35, %r29; bra.uni BB0_6; BB0_2: setp.eq.f32 %p9, %f1, 0f7F800000; setp.eq.f32 %p10, %f2, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra BB0_4; bra.uni BB0_3; BB0_4: shr.s32 %r23, %r4, 31; and.b32 %r24, %r23, 13483017; add.s32 %r25, %r24, 1061752795; or.b32 %r26, %r25, %r5; mov.b32 %f35, %r26; bra.uni BB0_6; BB0_3: max.f32 %f9, %f2, %f1; min.f32 %f10, %f2, %f1; div.rn.f32 %f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f2, %f1; selp.f32 %f29, %f28, %f26, %p12; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, 
%f29; setp.lt.s32 %p13, %r4, 0; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r21, %f32; or.b32 %r22, %r21, %r5; mov.b32 %f33, %r22; add.f32 %f34, %f1, %f2; setp.gtu.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f34, %f33, %p14; BB0_6: cvta.to.global.u64 %rd9, %rd1; add.s64 %rd11, %rd9, %rd5; st.global.f32 [%rd11], %f35; BB0_7: ret; } ` setPhi_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<15>; .reg .f32 %f<36>; .reg .b32 %r<44>; .reg .b64 %rd<12>; ld.param.u64 %rd1, [setPhi_param_0]; ld.param.u64 %rd2, [setPhi_param_1]; ld.param.u64 %rd3, [setPhi_param_2]; ld.param.u32 %r6, [setPhi_param_3]; ld.param.u32 %r7, [setPhi_param_4]; ld.param.u32 %r8, [setPhi_param_5]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r2, %r7; setp.ge.s32 %p2, %r1, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_7; cvta.to.global.u64 %rd4, %rd3; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd5, %r19, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd2; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f7, [%rd8]; abs.f32 %f1, %f7; ld.global.nc.f32 %f8, [%rd6]; abs.f32 %f2, %f8; setp.eq.f32 %p6, %f1, 0f00000000; setp.eq.f32 %p7, %f2, 0f00000000; and.pred %p8, %p6, %p7; mov.b32 %r4, %f7; mov.b32 %r20, %f8; and.b32 %r5, %r20, -2147483648; @%p8 bra BB0_5; bra.uni BB0_2; BB0_5: shr.s32 %r27, %r4, 31; and.b32 %r28, %r27, 1078530011; or.b32 %r29, %r28, %r5; mov.b32 %f35, %r29; bra.uni BB0_6; BB0_2: setp.eq.f32 %p9, %f1, 
0f7F800000; setp.eq.f32 %p10, %f2, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra BB0_4; bra.uni BB0_3; BB0_4: shr.s32 %r23, %r4, 31; and.b32 %r24, %r23, 13483017; add.s32 %r25, %r24, 1061752795; or.b32 %r26, %r25, %r5; mov.b32 %f35, %r26; bra.uni BB0_6; BB0_3: max.f32 %f9, %f2, %f1; min.f32 %f10, %f2, %f1; div.rn.f32 %f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f2, %f1; selp.f32 %f29, %f28, %f26, %p12; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; setp.lt.s32 %p13, %r4, 0; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r21, %f32; or.b32 %r22, %r21, %r5; mov.b32 %f33, %r22; add.f32 %f34, %f1, %f2; setp.gtu.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f34, %f33, %p14; BB0_6: cvta.to.global.u64 %rd9, %rd1; add.s64 %rd11, %rd9, %rd5; st.global.f32 [%rd11], %f35; BB0_7: ret; } ` setPhi_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl setPhi .visible .entry setPhi( .param .u64 setPhi_param_0, .param .u64 setPhi_param_1, .param .u64 setPhi_param_2, .param .u32 setPhi_param_3, .param .u32 setPhi_param_4, .param .u32 setPhi_param_5 ) { .reg .pred %p<15>; .reg .f32 %f<36>; .reg .b32 %r<44>; .reg .b64 %rd<12>; ld.param.u64 %rd1, [setPhi_param_0]; ld.param.u64 %rd2, [setPhi_param_1]; ld.param.u64 %rd3, [setPhi_param_2]; ld.param.u32 %r6, [setPhi_param_3]; ld.param.u32 %r7, [setPhi_param_4]; ld.param.u32 %r8, [setPhi_param_5]; mov.u32 %r9, %ctaid.x; mov.u32 %r10, %ntid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r10, %r9, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, 
%tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r2, %r7; setp.ge.s32 %p2, %r1, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_7; cvta.to.global.u64 %rd4, %rd3; mad.lo.s32 %r18, %r3, %r7, %r2; mad.lo.s32 %r19, %r18, %r6, %r1; mul.wide.s32 %rd5, %r19, 4; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd2; add.s64 %rd8, %rd7, %rd5; ld.global.nc.f32 %f7, [%rd8]; abs.f32 %f1, %f7; ld.global.nc.f32 %f8, [%rd6]; abs.f32 %f2, %f8; setp.eq.f32 %p6, %f1, 0f00000000; setp.eq.f32 %p7, %f2, 0f00000000; and.pred %p8, %p6, %p7; mov.b32 %r4, %f7; mov.b32 %r20, %f8; and.b32 %r5, %r20, -2147483648; @%p8 bra BB0_5; bra.uni BB0_2; BB0_5: shr.s32 %r27, %r4, 31; and.b32 %r28, %r27, 1078530011; or.b32 %r29, %r28, %r5; mov.b32 %f35, %r29; bra.uni BB0_6; BB0_2: setp.eq.f32 %p9, %f1, 0f7F800000; setp.eq.f32 %p10, %f2, 0f7F800000; and.pred %p11, %p9, %p10; @%p11 bra BB0_4; bra.uni BB0_3; BB0_4: shr.s32 %r23, %r4, 31; and.b32 %r24, %r23, 13483017; add.s32 %r25, %r24, 1061752795; or.b32 %r26, %r25, %r5; mov.b32 %f35, %r26; bra.uni BB0_6; BB0_3: max.f32 %f9, %f2, %f1; min.f32 %f10, %f2, %f1; div.rn.f32 %f11, %f10, %f9; mul.rn.f32 %f12, %f11, %f11; mov.f32 %f13, 0fC0B59883; mov.f32 %f14, 0fBF52C7EA; fma.rn.f32 %f15, %f12, %f14, %f13; mov.f32 %f16, 0fC0D21907; fma.rn.f32 %f17, %f15, %f12, %f16; mul.f32 %f18, %f12, %f17; mul.f32 %f19, %f11, %f18; add.f32 %f20, %f12, 0f41355DC0; mov.f32 %f21, 0f41E6BD60; fma.rn.f32 %f22, %f20, %f12, %f21; mov.f32 %f23, 0f419D92C8; fma.rn.f32 %f24, %f22, %f12, %f23; rcp.rn.f32 %f25, %f24; fma.rn.f32 %f26, %f19, %f25, %f11; mov.f32 %f27, 0f3FC90FDB; sub.f32 %f28, %f27, %f26; setp.gt.f32 %p12, %f2, %f1; selp.f32 %f29, %f28, %f26, %p12; mov.f32 %f30, 0f40490FDB; sub.f32 %f31, %f30, %f29; setp.lt.s32 %p13, %r4, 0; selp.f32 %f32, %f31, %f29, %p13; mov.b32 %r21, %f32; or.b32 %r22, %r21, %r5; mov.b32 %f33, %r22; 
add.f32 %f34, %f1, %f2; setp.gtu.f32 %p14, %f34, 0f7F800000; selp.f32 %f35, %f34, %f33, %p14; BB0_6: cvta.to.global.u64 %rd9, %rd1; add.s64 %rd11, %rd9, %rd5; st.global.f32 [%rd11], %f35; BB0_7: ret; } ` ) mumax3-3.10/cuda/reduce.go000066400000000000000000000054471371432437400154030ustar00rootroot00000000000000package cuda

import (
	"math"
	"unsafe"

	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/data"
	"github.com/mumax/3/util"
)

//#include "reduce.h"
import "C"

// REDUCE_BLOCKSIZE is the block size for reduce kernels.
// It is taken from the REDUCE_BLOCKSIZE macro in reduce.h via cgo,
// so the Go launch configuration and the CUDA kernels agree on it.
const REDUCE_BLOCKSIZE = C.REDUCE_BLOCKSIZE

// Sum returns the sum of all elements of in.
// in must have exactly one component (checked with util.Argument).
func Sum(in *data.Slice) float32 {
	util.Argument(in.NComp() == 1)
	out := reduceBuf(0) // result buffer, initialized to 0
	k_reducesum_async(in.DevPtr(0), out, 0, in.Len(), reducecfg)
	return copyback(out)
}

// Dot returns the dot product of a and b, accumulated over all components.
// a and b must have the same number of components and the same length
// (both checked with util.Argument).
func Dot(a, b *data.Slice) float32 {
	nComp := a.NComp()
	util.Argument(nComp == b.NComp())
	// The kernel is launched with a.Len() elements for both operands, so a
	// shorter b would be read out of bounds on the device. MaxVecDiff guards
	// against this in the same way.
	util.Argument(a.Len() == b.Len())
	out := reduceBuf(0)
	// not async over components
	for c := 0; c < nComp; c++ {
		k_reducedot_async(a.DevPtr(c), b.DevPtr(c), out, 0, a.Len(), reducecfg) // all components add to out
	}
	return copyback(out)
}

// MaxAbs returns the maximum of the absolute values of all elements of in.
// in must have exactly one component (checked with util.Argument).
func MaxAbs(in *data.Slice) float32 {
	util.Argument(in.NComp() == 1)
	out := reduceBuf(0) // result buffer, initialized to 0
	k_reducemaxabs_async(in.DevPtr(0), out, 0, in.Len(), reducecfg)
	return copyback(out)
}

// Maximum of the norms of all vectors (x[i], y[i], z[i]).
// max_i sqrt( x[i]*x[i] + y[i]*y[i] + z[i]*z[i] ) func MaxVecNorm(v *data.Slice) float64 { out := reduceBuf(0) k_reducemaxvecnorm2_async(v.DevPtr(0), v.DevPtr(1), v.DevPtr(2), out, 0, v.Len(), reducecfg) return math.Sqrt(float64(copyback(out))) } // Maximum of the norms of the difference between all vectors (x1,y1,z1) and (x2,y2,z2) // (dx, dy, dz) = (x1, y1, z1) - (x2, y2, z2) // max_i sqrt( dx[i]*dx[i] + dy[i]*dy[i] + dz[i]*dz[i] ) func MaxVecDiff(x, y *data.Slice) float64 { util.Argument(x.Len() == y.Len()) out := reduceBuf(0) k_reducemaxvecdiff2_async(x.DevPtr(0), x.DevPtr(1), x.DevPtr(2), y.DevPtr(0), y.DevPtr(1), y.DevPtr(2), out, 0, x.Len(), reducecfg) return math.Sqrt(float64(copyback(out))) } var reduceBuffers chan unsafe.Pointer // pool of 1-float CUDA buffers for reduce // return a 1-float CUDA reduction buffer from a pool // initialized to initVal func reduceBuf(initVal float32) unsafe.Pointer { if reduceBuffers == nil { initReduceBuf() } buf := <-reduceBuffers cu.MemsetD32Async(cu.DevicePtr(uintptr(buf)), math.Float32bits(initVal), 1, stream0) return buf } // copy back single float result from GPU and recycle buffer func copyback(buf unsafe.Pointer) float32 { var result float32 MemCpyDtoH(unsafe.Pointer(&result), buf, cu.SIZEOF_FLOAT32) reduceBuffers <- buf return result } // initialize pool of 1-float CUDA reduction buffers func initReduceBuf() { const N = 128 reduceBuffers = make(chan unsafe.Pointer, N) for i := 0; i < N; i++ { reduceBuffers <- MemAlloc(1 * cu.SIZEOF_FLOAT32) } } // launch configuration for reduce kernels // 8 is typ. number of multiprocessors. // could be improved but takes hardly ~1% of execution time var reducecfg = &config{Grid: cu.Dim3{X: 8, Y: 1, Z: 1}, Block: cu.Dim3{X: REDUCE_BLOCKSIZE, Y: 1, Z: 1}} mumax3-3.10/cuda/reduce.h000066400000000000000000000044441371432437400152210ustar00rootroot00000000000000#ifndef _REDUCE_H_ #define _REDUCE_H_ // Block size for reduce kernels. 
#define REDUCE_BLOCKSIZE 512 // This macro expands to a reduce kernel with arbitrary reduce operation. // Ugly, perhaps, but arguably nicer than some 1000+ line C++ template. // load(i): loads element i, possibly pre-processing the data // op(a, b): reduce operation. e.g. sum // atomicOp(a, b): atomic reduce operation in global mem. #define reduce(load, op, atomicOp) \ __shared__ float sdata[REDUCE_BLOCKSIZE]; \ int tid = threadIdx.x; \ int i = blockIdx.x * blockDim.x + threadIdx.x; \ \ float mine = initVal; \ int stride = gridDim.x * blockDim.x; \ while (i < n) { \ mine = op(mine, load(i)); \ i += stride; \ } \ sdata[tid] = mine; \ __syncthreads(); \ \ for (unsigned int s=blockDim.x/2; s>32; s>>=1) { \ if (tid < s){ \ sdata[tid] = op(sdata[tid], sdata[tid + s]);\ } \ __syncthreads(); \ } \ \ if (tid < 32) { \ volatile float* smem = sdata; \ smem[tid] = op(smem[tid], smem[tid + 32]); \ smem[tid] = op(smem[tid], smem[tid + 16]); \ smem[tid] = op(smem[tid], smem[tid + 8]); \ smem[tid] = op(smem[tid], smem[tid + 4]); \ smem[tid] = op(smem[tid], smem[tid + 2]); \ smem[tid] = op(smem[tid], smem[tid + 1]); \ } \ \ if (tid == 0) { atomicOp(dst, sdata[0]); } \ // Based on "Optimizing parallel reduction in CUDA" by Mark Harris. 
#endif mumax3-3.10/cuda/reduce_test.go000066400000000000000000000031411371432437400164270ustar00rootroot00000000000000package cuda import ( "testing" "unsafe" "github.com/mumax/3/data" "github.com/mumax/3/util" ) // test input data var in1, in2, in3 *data.Slice func initTest() { if in1 != nil { return } { inh1 := make([]float32, 1000) for i := range inh1 { inh1[i] = float32(i) } in1 = toGPU(inh1) } { inh2 := make([]float32, 100000) for i := range inh2 { inh2[i] = -float32(i) / 100 } in2 = toGPU(inh2) } } func toGPU(list []float32) *data.Slice { mesh := [3]int{1, 1, len(list)} h := sliceFromList([][]float32{list}, mesh) d := NewSlice(1, mesh) data.Copy(d, h) return d } func TestReduceSum(t *testing.T) { initTest() result := Sum(in1) if result != 499500 { t.Error("got:", result) } } func TestReduceDot(t *testing.T) { initTest() // test for 1 comp a := toGPU([]float32{1, 2, 3, 4, 5}) b := toGPU([]float32{5, 4, 3, -1, 2}) result := Dot(a, b) if result != 5+8+9-4+10 { t.Error("got:", result) } // test for 3 comp const N = 32 mesh := [3]int{1, 1, N} c := NewSlice(3, mesh) d := NewSlice(3, mesh) Memset(c, 1, 2, 3) Memset(d, 4, 5, 6) result = Dot(c, d) if result != N*(4+10+18) { t.Error("got:", result) } } func TestReduceMaxAbs(t *testing.T) { result := MaxAbs(in1) if result != 999 { t.Error("got:", result) } result = MaxAbs(in2) if result != 999.99 { t.Error("got:", result) } } func sliceFromList(arr [][]float32, size [3]int) *data.Slice { ptrs := make([]unsafe.Pointer, len(arr)) for i := range ptrs { util.Argument(len(arr[i]) == prod(size)) ptrs[i] = unsafe.Pointer(&arr[i][0]) } return data.SliceFromPtrs(size, data.CPUMemory, ptrs) } mumax3-3.10/cuda/reducedot.cu000066400000000000000000000004051371432437400161010ustar00rootroot00000000000000#include "reduce.h" #include "sum.h" #define load_prod(i) (x1[i] * x2[i]) extern "C" __global__ void reducedot(float* __restrict__ x1, float* __restrict__ x2, float*__restrict__ dst, float initVal, int n) { reduce(load_prod, sum, 
atomicAdd)
}
mumax3-3.10/cuda/reducedot_wrapper.go000066400000000000000000001146611371432437400176510ustar00rootroot00000000000000package cuda

/*
 THIS FILE IS AUTO-GENERATED BY CUDA2GO.
 EDITING IS FUTILE.
*/

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// CUDA handle for reducedot kernel.
// Stays 0 until the first call of k_reducedot_async loads the kernel.
var reducedot_code cu.Function

// Stores the arguments for reducedot kernel invocation.
// The arg_* fields hold the values of one launch; argptr holds pointers to
// those fields (set up once in init) and is what gets passed to the driver.
// The embedded mutex guards the struct, since a single instance is shared
// by all callers.
type reducedot_args_t struct {
	arg_x1      unsafe.Pointer
	arg_x2      unsafe.Pointer
	arg_dst     unsafe.Pointer
	arg_initVal float32
	// NOTE(review): the kernel declares this parameter as a 32-bit int
	// (.param .u32 in the PTX) while Go's int may be 64 bits wide; passing a
	// pointer to it presumably relies on little-endian layout -- confirm.
	arg_n  int
	argptr [5]unsafe.Pointer
	sync.Mutex
}

// Stores the arguments for reducedot kernel invocation
// (the single shared instance used by k_reducedot_async).
var reducedot_args reducedot_args_t

func init() {
	// CUDA driver kernel call wants pointers to arguments, set them up once.
	reducedot_args.argptr[0] = unsafe.Pointer(&reducedot_args.arg_x1)
	reducedot_args.argptr[1] = unsafe.Pointer(&reducedot_args.arg_x2)
	reducedot_args.argptr[2] = unsafe.Pointer(&reducedot_args.arg_dst)
	reducedot_args.argptr[3] = unsafe.Pointer(&reducedot_args.arg_initVal)
	reducedot_args.argptr[4] = unsafe.Pointer(&reducedot_args.arg_n)
}

// Wrapper for reducedot CUDA kernel, asynchronous.
// x1, x2, dst, initVal and n are forwarded as the kernel's five parameters;
// cfg supplies the grid and block dimensions. The kernel is launched on
// stream0. When Synchronous is set (debug), the call is bracketed by Sync()
// and timed under the name "reducedot".
func k_reducedot_async(x1 unsafe.Pointer, x2 unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("reducedot")
	}

	// The argument struct is shared, so it stays locked until this function
	// returns.
	reducedot_args.Lock()
	defer reducedot_args.Unlock()

	// Load the kernel on first use.
	if reducedot_code == 0 {
		reducedot_code = fatbinLoad(reducedot_map, "reducedot")
	}

	reducedot_args.arg_x1 = x1
	reducedot_args.arg_x2 = x2
	reducedot_args.arg_dst = dst
	reducedot_args.arg_initVal = initVal
	reducedot_args.arg_n = n

	args := reducedot_args.argptr[:]
	cu.LaunchKernel(reducedot_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args)

	if Synchronous { // debug
		Sync()
		timer.Stop("reducedot")
	}
}

// maps compute capability on PTX code for reducedot kernel.
var reducedot_map = map[int]string{0: "", 30: reducedot_ptx_30, 32: reducedot_ptx_32, 35: reducedot_ptx_35, 37: reducedot_ptx_37, 50: reducedot_ptx_50, 52: reducedot_ptx_52, 53: reducedot_ptx_53, 60: reducedot_ptx_60, 61: reducedot_ptx_61, 62: reducedot_ptx_62, 70: reducedot_ptx_70, 72: reducedot_ptx_72, 75: reducedot_ptx_75} // reducedot PTX code for various compute capabilities. const ( reducedot_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<21>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd4, [reducedot_param_0]; ld.param.u64 %rd5, [reducedot_param_1]; ld.param.u64 %rd3, [reducedot_param_2]; ld.param.f32 %f31, [reducedot_param_3]; ld.param.u32 %r10, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd4; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r20; setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd6, %r19, 4; add.s64 %rd7, %rd2, %rd6; add.s64 %rd8, %rd1, %rd6; ld.global.f32 %f5, [%rd8]; ld.global.f32 %f6, [%rd7]; fma.rn.f32 %f31, %f6, %f5, %f31; add.s32 %r19, %r19, %r4; setp.lt.s32 %p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ9reducedotE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f8, [%r18]; add.f32 %f9, %f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r20, 131; mov.u32 %r20, %r9; @%p5 
bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; add.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; add.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; add.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; ld.volatile.shared.f32 %f19, [%r7+16]; ld.volatile.shared.f32 %f20, [%r7]; add.f32 %f21, %f20, %f19; st.volatile.shared.f32 [%r7], %f21; ld.volatile.shared.f32 %f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; add.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; add.f32 %f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd9, %rd3; atom.global.add.f32 %f29, [%rd9], %f28; BB0_10: ret; } ` reducedot_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<21>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd4, [reducedot_param_0]; ld.param.u64 %rd5, [reducedot_param_1]; ld.param.u64 %rd3, [reducedot_param_2]; ld.param.f32 %f31, [reducedot_param_3]; ld.param.u32 %r10, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd4; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r20; setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd6, %r19, 4; add.s64 %rd7, %rd2, %rd6; 
add.s64 %rd8, %rd1, %rd6; ld.global.nc.f32 %f5, [%rd8]; ld.global.nc.f32 %f6, [%rd7]; fma.rn.f32 %f31, %f6, %f5, %f31; add.s32 %r19, %r19, %r4; setp.lt.s32 %p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ9reducedotE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f8, [%r18]; add.f32 %f9, %f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r20, 131; mov.u32 %r20, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; add.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; add.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; add.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; ld.volatile.shared.f32 %f19, [%r7+16]; ld.volatile.shared.f32 %f20, [%r7]; add.f32 %f21, %f20, %f19; st.volatile.shared.f32 [%r7], %f21; ld.volatile.shared.f32 %f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; add.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; add.f32 %f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd9, %rd3; atom.global.add.f32 %f29, [%rd9], %f28; BB0_10: ret; } ` reducedot_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 
reducedot_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<21>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd4, [reducedot_param_0]; ld.param.u64 %rd5, [reducedot_param_1]; ld.param.u64 %rd3, [reducedot_param_2]; ld.param.f32 %f31, [reducedot_param_3]; ld.param.u32 %r10, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd4; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r20; setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd6, %r19, 4; add.s64 %rd7, %rd2, %rd6; add.s64 %rd8, %rd1, %rd6; ld.global.nc.f32 %f5, [%rd8]; ld.global.nc.f32 %f6, [%rd7]; fma.rn.f32 %f31, %f6, %f5, %f31; add.s32 %r19, %r19, %r4; setp.lt.s32 %p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ9reducedotE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f8, [%r18]; add.f32 %f9, %f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r20, 131; mov.u32 %r20, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; add.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; add.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; add.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; ld.volatile.shared.f32 %f19, [%r7+16]; ld.volatile.shared.f32 %f20, [%r7]; add.f32 %f21, %f20, %f19; st.volatile.shared.f32 [%r7], %f21; ld.volatile.shared.f32 
%f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; add.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; add.f32 %f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd9, %rd3; atom.global.add.f32 %f29, [%rd9], %f28; BB0_10: ret; } ` reducedot_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<21>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd4, [reducedot_param_0]; ld.param.u64 %rd5, [reducedot_param_1]; ld.param.u64 %rd3, [reducedot_param_2]; ld.param.f32 %f31, [reducedot_param_3]; ld.param.u32 %r10, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd4; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r20; setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd6, %r19, 4; add.s64 %rd7, %rd2, %rd6; add.s64 %rd8, %rd1, %rd6; ld.global.nc.f32 %f5, [%rd8]; ld.global.nc.f32 %f6, [%rd7]; fma.rn.f32 %f31, %f6, %f5, %f31; add.s32 %r19, %r19, %r4; setp.lt.s32 %p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ9reducedotE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f8, [%r18]; add.f32 %f9, %f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; 
setp.gt.u32 %p5, %r20, 131; mov.u32 %r20, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; add.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; add.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; add.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; ld.volatile.shared.f32 %f19, [%r7+16]; ld.volatile.shared.f32 %f20, [%r7]; add.f32 %f21, %f20, %f19; st.volatile.shared.f32 [%r7], %f21; ld.volatile.shared.f32 %f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; add.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; add.f32 %f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd9, %rd3; atom.global.add.f32 %f29, [%rd9], %f28; BB0_10: ret; } ` reducedot_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<21>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd4, [reducedot_param_0]; ld.param.u64 %rd5, [reducedot_param_1]; ld.param.u64 %rd3, [reducedot_param_2]; ld.param.f32 %f31, [reducedot_param_3]; ld.param.u32 %r10, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd4; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r20; setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: 
mul.wide.s32 %rd6, %r19, 4; add.s64 %rd7, %rd2, %rd6; add.s64 %rd8, %rd1, %rd6; ld.global.nc.f32 %f5, [%rd8]; ld.global.nc.f32 %f6, [%rd7]; fma.rn.f32 %f31, %f6, %f5, %f31; add.s32 %r19, %r19, %r4; setp.lt.s32 %p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ9reducedotE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f8, [%r18]; add.f32 %f9, %f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r20, 131; mov.u32 %r20, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; add.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; add.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; add.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; ld.volatile.shared.f32 %f19, [%r7+16]; ld.volatile.shared.f32 %f20, [%r7]; add.f32 %f21, %f20, %f19; st.volatile.shared.f32 [%r7], %f21; ld.volatile.shared.f32 %f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; add.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; add.f32 %f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd9, %rd3; atom.global.add.f32 %f29, [%rd9], %f28; BB0_10: ret; } ` reducedot_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, 
.param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<21>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd4, [reducedot_param_0]; ld.param.u64 %rd5, [reducedot_param_1]; ld.param.u64 %rd3, [reducedot_param_2]; ld.param.f32 %f31, [reducedot_param_3]; ld.param.u32 %r10, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd4; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r20; setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd6, %r19, 4; add.s64 %rd7, %rd2, %rd6; add.s64 %rd8, %rd1, %rd6; ld.global.nc.f32 %f5, [%rd8]; ld.global.nc.f32 %f6, [%rd7]; fma.rn.f32 %f31, %f6, %f5, %f31; add.s32 %r19, %r19, %r4; setp.lt.s32 %p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ9reducedotE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f8, [%r18]; add.f32 %f9, %f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r20, 131; mov.u32 %r20, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; add.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; add.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; add.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; ld.volatile.shared.f32 %f19, [%r7+16]; ld.volatile.shared.f32 %f20, [%r7]; add.f32 %f21, %f20, %f19; 
st.volatile.shared.f32 [%r7], %f21; ld.volatile.shared.f32 %f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; add.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; add.f32 %f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd9, %rd3; atom.global.add.f32 %f29, [%rd9], %f28; BB0_10: ret; } ` reducedot_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<21>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd4, [reducedot_param_0]; ld.param.u64 %rd5, [reducedot_param_1]; ld.param.u64 %rd3, [reducedot_param_2]; ld.param.f32 %f31, [reducedot_param_3]; ld.param.u32 %r10, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd4; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r20; setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd6, %r19, 4; add.s64 %rd7, %rd2, %rd6; add.s64 %rd8, %rd1, %rd6; ld.global.nc.f32 %f5, [%rd8]; ld.global.nc.f32 %f6, [%rd7]; fma.rn.f32 %f31, %f6, %f5, %f31; add.s32 %r19, %r19, %r4; setp.lt.s32 %p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ9reducedotE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f8, [%r18]; add.f32 %f9, 
%f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r20, 131; mov.u32 %r20, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; add.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; add.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; add.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; ld.volatile.shared.f32 %f19, [%r7+16]; ld.volatile.shared.f32 %f20, [%r7]; add.f32 %f21, %f20, %f19; st.volatile.shared.f32 [%r7], %f21; ld.volatile.shared.f32 %f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; add.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; add.f32 %f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd9, %rd3; atom.global.add.f32 %f29, [%rd9], %f28; BB0_10: ret; } ` reducedot_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<21>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd4, [reducedot_param_0]; ld.param.u64 %rd5, [reducedot_param_1]; ld.param.u64 %rd3, [reducedot_param_2]; ld.param.f32 %f31, [reducedot_param_3]; ld.param.u32 %r10, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd4; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r20; 
setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd6, %r19, 4; add.s64 %rd7, %rd2, %rd6; add.s64 %rd8, %rd1, %rd6; ld.global.nc.f32 %f5, [%rd8]; ld.global.nc.f32 %f6, [%rd7]; fma.rn.f32 %f31, %f6, %f5, %f31; add.s32 %r19, %r19, %r4; setp.lt.s32 %p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ9reducedotE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f8, [%r18]; add.f32 %f9, %f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r20, 131; mov.u32 %r20, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; add.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; add.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; add.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; ld.volatile.shared.f32 %f19, [%r7+16]; ld.volatile.shared.f32 %f20, [%r7]; add.f32 %f21, %f20, %f19; st.volatile.shared.f32 [%r7], %f21; ld.volatile.shared.f32 %f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; add.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; add.f32 %f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd9, %rd3; atom.global.add.f32 %f29, [%rd9], %f28; BB0_10: ret; } ` reducedot_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param 
.u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<21>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd4, [reducedot_param_0]; ld.param.u64 %rd5, [reducedot_param_1]; ld.param.u64 %rd3, [reducedot_param_2]; ld.param.f32 %f31, [reducedot_param_3]; ld.param.u32 %r10, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd4; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r20; setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd6, %r19, 4; add.s64 %rd7, %rd2, %rd6; add.s64 %rd8, %rd1, %rd6; ld.global.nc.f32 %f5, [%rd8]; ld.global.nc.f32 %f6, [%rd7]; fma.rn.f32 %f31, %f6, %f5, %f31; add.s32 %r19, %r19, %r4; setp.lt.s32 %p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ9reducedotE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f8, [%r18]; add.f32 %f9, %f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r20, 131; mov.u32 %r20, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; add.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; add.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; add.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; ld.volatile.shared.f32 %f19, [%r7+16]; ld.volatile.shared.f32 
%f20, [%r7]; add.f32 %f21, %f20, %f19; st.volatile.shared.f32 [%r7], %f21; ld.volatile.shared.f32 %f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; add.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; add.f32 %f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd9, %rd3; atom.global.add.f32 %f29, [%rd9], %f28; BB0_10: ret; } ` reducedot_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<21>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd4, [reducedot_param_0]; ld.param.u64 %rd5, [reducedot_param_1]; ld.param.u64 %rd3, [reducedot_param_2]; ld.param.f32 %f31, [reducedot_param_3]; ld.param.u32 %r10, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd4; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r20; setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd6, %r19, 4; add.s64 %rd7, %rd2, %rd6; add.s64 %rd8, %rd1, %rd6; ld.global.nc.f32 %f5, [%rd8]; ld.global.nc.f32 %f6, [%rd7]; fma.rn.f32 %f31, %f6, %f5, %f31; add.s32 %r19, %r19, %r4; setp.lt.s32 %p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ9reducedotE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; 
ld.shared.f32 %f8, [%r18]; add.f32 %f9, %f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r20, 131; mov.u32 %r20, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; add.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; add.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; add.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; ld.volatile.shared.f32 %f19, [%r7+16]; ld.volatile.shared.f32 %f20, [%r7]; add.f32 %f21, %f20, %f19; st.volatile.shared.f32 [%r7], %f21; ld.volatile.shared.f32 %f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; add.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; add.f32 %f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd9, %rd3; atom.global.add.f32 %f29, [%rd9], %f28; BB0_10: ret; } ` reducedot_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<21>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd4, [reducedot_param_0]; ld.param.u64 %rd5, [reducedot_param_1]; ld.param.u64 %rd3, [reducedot_param_2]; ld.param.f32 %f31, [reducedot_param_3]; ld.param.u32 %r10, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd4; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, 
%nctaid.x; mul.lo.s32 %r4, %r12, %r20; setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd6, %r19, 4; add.s64 %rd7, %rd2, %rd6; add.s64 %rd8, %rd1, %rd6; ld.global.nc.f32 %f5, [%rd8]; ld.global.nc.f32 %f6, [%rd7]; fma.rn.f32 %f31, %f6, %f5, %f31; add.s32 %r19, %r19, %r4; setp.lt.s32 %p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ9reducedotE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f8, [%r18]; add.f32 %f9, %f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r20, 131; mov.u32 %r20, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; add.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; add.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; add.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; ld.volatile.shared.f32 %f19, [%r7+16]; ld.volatile.shared.f32 %f20, [%r7]; add.f32 %f21, %f20, %f19; st.volatile.shared.f32 [%r7], %f21; ld.volatile.shared.f32 %f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; add.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; add.f32 %f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd9, %rd3; atom.global.add.f32 %f29, [%rd9], %f28; BB0_10: ret; } ` reducedot_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl reducedot .visible .entry reducedot( 
.param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<21>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd4, [reducedot_param_0]; ld.param.u64 %rd5, [reducedot_param_1]; ld.param.u64 %rd3, [reducedot_param_2]; ld.param.f32 %f31, [reducedot_param_3]; ld.param.u32 %r10, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd4; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r20; setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd6, %r19, 4; add.s64 %rd7, %rd2, %rd6; add.s64 %rd8, %rd1, %rd6; ld.global.nc.f32 %f5, [%rd8]; ld.global.nc.f32 %f6, [%rd7]; fma.rn.f32 %f31, %f6, %f5, %f31; add.s32 %r19, %r19, %r4; setp.lt.s32 %p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ9reducedotE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f8, [%r18]; add.f32 %f9, %f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r20, 131; mov.u32 %r20, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; add.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; add.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; add.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; ld.volatile.shared.f32 
%f19, [%r7+16]; ld.volatile.shared.f32 %f20, [%r7]; add.f32 %f21, %f20, %f19; st.volatile.shared.f32 [%r7], %f21; ld.volatile.shared.f32 %f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; add.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; add.f32 %f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd9, %rd3; atom.global.add.f32 %f29, [%rd9], %f28; BB0_10: ret; } ` reducedot_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl reducedot .visible .entry reducedot( .param .u64 reducedot_param_0, .param .u64 reducedot_param_1, .param .u64 reducedot_param_2, .param .f32 reducedot_param_3, .param .u32 reducedot_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<21>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ9reducedotE5sdata[2048]; ld.param.u64 %rd4, [reducedot_param_0]; ld.param.u64 %rd5, [reducedot_param_1]; ld.param.u64 %rd3, [reducedot_param_2]; ld.param.f32 %f31, [reducedot_param_3]; ld.param.u32 %r10, [reducedot_param_4]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd4; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r20; setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd6, %r19, 4; add.s64 %rd7, %rd2, %rd6; add.s64 %rd8, %rd1, %rd6; ld.global.nc.f32 %f5, [%rd8]; ld.global.nc.f32 %f6, [%rd7]; fma.rn.f32 %f31, %f6, %f5, %f31; add.s32 %r19, %r19, %r4; setp.lt.s32 %p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ9reducedotE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, 
%r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f8, [%r18]; add.f32 %f9, %f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r20, 131; mov.u32 %r20, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; add.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; add.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; add.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; ld.volatile.shared.f32 %f19, [%r7+16]; ld.volatile.shared.f32 %f20, [%r7]; add.f32 %f21, %f20, %f19; st.volatile.shared.f32 [%r7], %f21; ld.volatile.shared.f32 %f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; add.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; add.f32 %f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, [_ZZ9reducedotE5sdata]; cvta.to.global.u64 %rd9, %rd3; atom.global.add.f32 %f29, [%rd9], %f28; BB0_10: ret; } ` ) mumax3-3.10/cuda/reducemaxabs.cu000066400000000000000000000003551371432437400165720ustar00rootroot00000000000000#include "reduce.h" #include "atomicf.h" #define load_fabs(i) fabs(src[i]) extern "C" __global__ void reducemaxabs(float* __restrict__ src, float* __restrict__ dst, float initVal, int n) { reduce(load_fabs, fmax, atomicFmaxabs) } mumax3-3.10/cuda/reducemaxabs_wrapper.go000066400000000000000000001131351371432437400203310ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for reducemaxabs kernel var reducemaxabs_code cu.Function // Stores the arguments for reducemaxabs kernel invocation type reducemaxabs_args_t struct { arg_src unsafe.Pointer arg_dst unsafe.Pointer arg_initVal float32 arg_n int argptr [4]unsafe.Pointer sync.Mutex } // Stores the arguments for reducemaxabs kernel invocation var reducemaxabs_args reducemaxabs_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. reducemaxabs_args.argptr[0] = unsafe.Pointer(&reducemaxabs_args.arg_src) reducemaxabs_args.argptr[1] = unsafe.Pointer(&reducemaxabs_args.arg_dst) reducemaxabs_args.argptr[2] = unsafe.Pointer(&reducemaxabs_args.arg_initVal) reducemaxabs_args.argptr[3] = unsafe.Pointer(&reducemaxabs_args.arg_n) } // Wrapper for reducemaxabs CUDA kernel, asynchronous. func k_reducemaxabs_async(src unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) { if Synchronous { // debug Sync() timer.Start("reducemaxabs") } reducemaxabs_args.Lock() defer reducemaxabs_args.Unlock() if reducemaxabs_code == 0 { reducemaxabs_code = fatbinLoad(reducemaxabs_map, "reducemaxabs") } reducemaxabs_args.arg_src = src reducemaxabs_args.arg_dst = dst reducemaxabs_args.arg_initVal = initVal reducemaxabs_args.arg_n = n args := reducemaxabs_args.argptr[:] cu.LaunchKernel(reducemaxabs_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("reducemaxabs") } } // maps compute capability on PTX code for reducemaxabs kernel. 
var reducemaxabs_map = map[int]string{0: "", 30: reducemaxabs_ptx_30, 32: reducemaxabs_ptx_32, 35: reducemaxabs_ptx_35, 37: reducemaxabs_ptx_37, 50: reducemaxabs_ptx_50, 52: reducemaxabs_ptx_52, 53: reducemaxabs_ptx_53, 60: reducemaxabs_ptx_60, 61: reducemaxabs_ptx_61, 62: reducemaxabs_ptx_62, 70: reducemaxabs_ptx_70, 72: reducemaxabs_ptx_72, 75: reducemaxabs_ptx_75} // reducemaxabs PTX code for various compute capabilities. const ( reducemaxabs_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<23>; .reg .b64 %rd<7>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd3, [reducemaxabs_param_0]; ld.param.u64 %rd2, [reducemaxabs_param_1]; ld.param.f32 %f31, [reducemaxabs_param_2]; ld.param.u32 %r10, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r21, 4; add.s64 %rd5, %rd1, %rd4; ld.global.f32 %f5, [%rd5]; abs.f32 %f6, %f5; max.f32 %f31, %f31, %f6; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ12reducemaxabsE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f8, [%r18]; max.f32 %f9, %f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; 
ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; max.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; max.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; max.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; ld.volatile.shared.f32 %f19, [%r7+16]; ld.volatile.shared.f32 %f20, [%r7]; max.f32 %f21, %f20, %f19; st.volatile.shared.f32 [%r7], %f21; ld.volatile.shared.f32 %f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; max.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; max.f32 %f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f29, %f28; mov.b32 %r19, %f29; cvta.to.global.u64 %rd6, %rd2; atom.global.max.s32 %r20, [%rd6], %r19; BB0_10: ret; } ` reducemaxabs_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<23>; .reg .b64 %rd<7>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd3, [reducemaxabs_param_0]; ld.param.u64 %rd2, [reducemaxabs_param_1]; ld.param.f32 %f31, [reducemaxabs_param_2]; ld.param.u32 %r10, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r21, 4; add.s64 %rd5, %rd1, %rd4; ld.global.nc.f32 %f5, [%rd5]; abs.f32 %f6, %f5; max.f32 %f31, %f31, %f6; add.s32 
%r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ12reducemaxabsE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f8, [%r18]; max.f32 %f9, %f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; max.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; max.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; max.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; ld.volatile.shared.f32 %f19, [%r7+16]; ld.volatile.shared.f32 %f20, [%r7]; max.f32 %f21, %f20, %f19; st.volatile.shared.f32 [%r7], %f21; ld.volatile.shared.f32 %f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; max.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; max.f32 %f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f29, %f28; mov.b32 %r19, %f29; cvta.to.global.u64 %rd6, %rd2; atom.global.max.s32 %r20, [%rd6], %r19; BB0_10: ret; } ` reducemaxabs_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<23>; .reg .b64 %rd<7>; // 
demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd3, [reducemaxabs_param_0]; ld.param.u64 %rd2, [reducemaxabs_param_1]; ld.param.f32 %f31, [reducemaxabs_param_2]; ld.param.u32 %r10, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r21, 4; add.s64 %rd5, %rd1, %rd4; ld.global.nc.f32 %f5, [%rd5]; abs.f32 %f6, %f5; max.f32 %f31, %f31, %f6; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ12reducemaxabsE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f8, [%r18]; max.f32 %f9, %f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; max.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; max.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; max.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; ld.volatile.shared.f32 %f19, [%r7+16]; ld.volatile.shared.f32 %f20, [%r7]; max.f32 %f21, %f20, %f19; st.volatile.shared.f32 [%r7], %f21; ld.volatile.shared.f32 %f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; max.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; max.f32 
%f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f29, %f28; mov.b32 %r19, %f29; cvta.to.global.u64 %rd6, %rd2; atom.global.max.s32 %r20, [%rd6], %r19; BB0_10: ret; } ` reducemaxabs_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<23>; .reg .b64 %rd<7>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd3, [reducemaxabs_param_0]; ld.param.u64 %rd2, [reducemaxabs_param_1]; ld.param.f32 %f31, [reducemaxabs_param_2]; ld.param.u32 %r10, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r21, 4; add.s64 %rd5, %rd1, %rd4; ld.global.nc.f32 %f5, [%rd5]; abs.f32 %f6, %f5; max.f32 %f31, %f31, %f6; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ12reducemaxabsE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f8, [%r18]; max.f32 %f9, %f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; max.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; 
ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; max.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; max.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; ld.volatile.shared.f32 %f19, [%r7+16]; ld.volatile.shared.f32 %f20, [%r7]; max.f32 %f21, %f20, %f19; st.volatile.shared.f32 [%r7], %f21; ld.volatile.shared.f32 %f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; max.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; max.f32 %f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f29, %f28; mov.b32 %r19, %f29; cvta.to.global.u64 %rd6, %rd2; atom.global.max.s32 %r20, [%rd6], %r19; BB0_10: ret; } ` reducemaxabs_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<23>; .reg .b64 %rd<7>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd3, [reducemaxabs_param_0]; ld.param.u64 %rd2, [reducemaxabs_param_1]; ld.param.f32 %f31, [reducemaxabs_param_2]; ld.param.u32 %r10, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r21, 4; add.s64 %rd5, %rd1, %rd4; ld.global.nc.f32 %f5, [%rd5]; abs.f32 %f6, %f5; max.f32 %f31, %f31, %f6; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ12reducemaxabsE5sdata; add.s32 
%r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f8, [%r18]; max.f32 %f9, %f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; max.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; max.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; max.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; ld.volatile.shared.f32 %f19, [%r7+16]; ld.volatile.shared.f32 %f20, [%r7]; max.f32 %f21, %f20, %f19; st.volatile.shared.f32 [%r7], %f21; ld.volatile.shared.f32 %f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; max.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; max.f32 %f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f29, %f28; mov.b32 %r19, %f29; cvta.to.global.u64 %rd6, %rd2; atom.global.max.s32 %r20, [%rd6], %r19; BB0_10: ret; } ` reducemaxabs_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<23>; .reg .b64 %rd<7>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd3, [reducemaxabs_param_0]; ld.param.u64 %rd2, 
[reducemaxabs_param_1]; ld.param.f32 %f31, [reducemaxabs_param_2]; ld.param.u32 %r10, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r21, 4; add.s64 %rd5, %rd1, %rd4; ld.global.nc.f32 %f5, [%rd5]; abs.f32 %f6, %f5; max.f32 %f31, %f31, %f6; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ12reducemaxabsE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f8, [%r18]; max.f32 %f9, %f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; max.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; max.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; max.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; ld.volatile.shared.f32 %f19, [%r7+16]; ld.volatile.shared.f32 %f20, [%r7]; max.f32 %f21, %f20, %f19; st.volatile.shared.f32 [%r7], %f21; ld.volatile.shared.f32 %f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; max.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; max.f32 %f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, 
[_ZZ12reducemaxabsE5sdata]; abs.f32 %f29, %f28; mov.b32 %r19, %f29; cvta.to.global.u64 %rd6, %rd2; atom.global.max.s32 %r20, [%rd6], %r19; BB0_10: ret; } ` reducemaxabs_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<23>; .reg .b64 %rd<7>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd3, [reducemaxabs_param_0]; ld.param.u64 %rd2, [reducemaxabs_param_1]; ld.param.f32 %f31, [reducemaxabs_param_2]; ld.param.u32 %r10, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r21, 4; add.s64 %rd5, %rd1, %rd4; ld.global.nc.f32 %f5, [%rd5]; abs.f32 %f6, %f5; max.f32 %f31, %f31, %f6; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ12reducemaxabsE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f8, [%r18]; max.f32 %f9, %f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; max.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; max.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; 
ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; max.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; ld.volatile.shared.f32 %f19, [%r7+16]; ld.volatile.shared.f32 %f20, [%r7]; max.f32 %f21, %f20, %f19; st.volatile.shared.f32 [%r7], %f21; ld.volatile.shared.f32 %f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; max.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; max.f32 %f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f29, %f28; mov.b32 %r19, %f29; cvta.to.global.u64 %rd6, %rd2; atom.global.max.s32 %r20, [%rd6], %r19; BB0_10: ret; } ` reducemaxabs_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<23>; .reg .b64 %rd<7>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd3, [reducemaxabs_param_0]; ld.param.u64 %rd2, [reducemaxabs_param_1]; ld.param.f32 %f31, [reducemaxabs_param_2]; ld.param.u32 %r10, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r21, 4; add.s64 %rd5, %rd1, %rd4; ld.global.nc.f32 %f5, [%rd5]; abs.f32 %f6, %f5; max.f32 %f31, %f31, %f6; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ12reducemaxabsE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 
%p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f8, [%r18]; max.f32 %f9, %f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; max.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; max.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; max.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; ld.volatile.shared.f32 %f19, [%r7+16]; ld.volatile.shared.f32 %f20, [%r7]; max.f32 %f21, %f20, %f19; st.volatile.shared.f32 [%r7], %f21; ld.volatile.shared.f32 %f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; max.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; max.f32 %f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f29, %f28; mov.b32 %r19, %f29; cvta.to.global.u64 %rd6, %rd2; atom.global.max.s32 %r20, [%rd6], %r19; BB0_10: ret; } ` reducemaxabs_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<23>; .reg .b64 %rd<7>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd3, [reducemaxabs_param_0]; ld.param.u64 %rd2, [reducemaxabs_param_1]; ld.param.f32 %f31, [reducemaxabs_param_2]; ld.param.u32 %r10, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 
%r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r21, 4; add.s64 %rd5, %rd1, %rd4; ld.global.nc.f32 %f5, [%rd5]; abs.f32 %f6, %f5; max.f32 %f31, %f31, %f6; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ12reducemaxabsE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f8, [%r18]; max.f32 %f9, %f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; max.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; max.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; max.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; ld.volatile.shared.f32 %f19, [%r7+16]; ld.volatile.shared.f32 %f20, [%r7]; max.f32 %f21, %f20, %f19; st.volatile.shared.f32 [%r7], %f21; ld.volatile.shared.f32 %f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; max.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; max.f32 %f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f29, %f28; mov.b32 %r19, %f29; cvta.to.global.u64 %rd6, %rd2; atom.global.max.s32 %r20, [%rd6], %r19; BB0_10: ret; } ` 
reducemaxabs_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<23>; .reg .b64 %rd<7>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd3, [reducemaxabs_param_0]; ld.param.u64 %rd2, [reducemaxabs_param_1]; ld.param.f32 %f31, [reducemaxabs_param_2]; ld.param.u32 %r10, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r21, 4; add.s64 %rd5, %rd1, %rd4; ld.global.nc.f32 %f5, [%rd5]; abs.f32 %f6, %f5; max.f32 %f31, %f31, %f6; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ12reducemaxabsE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f8, [%r18]; max.f32 %f9, %f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; max.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; max.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; max.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; 
ld.volatile.shared.f32 %f19, [%r7+16]; ld.volatile.shared.f32 %f20, [%r7]; max.f32 %f21, %f20, %f19; st.volatile.shared.f32 [%r7], %f21; ld.volatile.shared.f32 %f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; max.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; max.f32 %f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f29, %f28; mov.b32 %r19, %f29; cvta.to.global.u64 %rd6, %rd2; atom.global.max.s32 %r20, [%rd6], %r19; BB0_10: ret; } ` reducemaxabs_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<23>; .reg .b64 %rd<7>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd3, [reducemaxabs_param_0]; ld.param.u64 %rd2, [reducemaxabs_param_1]; ld.param.f32 %f31, [reducemaxabs_param_2]; ld.param.u32 %r10, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r21, 4; add.s64 %rd5, %rd1, %rd4; ld.global.nc.f32 %f5, [%rd5]; abs.f32 %f6, %f5; max.f32 %f31, %f31, %f6; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ12reducemaxabsE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; 
ld.shared.f32 %f8, [%r18]; max.f32 %f9, %f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; max.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; max.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; max.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; ld.volatile.shared.f32 %f19, [%r7+16]; ld.volatile.shared.f32 %f20, [%r7]; max.f32 %f21, %f20, %f19; st.volatile.shared.f32 [%r7], %f21; ld.volatile.shared.f32 %f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; max.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; max.f32 %f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f29, %f28; mov.b32 %r19, %f29; cvta.to.global.u64 %rd6, %rd2; atom.global.max.s32 %r20, [%rd6], %r19; BB0_10: ret; } ` reducemaxabs_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<23>; .reg .b64 %rd<7>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd3, [reducemaxabs_param_0]; ld.param.u64 %rd2, [reducemaxabs_param_1]; ld.param.f32 %f31, [reducemaxabs_param_2]; ld.param.u32 %r10, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 
%r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r21, 4; add.s64 %rd5, %rd1, %rd4; ld.global.nc.f32 %f5, [%rd5]; abs.f32 %f6, %f5; max.f32 %f31, %f31, %f6; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ12reducemaxabsE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f8, [%r18]; max.f32 %f9, %f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; max.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; max.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; max.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; ld.volatile.shared.f32 %f19, [%r7+16]; ld.volatile.shared.f32 %f20, [%r7]; max.f32 %f21, %f20, %f19; st.volatile.shared.f32 [%r7], %f21; ld.volatile.shared.f32 %f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; max.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; max.f32 %f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f29, %f28; mov.b32 %r19, %f29; cvta.to.global.u64 %rd6, %rd2; atom.global.max.s32 %r20, [%rd6], %r19; BB0_10: ret; } ` reducemaxabs_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl reducemaxabs .visible .entry reducemaxabs( .param .u64 
reducemaxabs_param_0, .param .u64 reducemaxabs_param_1, .param .f32 reducemaxabs_param_2, .param .u32 reducemaxabs_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<32>; .reg .b32 %r<23>; .reg .b64 %rd<7>; // demoted variable .shared .align 4 .b8 _ZZ12reducemaxabsE5sdata[2048]; ld.param.u64 %rd3, [reducemaxabs_param_0]; ld.param.u64 %rd2, [reducemaxabs_param_1]; ld.param.f32 %f31, [reducemaxabs_param_2]; ld.param.u32 %r10, [reducemaxabs_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r21, 4; add.s64 %rd5, %rd1, %rd4; ld.global.nc.f32 %f5, [%rd5]; abs.f32 %f6, %f5; max.f32 %f31, %f31, %f6; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ12reducemaxabsE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f31; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f7, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f8, [%r18]; max.f32 %f9, %f7, %f8; st.shared.f32 [%r7], %f9; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f10, [%r7]; ld.volatile.shared.f32 %f11, [%r7+128]; max.f32 %f12, %f10, %f11; st.volatile.shared.f32 [%r7], %f12; ld.volatile.shared.f32 %f13, [%r7+64]; ld.volatile.shared.f32 %f14, [%r7]; max.f32 %f15, %f14, %f13; st.volatile.shared.f32 [%r7], %f15; ld.volatile.shared.f32 %f16, [%r7+32]; ld.volatile.shared.f32 %f17, [%r7]; max.f32 %f18, %f17, %f16; st.volatile.shared.f32 [%r7], %f18; ld.volatile.shared.f32 %f19, [%r7+16]; ld.volatile.shared.f32 %f20, [%r7]; max.f32 %f21, %f20, %f19; st.volatile.shared.f32 [%r7], %f21; 
ld.volatile.shared.f32 %f22, [%r7+8]; ld.volatile.shared.f32 %f23, [%r7]; max.f32 %f24, %f23, %f22; st.volatile.shared.f32 [%r7], %f24; ld.volatile.shared.f32 %f25, [%r7+4]; ld.volatile.shared.f32 %f26, [%r7]; max.f32 %f27, %f26, %f25; st.volatile.shared.f32 [%r7], %f27; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f28, [_ZZ12reducemaxabsE5sdata]; abs.f32 %f29, %f28; mov.b32 %r19, %f29; cvta.to.global.u64 %rd6, %rd2; atom.global.max.s32 %r20, [%rd6], %r19; BB0_10: ret; } ` ) mumax3-3.10/cuda/reducemaxdiff.cu000066400000000000000000000004251371432437400167330ustar00rootroot00000000000000#include "reduce.h" #include "atomicf.h" #define load_diff(i) fabs(src1[i] - src2[i]) extern "C" __global__ void reducemaxdiff(float* __restrict__ src1, float* __restrict__ src2, float* __restrict__ dst, float initVal, int n) { reduce(load_diff, fmax, atomicFmaxabs) } mumax3-3.10/cuda/reducemaxdiff_wrapper.go000066400000000000000000001211161371432437400204720ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for reducemaxdiff kernel var reducemaxdiff_code cu.Function // Stores the arguments for reducemaxdiff kernel invocation type reducemaxdiff_args_t struct { arg_src1 unsafe.Pointer arg_src2 unsafe.Pointer arg_dst unsafe.Pointer arg_initVal float32 arg_n int argptr [5]unsafe.Pointer sync.Mutex } // Stores the arguments for reducemaxdiff kernel invocation var reducemaxdiff_args reducemaxdiff_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
reducemaxdiff_args.argptr[0] = unsafe.Pointer(&reducemaxdiff_args.arg_src1)
	reducemaxdiff_args.argptr[1] = unsafe.Pointer(&reducemaxdiff_args.arg_src2)
	reducemaxdiff_args.argptr[2] = unsafe.Pointer(&reducemaxdiff_args.arg_dst)
	reducemaxdiff_args.argptr[3] = unsafe.Pointer(&reducemaxdiff_args.arg_initVal)
	reducemaxdiff_args.argptr[4] = unsafe.Pointer(&reducemaxdiff_args.arg_n)
}

// k_reducemaxdiff_async is the asynchronous Go-side launcher for the
// reducemaxdiff CUDA kernel.
//
// src1, src2, dst, initVal and n are stored in the shared reducemaxdiff_args
// struct and handed to cu.LaunchKernel through its argptr slots; cfg supplies
// the grid and block dimensions. When Synchronous is set, the launch is
// bracketed by Sync() calls and timed under the name "reducemaxdiff".
func k_reducemaxdiff_async(src1 unsafe.Pointer, src2 unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("reducemaxdiff")
	}

	// One argument struct is shared by every caller, so keep it locked
	// until this function returns.
	a := &reducemaxdiff_args
	a.Lock()
	defer a.Unlock()

	// Load the kernel on first use; a zero handle means "not loaded yet".
	if reducemaxdiff_code == 0 {
		reducemaxdiff_code = fatbinLoad(reducemaxdiff_map, "reducemaxdiff")
	}

	// argptr holds the addresses of these fields, so writing the fields is
	// all that is needed to update the launch arguments.
	a.arg_src1 = src1
	a.arg_src2 = src2
	a.arg_dst = dst
	a.arg_initVal = initVal
	a.arg_n = n

	cu.LaunchKernel(
		reducemaxdiff_code,
		cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z,
		cfg.Block.X, cfg.Block.Y, cfg.Block.Z,
		0, stream0,
		a.argptr[:],
	)

	if Synchronous { // debug
		Sync()
		timer.Stop("reducemaxdiff")
	}
}

// reducemaxdiff_map associates each supported compute capability number with
// the matching reducemaxdiff PTX source. Key 0 maps to the empty string.
var reducemaxdiff_map = map[int]string{
	0:  "",
	30: reducemaxdiff_ptx_30,
	32: reducemaxdiff_ptx_32,
	35: reducemaxdiff_ptx_35,
	37: reducemaxdiff_ptx_37,
	50: reducemaxdiff_ptx_50,
	52: reducemaxdiff_ptx_52,
	53: reducemaxdiff_ptx_53,
	60: reducemaxdiff_ptx_60,
	61: reducemaxdiff_ptx_61,
	62: reducemaxdiff_ptx_62,
	70: reducemaxdiff_ptx_70,
	72: reducemaxdiff_ptx_72,
	75: reducemaxdiff_ptx_75,
}

// reducemaxdiff PTX code for various compute capabilities.
const ( reducemaxdiff_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<34>; .reg .b32 %r<23>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd4, [reducemaxdiff_param_0]; ld.param.u64 %rd5, [reducemaxdiff_param_1]; ld.param.u64 %rd3, [reducemaxdiff_param_2]; ld.param.f32 %f33, [reducemaxdiff_param_3]; ld.param.u32 %r10, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd4; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd6, %r21, 4; add.s64 %rd7, %rd2, %rd6; add.s64 %rd8, %rd1, %rd6; ld.global.f32 %f5, [%rd8]; ld.global.f32 %f6, [%rd7]; sub.f32 %f7, %f6, %f5; abs.f32 %f8, %f7; max.f32 %f33, %f33, %f8; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ13reducemaxdiffE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f33; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f9, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f10, [%r18]; max.f32 %f11, %f9, %f10; st.shared.f32 [%r7], %f11; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f12, [%r7]; ld.volatile.shared.f32 %f13, [%r7+128]; max.f32 %f14, %f12, %f13; st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+64]; ld.volatile.shared.f32 %f16, [%r7]; max.f32 
%f17, %f16, %f15; st.volatile.shared.f32 [%r7], %f17; ld.volatile.shared.f32 %f18, [%r7+32]; ld.volatile.shared.f32 %f19, [%r7]; max.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; ld.volatile.shared.f32 %f21, [%r7+16]; ld.volatile.shared.f32 %f22, [%r7]; max.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 %f24, [%r7+8]; ld.volatile.shared.f32 %f25, [%r7]; max.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; ld.volatile.shared.f32 %f27, [%r7+4]; ld.volatile.shared.f32 %f28, [%r7]; max.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r7], %f29; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f30, [_ZZ13reducemaxdiffE5sdata]; abs.f32 %f31, %f30; mov.b32 %r19, %f31; cvta.to.global.u64 %rd9, %rd3; atom.global.max.s32 %r20, [%rd9], %r19; BB0_10: ret; } ` reducemaxdiff_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<34>; .reg .b32 %r<23>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd4, [reducemaxdiff_param_0]; ld.param.u64 %rd5, [reducemaxdiff_param_1]; ld.param.u64 %rd3, [reducemaxdiff_param_2]; ld.param.f32 %f33, [reducemaxdiff_param_3]; ld.param.u32 %r10, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd4; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd6, %r21, 4; add.s64 %rd7, %rd2, %rd6; add.s64 %rd8, %rd1, %rd6; ld.global.nc.f32 %f5, [%rd8]; ld.global.nc.f32 %f6, [%rd7]; sub.f32 %f7, %f6, %f5; abs.f32 %f8, %f7; max.f32 %f33, %f33, %f8; add.s32 %r21, %r21, %r4; 
setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ13reducemaxdiffE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f33; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f9, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f10, [%r18]; max.f32 %f11, %f9, %f10; st.shared.f32 [%r7], %f11; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f12, [%r7]; ld.volatile.shared.f32 %f13, [%r7+128]; max.f32 %f14, %f12, %f13; st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+64]; ld.volatile.shared.f32 %f16, [%r7]; max.f32 %f17, %f16, %f15; st.volatile.shared.f32 [%r7], %f17; ld.volatile.shared.f32 %f18, [%r7+32]; ld.volatile.shared.f32 %f19, [%r7]; max.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; ld.volatile.shared.f32 %f21, [%r7+16]; ld.volatile.shared.f32 %f22, [%r7]; max.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 %f24, [%r7+8]; ld.volatile.shared.f32 %f25, [%r7]; max.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; ld.volatile.shared.f32 %f27, [%r7+4]; ld.volatile.shared.f32 %f28, [%r7]; max.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r7], %f29; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f30, [_ZZ13reducemaxdiffE5sdata]; abs.f32 %f31, %f30; mov.b32 %r19, %f31; cvta.to.global.u64 %rd9, %rd3; atom.global.max.s32 %r20, [%rd9], %r19; BB0_10: ret; } ` reducemaxdiff_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<34>; .reg .b32 
%r<23>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd4, [reducemaxdiff_param_0]; ld.param.u64 %rd5, [reducemaxdiff_param_1]; ld.param.u64 %rd3, [reducemaxdiff_param_2]; ld.param.f32 %f33, [reducemaxdiff_param_3]; ld.param.u32 %r10, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd4; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd6, %r21, 4; add.s64 %rd7, %rd2, %rd6; add.s64 %rd8, %rd1, %rd6; ld.global.nc.f32 %f5, [%rd8]; ld.global.nc.f32 %f6, [%rd7]; sub.f32 %f7, %f6, %f5; abs.f32 %f8, %f7; max.f32 %f33, %f33, %f8; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ13reducemaxdiffE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f33; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f9, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f10, [%r18]; max.f32 %f11, %f9, %f10; st.shared.f32 [%r7], %f11; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f12, [%r7]; ld.volatile.shared.f32 %f13, [%r7+128]; max.f32 %f14, %f12, %f13; st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+64]; ld.volatile.shared.f32 %f16, [%r7]; max.f32 %f17, %f16, %f15; st.volatile.shared.f32 [%r7], %f17; ld.volatile.shared.f32 %f18, [%r7+32]; ld.volatile.shared.f32 %f19, [%r7]; max.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; ld.volatile.shared.f32 %f21, [%r7+16]; ld.volatile.shared.f32 %f22, [%r7]; max.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 
%f24, [%r7+8]; ld.volatile.shared.f32 %f25, [%r7]; max.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; ld.volatile.shared.f32 %f27, [%r7+4]; ld.volatile.shared.f32 %f28, [%r7]; max.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r7], %f29; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f30, [_ZZ13reducemaxdiffE5sdata]; abs.f32 %f31, %f30; mov.b32 %r19, %f31; cvta.to.global.u64 %rd9, %rd3; atom.global.max.s32 %r20, [%rd9], %r19; BB0_10: ret; } ` reducemaxdiff_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<34>; .reg .b32 %r<23>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd4, [reducemaxdiff_param_0]; ld.param.u64 %rd5, [reducemaxdiff_param_1]; ld.param.u64 %rd3, [reducemaxdiff_param_2]; ld.param.f32 %f33, [reducemaxdiff_param_3]; ld.param.u32 %r10, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd4; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd6, %r21, 4; add.s64 %rd7, %rd2, %rd6; add.s64 %rd8, %rd1, %rd6; ld.global.nc.f32 %f5, [%rd8]; ld.global.nc.f32 %f6, [%rd7]; sub.f32 %f7, %f6, %f5; abs.f32 %f8, %f7; max.f32 %f33, %f33, %f8; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ13reducemaxdiffE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f33; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f9, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 
%r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f10, [%r18]; max.f32 %f11, %f9, %f10; st.shared.f32 [%r7], %f11; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f12, [%r7]; ld.volatile.shared.f32 %f13, [%r7+128]; max.f32 %f14, %f12, %f13; st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+64]; ld.volatile.shared.f32 %f16, [%r7]; max.f32 %f17, %f16, %f15; st.volatile.shared.f32 [%r7], %f17; ld.volatile.shared.f32 %f18, [%r7+32]; ld.volatile.shared.f32 %f19, [%r7]; max.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; ld.volatile.shared.f32 %f21, [%r7+16]; ld.volatile.shared.f32 %f22, [%r7]; max.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 %f24, [%r7+8]; ld.volatile.shared.f32 %f25, [%r7]; max.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; ld.volatile.shared.f32 %f27, [%r7+4]; ld.volatile.shared.f32 %f28, [%r7]; max.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r7], %f29; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f30, [_ZZ13reducemaxdiffE5sdata]; abs.f32 %f31, %f30; mov.b32 %r19, %f31; cvta.to.global.u64 %rd9, %rd3; atom.global.max.s32 %r20, [%rd9], %r19; BB0_10: ret; } ` reducemaxdiff_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<34>; .reg .b32 %r<23>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd4, [reducemaxdiff_param_0]; ld.param.u64 %rd5, [reducemaxdiff_param_1]; ld.param.u64 %rd3, [reducemaxdiff_param_2]; ld.param.f32 %f33, [reducemaxdiff_param_3]; ld.param.u32 %r10, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd5; 
cvta.to.global.u64 %rd2, %rd4; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd6, %r21, 4; add.s64 %rd7, %rd2, %rd6; add.s64 %rd8, %rd1, %rd6; ld.global.nc.f32 %f5, [%rd8]; ld.global.nc.f32 %f6, [%rd7]; sub.f32 %f7, %f6, %f5; abs.f32 %f8, %f7; max.f32 %f33, %f33, %f8; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ13reducemaxdiffE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f33; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f9, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f10, [%r18]; max.f32 %f11, %f9, %f10; st.shared.f32 [%r7], %f11; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f12, [%r7]; ld.volatile.shared.f32 %f13, [%r7+128]; max.f32 %f14, %f12, %f13; st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+64]; ld.volatile.shared.f32 %f16, [%r7]; max.f32 %f17, %f16, %f15; st.volatile.shared.f32 [%r7], %f17; ld.volatile.shared.f32 %f18, [%r7+32]; ld.volatile.shared.f32 %f19, [%r7]; max.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; ld.volatile.shared.f32 %f21, [%r7+16]; ld.volatile.shared.f32 %f22, [%r7]; max.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 %f24, [%r7+8]; ld.volatile.shared.f32 %f25, [%r7]; max.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; ld.volatile.shared.f32 %f27, [%r7+4]; ld.volatile.shared.f32 %f28, [%r7]; max.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r7], %f29; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f30, [_ZZ13reducemaxdiffE5sdata]; abs.f32 
%f31, %f30; mov.b32 %r19, %f31; cvta.to.global.u64 %rd9, %rd3; atom.global.max.s32 %r20, [%rd9], %r19; BB0_10: ret; } ` reducemaxdiff_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<34>; .reg .b32 %r<23>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd4, [reducemaxdiff_param_0]; ld.param.u64 %rd5, [reducemaxdiff_param_1]; ld.param.u64 %rd3, [reducemaxdiff_param_2]; ld.param.f32 %f33, [reducemaxdiff_param_3]; ld.param.u32 %r10, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd4; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd6, %r21, 4; add.s64 %rd7, %rd2, %rd6; add.s64 %rd8, %rd1, %rd6; ld.global.nc.f32 %f5, [%rd8]; ld.global.nc.f32 %f6, [%rd7]; sub.f32 %f7, %f6, %f5; abs.f32 %f8, %f7; max.f32 %f33, %f33, %f8; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ13reducemaxdiffE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f33; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f9, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f10, [%r18]; max.f32 %f11, %f9, %f10; st.shared.f32 [%r7], %f11; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f12, [%r7]; ld.volatile.shared.f32 %f13, [%r7+128]; max.f32 %f14, %f12, %f13; 
st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+64]; ld.volatile.shared.f32 %f16, [%r7]; max.f32 %f17, %f16, %f15; st.volatile.shared.f32 [%r7], %f17; ld.volatile.shared.f32 %f18, [%r7+32]; ld.volatile.shared.f32 %f19, [%r7]; max.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; ld.volatile.shared.f32 %f21, [%r7+16]; ld.volatile.shared.f32 %f22, [%r7]; max.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 %f24, [%r7+8]; ld.volatile.shared.f32 %f25, [%r7]; max.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; ld.volatile.shared.f32 %f27, [%r7+4]; ld.volatile.shared.f32 %f28, [%r7]; max.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r7], %f29; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f30, [_ZZ13reducemaxdiffE5sdata]; abs.f32 %f31, %f30; mov.b32 %r19, %f31; cvta.to.global.u64 %rd9, %rd3; atom.global.max.s32 %r20, [%rd9], %r19; BB0_10: ret; } ` reducemaxdiff_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<34>; .reg .b32 %r<23>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd4, [reducemaxdiff_param_0]; ld.param.u64 %rd5, [reducemaxdiff_param_1]; ld.param.u64 %rd3, [reducemaxdiff_param_2]; ld.param.f32 %f33, [reducemaxdiff_param_3]; ld.param.u32 %r10, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd4; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd6, %r21, 4; add.s64 %rd7, %rd2, %rd6; add.s64 %rd8, %rd1, %rd6; ld.global.nc.f32 %f5, [%rd8]; 
ld.global.nc.f32 %f6, [%rd7]; sub.f32 %f7, %f6, %f5; abs.f32 %f8, %f7; max.f32 %f33, %f33, %f8; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ13reducemaxdiffE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f33; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f9, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f10, [%r18]; max.f32 %f11, %f9, %f10; st.shared.f32 [%r7], %f11; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f12, [%r7]; ld.volatile.shared.f32 %f13, [%r7+128]; max.f32 %f14, %f12, %f13; st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+64]; ld.volatile.shared.f32 %f16, [%r7]; max.f32 %f17, %f16, %f15; st.volatile.shared.f32 [%r7], %f17; ld.volatile.shared.f32 %f18, [%r7+32]; ld.volatile.shared.f32 %f19, [%r7]; max.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; ld.volatile.shared.f32 %f21, [%r7+16]; ld.volatile.shared.f32 %f22, [%r7]; max.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 %f24, [%r7+8]; ld.volatile.shared.f32 %f25, [%r7]; max.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; ld.volatile.shared.f32 %f27, [%r7+4]; ld.volatile.shared.f32 %f28, [%r7]; max.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r7], %f29; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f30, [_ZZ13reducemaxdiffE5sdata]; abs.f32 %f31, %f30; mov.b32 %r19, %f31; cvta.to.global.u64 %rd9, %rd3; atom.global.max.s32 %r20, [%rd9], %r19; BB0_10: ret; } ` reducemaxdiff_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 
reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<34>; .reg .b32 %r<23>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd4, [reducemaxdiff_param_0]; ld.param.u64 %rd5, [reducemaxdiff_param_1]; ld.param.u64 %rd3, [reducemaxdiff_param_2]; ld.param.f32 %f33, [reducemaxdiff_param_3]; ld.param.u32 %r10, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd4; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd6, %r21, 4; add.s64 %rd7, %rd2, %rd6; add.s64 %rd8, %rd1, %rd6; ld.global.nc.f32 %f5, [%rd8]; ld.global.nc.f32 %f6, [%rd7]; sub.f32 %f7, %f6, %f5; abs.f32 %f8, %f7; max.f32 %f33, %f33, %f8; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ13reducemaxdiffE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f33; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f9, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f10, [%r18]; max.f32 %f11, %f9, %f10; st.shared.f32 [%r7], %f11; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f12, [%r7]; ld.volatile.shared.f32 %f13, [%r7+128]; max.f32 %f14, %f12, %f13; st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+64]; ld.volatile.shared.f32 %f16, [%r7]; max.f32 %f17, %f16, %f15; st.volatile.shared.f32 [%r7], %f17; ld.volatile.shared.f32 %f18, [%r7+32]; ld.volatile.shared.f32 %f19, [%r7]; max.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; 
ld.volatile.shared.f32 %f21, [%r7+16]; ld.volatile.shared.f32 %f22, [%r7]; max.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 %f24, [%r7+8]; ld.volatile.shared.f32 %f25, [%r7]; max.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; ld.volatile.shared.f32 %f27, [%r7+4]; ld.volatile.shared.f32 %f28, [%r7]; max.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r7], %f29; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f30, [_ZZ13reducemaxdiffE5sdata]; abs.f32 %f31, %f30; mov.b32 %r19, %f31; cvta.to.global.u64 %rd9, %rd3; atom.global.max.s32 %r20, [%rd9], %r19; BB0_10: ret; } ` reducemaxdiff_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<34>; .reg .b32 %r<23>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd4, [reducemaxdiff_param_0]; ld.param.u64 %rd5, [reducemaxdiff_param_1]; ld.param.u64 %rd3, [reducemaxdiff_param_2]; ld.param.f32 %f33, [reducemaxdiff_param_3]; ld.param.u32 %r10, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd4; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd6, %r21, 4; add.s64 %rd7, %rd2, %rd6; add.s64 %rd8, %rd1, %rd6; ld.global.nc.f32 %f5, [%rd8]; ld.global.nc.f32 %f6, [%rd7]; sub.f32 %f7, %f6, %f5; abs.f32 %f8, %f7; max.f32 %f33, %f33, %f8; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ13reducemaxdiffE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f33; bar.sync 0; setp.lt.u32 %p3, 
%r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f9, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f10, [%r18]; max.f32 %f11, %f9, %f10; st.shared.f32 [%r7], %f11; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f12, [%r7]; ld.volatile.shared.f32 %f13, [%r7+128]; max.f32 %f14, %f12, %f13; st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+64]; ld.volatile.shared.f32 %f16, [%r7]; max.f32 %f17, %f16, %f15; st.volatile.shared.f32 [%r7], %f17; ld.volatile.shared.f32 %f18, [%r7+32]; ld.volatile.shared.f32 %f19, [%r7]; max.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; ld.volatile.shared.f32 %f21, [%r7+16]; ld.volatile.shared.f32 %f22, [%r7]; max.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 %f24, [%r7+8]; ld.volatile.shared.f32 %f25, [%r7]; max.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; ld.volatile.shared.f32 %f27, [%r7+4]; ld.volatile.shared.f32 %f28, [%r7]; max.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r7], %f29; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f30, [_ZZ13reducemaxdiffE5sdata]; abs.f32 %f31, %f30; mov.b32 %r19, %f31; cvta.to.global.u64 %rd9, %rd3; atom.global.max.s32 %r20, [%rd9], %r19; BB0_10: ret; } ` reducemaxdiff_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<34>; .reg .b32 %r<23>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd4, [reducemaxdiff_param_0]; ld.param.u64 %rd5, [reducemaxdiff_param_1]; 
ld.param.u64 %rd3, [reducemaxdiff_param_2]; ld.param.f32 %f33, [reducemaxdiff_param_3]; ld.param.u32 %r10, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd4; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd6, %r21, 4; add.s64 %rd7, %rd2, %rd6; add.s64 %rd8, %rd1, %rd6; ld.global.nc.f32 %f5, [%rd8]; ld.global.nc.f32 %f6, [%rd7]; sub.f32 %f7, %f6, %f5; abs.f32 %f8, %f7; max.f32 %f33, %f33, %f8; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ13reducemaxdiffE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f33; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f9, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f10, [%r18]; max.f32 %f11, %f9, %f10; st.shared.f32 [%r7], %f11; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f12, [%r7]; ld.volatile.shared.f32 %f13, [%r7+128]; max.f32 %f14, %f12, %f13; st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+64]; ld.volatile.shared.f32 %f16, [%r7]; max.f32 %f17, %f16, %f15; st.volatile.shared.f32 [%r7], %f17; ld.volatile.shared.f32 %f18, [%r7+32]; ld.volatile.shared.f32 %f19, [%r7]; max.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; ld.volatile.shared.f32 %f21, [%r7+16]; ld.volatile.shared.f32 %f22, [%r7]; max.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 %f24, [%r7+8]; ld.volatile.shared.f32 %f25, [%r7]; max.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; ld.volatile.shared.f32 %f27, [%r7+4]; ld.volatile.shared.f32 %f28, [%r7]; 
max.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r7], %f29; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f30, [_ZZ13reducemaxdiffE5sdata]; abs.f32 %f31, %f30; mov.b32 %r19, %f31; cvta.to.global.u64 %rd9, %rd3; atom.global.max.s32 %r20, [%rd9], %r19; BB0_10: ret; } ` reducemaxdiff_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<34>; .reg .b32 %r<23>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd4, [reducemaxdiff_param_0]; ld.param.u64 %rd5, [reducemaxdiff_param_1]; ld.param.u64 %rd3, [reducemaxdiff_param_2]; ld.param.f32 %f33, [reducemaxdiff_param_3]; ld.param.u32 %r10, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd4; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd6, %r21, 4; add.s64 %rd7, %rd2, %rd6; add.s64 %rd8, %rd1, %rd6; ld.global.nc.f32 %f5, [%rd8]; ld.global.nc.f32 %f6, [%rd7]; sub.f32 %f7, %f6, %f5; abs.f32 %f8, %f7; max.f32 %f33, %f33, %f8; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ13reducemaxdiffE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f33; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f9, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f10, [%r18]; max.f32 %f11, %f9, %f10; st.shared.f32 [%r7], %f11; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; 
@%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f12, [%r7]; ld.volatile.shared.f32 %f13, [%r7+128]; max.f32 %f14, %f12, %f13; st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+64]; ld.volatile.shared.f32 %f16, [%r7]; max.f32 %f17, %f16, %f15; st.volatile.shared.f32 [%r7], %f17; ld.volatile.shared.f32 %f18, [%r7+32]; ld.volatile.shared.f32 %f19, [%r7]; max.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; ld.volatile.shared.f32 %f21, [%r7+16]; ld.volatile.shared.f32 %f22, [%r7]; max.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 %f24, [%r7+8]; ld.volatile.shared.f32 %f25, [%r7]; max.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; ld.volatile.shared.f32 %f27, [%r7+4]; ld.volatile.shared.f32 %f28, [%r7]; max.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r7], %f29; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f30, [_ZZ13reducemaxdiffE5sdata]; abs.f32 %f31, %f30; mov.b32 %r19, %f31; cvta.to.global.u64 %rd9, %rd3; atom.global.max.s32 %r20, [%rd9], %r19; BB0_10: ret; } ` reducemaxdiff_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<34>; .reg .b32 %r<23>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd4, [reducemaxdiff_param_0]; ld.param.u64 %rd5, [reducemaxdiff_param_1]; ld.param.u64 %rd3, [reducemaxdiff_param_2]; ld.param.f32 %f33, [reducemaxdiff_param_3]; ld.param.u32 %r10, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd4; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; 
setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd6, %r21, 4; add.s64 %rd7, %rd2, %rd6; add.s64 %rd8, %rd1, %rd6; ld.global.nc.f32 %f5, [%rd8]; ld.global.nc.f32 %f6, [%rd7]; sub.f32 %f7, %f6, %f5; abs.f32 %f8, %f7; max.f32 %f33, %f33, %f8; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ13reducemaxdiffE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f33; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f9, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f10, [%r18]; max.f32 %f11, %f9, %f10; st.shared.f32 [%r7], %f11; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f12, [%r7]; ld.volatile.shared.f32 %f13, [%r7+128]; max.f32 %f14, %f12, %f13; st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+64]; ld.volatile.shared.f32 %f16, [%r7]; max.f32 %f17, %f16, %f15; st.volatile.shared.f32 [%r7], %f17; ld.volatile.shared.f32 %f18, [%r7+32]; ld.volatile.shared.f32 %f19, [%r7]; max.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; ld.volatile.shared.f32 %f21, [%r7+16]; ld.volatile.shared.f32 %f22, [%r7]; max.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 %f24, [%r7+8]; ld.volatile.shared.f32 %f25, [%r7]; max.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; ld.volatile.shared.f32 %f27, [%r7+4]; ld.volatile.shared.f32 %f28, [%r7]; max.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r7], %f29; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f30, [_ZZ13reducemaxdiffE5sdata]; abs.f32 %f31, %f30; mov.b32 %r19, %f31; cvta.to.global.u64 %rd9, %rd3; atom.global.max.s32 %r20, [%rd9], %r19; BB0_10: ret; } ` reducemaxdiff_ptx_75 = ` .version 6.5 .target sm_75 .address_size 
64 // .globl reducemaxdiff .visible .entry reducemaxdiff( .param .u64 reducemaxdiff_param_0, .param .u64 reducemaxdiff_param_1, .param .u64 reducemaxdiff_param_2, .param .f32 reducemaxdiff_param_3, .param .u32 reducemaxdiff_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<34>; .reg .b32 %r<23>; .reg .b64 %rd<10>; // demoted variable .shared .align 4 .b8 _ZZ13reducemaxdiffE5sdata[2048]; ld.param.u64 %rd4, [reducemaxdiff_param_0]; ld.param.u64 %rd5, [reducemaxdiff_param_1]; ld.param.u64 %rd3, [reducemaxdiff_param_2]; ld.param.f32 %f33, [reducemaxdiff_param_3]; ld.param.u32 %r10, [reducemaxdiff_param_4]; cvta.to.global.u64 %rd1, %rd5; cvta.to.global.u64 %rd2, %rd4; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd6, %r21, 4; add.s64 %rd7, %rd2, %rd6; add.s64 %rd8, %rd1, %rd6; ld.global.nc.f32 %f5, [%rd8]; ld.global.nc.f32 %f6, [%rd7]; sub.f32 %f7, %f6, %f5; abs.f32 %f8, %f7; max.f32 %f33, %f33, %f8; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ13reducemaxdiffE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f33; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f9, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f10, [%r18]; max.f32 %f11, %f9, %f10; st.shared.f32 [%r7], %f11; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f12, [%r7]; ld.volatile.shared.f32 %f13, [%r7+128]; max.f32 %f14, %f12, %f13; st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+64]; ld.volatile.shared.f32 %f16, [%r7]; max.f32 %f17, %f16, %f15; st.volatile.shared.f32 [%r7], %f17; 
ld.volatile.shared.f32 %f18, [%r7+32]; ld.volatile.shared.f32 %f19, [%r7]; max.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; ld.volatile.shared.f32 %f21, [%r7+16]; ld.volatile.shared.f32 %f22, [%r7]; max.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 %f24, [%r7+8]; ld.volatile.shared.f32 %f25, [%r7]; max.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; ld.volatile.shared.f32 %f27, [%r7+4]; ld.volatile.shared.f32 %f28, [%r7]; max.f32 %f29, %f28, %f27; st.volatile.shared.f32 [%r7], %f29; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f30, [_ZZ13reducemaxdiffE5sdata]; abs.f32 %f31, %f30; mov.b32 %r19, %f31; cvta.to.global.u64 %rd9, %rd3; atom.global.max.s32 %r20, [%rd9], %r19; BB0_10: ret; } ` ) mumax3-3.10/cuda/reducemaxvecdiff2.cu000066400000000000000000000007521371432437400175160ustar00rootroot00000000000000#include "reduce.h" #include "atomicf.h" #include "float3.h" #define load_vecdiff2(i) \ pow2(x1[i] - x2[i]) + \ pow2(y1[i] - y2[i]) + \ pow2(z1[i] - z2[i]) \ extern "C" __global__ void reducemaxvecdiff2(float* __restrict__ x1, float* __restrict__ y1, float* __restrict__ z1, float* __restrict__ x2, float* __restrict__ y2, float* __restrict__ z2, float* __restrict__ dst, float initVal, int n) { reduce(load_vecdiff2, fmax, atomicFmaxabs) } mumax3-3.10/cuda/reducemaxvecdiff2_wrapper.go000066400000000000000000001532221371432437400212550ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for reducemaxvecdiff2 kernel var reducemaxvecdiff2_code cu.Function // Stores the arguments for reducemaxvecdiff2 kernel invocation type reducemaxvecdiff2_args_t struct { arg_x1 unsafe.Pointer arg_y1 unsafe.Pointer arg_z1 unsafe.Pointer arg_x2 unsafe.Pointer arg_y2 unsafe.Pointer arg_z2 unsafe.Pointer arg_dst unsafe.Pointer arg_initVal float32 arg_n int argptr [9]unsafe.Pointer sync.Mutex } // Stores the arguments for reducemaxvecdiff2 kernel invocation var reducemaxvecdiff2_args reducemaxvecdiff2_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. reducemaxvecdiff2_args.argptr[0] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_x1) reducemaxvecdiff2_args.argptr[1] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_y1) reducemaxvecdiff2_args.argptr[2] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_z1) reducemaxvecdiff2_args.argptr[3] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_x2) reducemaxvecdiff2_args.argptr[4] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_y2) reducemaxvecdiff2_args.argptr[5] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_z2) reducemaxvecdiff2_args.argptr[6] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_dst) reducemaxvecdiff2_args.argptr[7] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_initVal) reducemaxvecdiff2_args.argptr[8] = unsafe.Pointer(&reducemaxvecdiff2_args.arg_n) } // Wrapper for reducemaxvecdiff2 CUDA kernel, asynchronous. 
func k_reducemaxvecdiff2_async(x1 unsafe.Pointer, y1 unsafe.Pointer, z1 unsafe.Pointer, x2 unsafe.Pointer, y2 unsafe.Pointer, z2 unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) { if Synchronous { // debug Sync() timer.Start("reducemaxvecdiff2") } reducemaxvecdiff2_args.Lock() defer reducemaxvecdiff2_args.Unlock() if reducemaxvecdiff2_code == 0 { reducemaxvecdiff2_code = fatbinLoad(reducemaxvecdiff2_map, "reducemaxvecdiff2") } reducemaxvecdiff2_args.arg_x1 = x1 reducemaxvecdiff2_args.arg_y1 = y1 reducemaxvecdiff2_args.arg_z1 = z1 reducemaxvecdiff2_args.arg_x2 = x2 reducemaxvecdiff2_args.arg_y2 = y2 reducemaxvecdiff2_args.arg_z2 = z2 reducemaxvecdiff2_args.arg_dst = dst reducemaxvecdiff2_args.arg_initVal = initVal reducemaxvecdiff2_args.arg_n = n args := reducemaxvecdiff2_args.argptr[:] cu.LaunchKernel(reducemaxvecdiff2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("reducemaxvecdiff2") } } // maps compute capability on PTX code for reducemaxvecdiff2 kernel. var reducemaxvecdiff2_map = map[int]string{0: "", 30: reducemaxvecdiff2_ptx_30, 32: reducemaxvecdiff2_ptx_32, 35: reducemaxvecdiff2_ptx_35, 37: reducemaxvecdiff2_ptx_37, 50: reducemaxvecdiff2_ptx_50, 52: reducemaxvecdiff2_ptx_52, 53: reducemaxvecdiff2_ptx_53, 60: reducemaxvecdiff2_ptx_60, 61: reducemaxvecdiff2_ptx_61, 62: reducemaxvecdiff2_ptx_62, 70: reducemaxvecdiff2_ptx_70, 72: reducemaxvecdiff2_ptx_72, 75: reducemaxvecdiff2_ptx_75} // reducemaxvecdiff2 PTX code for various compute capabilities. 
const ( reducemaxvecdiff2_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<8>; .reg .f32 %f<42>; .reg .b32 %r<23>; .reg .b64 %rd<22>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd8, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd9, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd10, [reducemaxvecdiff2_param_2]; ld.param.u64 %rd11, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd12, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd13, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd7, [reducemaxvecdiff2_param_6]; ld.param.f32 %f41, [reducemaxvecdiff2_param_7]; ld.param.u32 %r10, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd13; cvta.to.global.u64 %rd2, %rd10; cvta.to.global.u64 %rd3, %rd12; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd11; cvta.to.global.u64 %rd6, %rd8; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd14, %r21, 4; add.s64 %rd15, %rd6, %rd14; add.s64 %rd16, %rd5, %rd14; ld.global.f32 %f5, [%rd16]; ld.global.f32 %f6, [%rd15]; sub.f32 %f7, %f6, %f5; add.s64 %rd17, %rd4, %rd14; add.s64 %rd18, %rd3, %rd14; ld.global.f32 %f8, [%rd18]; ld.global.f32 %f9, [%rd17]; sub.f32 %f10, %f9, %f8; mul.f32 %f11, %f10, %f10; fma.rn.f32 %f12, %f7, %f7, %f11; add.s64 %rd19, %rd2, %rd14; add.s64 %rd20, %rd1, %rd14; ld.global.f32 %f13, [%rd20]; ld.global.f32 %f14, [%rd19]; sub.f32 %f15, %f14, %f13; fma.rn.f32 %f16, 
%f15, %f15, %f12; max.f32 %f41, %f41, %f16; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f41; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f17, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f18, [%r18]; max.f32 %f19, %f17, %f18; st.shared.f32 [%r7], %f19; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f20, [%r7]; ld.volatile.shared.f32 %f21, [%r7+128]; max.f32 %f22, %f20, %f21; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+64]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, [%r7+32]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+16]; ld.volatile.shared.f32 %f30, [%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], %f31; ld.volatile.shared.f32 %f32, [%r7+8]; ld.volatile.shared.f32 %f33, [%r7]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r7], %f34; ld.volatile.shared.f32 %f35, [%r7+4]; ld.volatile.shared.f32 %f36, [%r7]; max.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r7], %f37; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f38, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f39, %f38; mov.b32 %r19, %f39; cvta.to.global.u64 %rd21, %rd7; atom.global.max.s32 %r20, [%rd21], %r19; BB0_10: ret; } ` reducemaxvecdiff2_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 reducemaxvecdiff2_param_2, .param .u64 
reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<8>; .reg .f32 %f<42>; .reg .b32 %r<23>; .reg .b64 %rd<22>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd8, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd9, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd10, [reducemaxvecdiff2_param_2]; ld.param.u64 %rd6, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd11, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd12, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd7, [reducemaxvecdiff2_param_6]; ld.param.f32 %f41, [reducemaxvecdiff2_param_7]; ld.param.u32 %r10, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd10; cvta.to.global.u64 %rd3, %rd11; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_3; cvta.to.global.u64 %rd15, %rd6; BB0_2: mul.wide.s32 %rd13, %r21, 4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f5, [%rd16]; ld.global.nc.f32 %f6, [%rd14]; sub.f32 %f7, %f6, %f5; add.s64 %rd17, %rd4, %rd13; add.s64 %rd18, %rd3, %rd13; ld.global.nc.f32 %f8, [%rd18]; ld.global.nc.f32 %f9, [%rd17]; sub.f32 %f10, %f9, %f8; mul.f32 %f11, %f10, %f10; fma.rn.f32 %f12, %f7, %f7, %f11; add.s64 %rd19, %rd2, %rd13; add.s64 %rd20, %rd1, %rd13; ld.global.nc.f32 %f13, [%rd20]; ld.global.nc.f32 %f14, [%rd19]; sub.f32 %f15, %f14, %f13; fma.rn.f32 %f16, %f15, %f15, %f12; max.f32 %f41, %f41, %f16; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_2; BB0_3: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f41; bar.sync 0; 
setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_7; BB0_4: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_6; ld.shared.f32 %f17, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f18, [%r18]; max.f32 %f19, %f17, %f18; st.shared.f32 [%r7], %f19; BB0_6: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_4; BB0_7: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_9; ld.volatile.shared.f32 %f20, [%r7]; ld.volatile.shared.f32 %f21, [%r7+128]; max.f32 %f22, %f20, %f21; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+64]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, [%r7+32]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+16]; ld.volatile.shared.f32 %f30, [%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], %f31; ld.volatile.shared.f32 %f32, [%r7+8]; ld.volatile.shared.f32 %f33, [%r7]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r7], %f34; ld.volatile.shared.f32 %f35, [%r7+4]; ld.volatile.shared.f32 %f36, [%r7]; max.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r7], %f37; BB0_9: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_11; ld.shared.f32 %f38, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f39, %f38; mov.b32 %r19, %f39; cvta.to.global.u64 %rd21, %rd7; atom.global.max.s32 %r20, [%rd21], %r19; BB0_11: ret; } ` reducemaxvecdiff2_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<8>; .reg .f32 
%f<42>; .reg .b32 %r<23>; .reg .b64 %rd<22>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd8, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd9, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd10, [reducemaxvecdiff2_param_2]; ld.param.u64 %rd6, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd11, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd12, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd7, [reducemaxvecdiff2_param_6]; ld.param.f32 %f41, [reducemaxvecdiff2_param_7]; ld.param.u32 %r10, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd10; cvta.to.global.u64 %rd3, %rd11; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_3; cvta.to.global.u64 %rd15, %rd6; BB0_2: mul.wide.s32 %rd13, %r21, 4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f5, [%rd16]; ld.global.nc.f32 %f6, [%rd14]; sub.f32 %f7, %f6, %f5; add.s64 %rd17, %rd4, %rd13; add.s64 %rd18, %rd3, %rd13; ld.global.nc.f32 %f8, [%rd18]; ld.global.nc.f32 %f9, [%rd17]; sub.f32 %f10, %f9, %f8; mul.f32 %f11, %f10, %f10; fma.rn.f32 %f12, %f7, %f7, %f11; add.s64 %rd19, %rd2, %rd13; add.s64 %rd20, %rd1, %rd13; ld.global.nc.f32 %f13, [%rd20]; ld.global.nc.f32 %f14, [%rd19]; sub.f32 %f15, %f14, %f13; fma.rn.f32 %f16, %f15, %f15, %f12; max.f32 %f41, %f41, %f16; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_2; BB0_3: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f41; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_7; BB0_4: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_6; ld.shared.f32 %f17, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f18, [%r18]; max.f32 %f19, 
%f17, %f18; st.shared.f32 [%r7], %f19; BB0_6: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_4; BB0_7: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_9; ld.volatile.shared.f32 %f20, [%r7]; ld.volatile.shared.f32 %f21, [%r7+128]; max.f32 %f22, %f20, %f21; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+64]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, [%r7+32]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+16]; ld.volatile.shared.f32 %f30, [%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], %f31; ld.volatile.shared.f32 %f32, [%r7+8]; ld.volatile.shared.f32 %f33, [%r7]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r7], %f34; ld.volatile.shared.f32 %f35, [%r7+4]; ld.volatile.shared.f32 %f36, [%r7]; max.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r7], %f37; BB0_9: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_11; ld.shared.f32 %f38, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f39, %f38; mov.b32 %r19, %f39; cvta.to.global.u64 %rd21, %rd7; atom.global.max.s32 %r20, [%rd21], %r19; BB0_11: ret; } ` reducemaxvecdiff2_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<8>; .reg .f32 %f<42>; .reg .b32 %r<23>; .reg .b64 %rd<22>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd8, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd9, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd10, 
[reducemaxvecdiff2_param_2]; ld.param.u64 %rd6, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd11, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd12, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd7, [reducemaxvecdiff2_param_6]; ld.param.f32 %f41, [reducemaxvecdiff2_param_7]; ld.param.u32 %r10, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd10; cvta.to.global.u64 %rd3, %rd11; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_3; cvta.to.global.u64 %rd15, %rd6; BB0_2: mul.wide.s32 %rd13, %r21, 4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f5, [%rd16]; ld.global.nc.f32 %f6, [%rd14]; sub.f32 %f7, %f6, %f5; add.s64 %rd17, %rd4, %rd13; add.s64 %rd18, %rd3, %rd13; ld.global.nc.f32 %f8, [%rd18]; ld.global.nc.f32 %f9, [%rd17]; sub.f32 %f10, %f9, %f8; mul.f32 %f11, %f10, %f10; fma.rn.f32 %f12, %f7, %f7, %f11; add.s64 %rd19, %rd2, %rd13; add.s64 %rd20, %rd1, %rd13; ld.global.nc.f32 %f13, [%rd20]; ld.global.nc.f32 %f14, [%rd19]; sub.f32 %f15, %f14, %f13; fma.rn.f32 %f16, %f15, %f15, %f12; max.f32 %f41, %f41, %f16; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_2; BB0_3: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f41; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_7; BB0_4: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_6; ld.shared.f32 %f17, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f18, [%r18]; max.f32 %f19, %f17, %f18; st.shared.f32 [%r7], %f19; BB0_6: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_4; BB0_7: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_9; ld.volatile.shared.f32 %f20, [%r7]; ld.volatile.shared.f32 %f21, 
[%r7+128]; max.f32 %f22, %f20, %f21; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+64]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, [%r7+32]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+16]; ld.volatile.shared.f32 %f30, [%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], %f31; ld.volatile.shared.f32 %f32, [%r7+8]; ld.volatile.shared.f32 %f33, [%r7]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r7], %f34; ld.volatile.shared.f32 %f35, [%r7+4]; ld.volatile.shared.f32 %f36, [%r7]; max.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r7], %f37; BB0_9: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_11; ld.shared.f32 %f38, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f39, %f38; mov.b32 %r19, %f39; cvta.to.global.u64 %rd21, %rd7; atom.global.max.s32 %r20, [%rd21], %r19; BB0_11: ret; } ` reducemaxvecdiff2_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<8>; .reg .f32 %f<42>; .reg .b32 %r<23>; .reg .b64 %rd<22>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd8, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd9, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd10, [reducemaxvecdiff2_param_2]; ld.param.u64 %rd6, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd11, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd12, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd7, [reducemaxvecdiff2_param_6]; ld.param.f32 %f41, 
[reducemaxvecdiff2_param_7]; ld.param.u32 %r10, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd10; cvta.to.global.u64 %rd3, %rd11; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_3; cvta.to.global.u64 %rd15, %rd6; BB0_2: mul.wide.s32 %rd13, %r21, 4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f5, [%rd16]; ld.global.nc.f32 %f6, [%rd14]; sub.f32 %f7, %f6, %f5; add.s64 %rd17, %rd4, %rd13; add.s64 %rd18, %rd3, %rd13; ld.global.nc.f32 %f8, [%rd18]; ld.global.nc.f32 %f9, [%rd17]; sub.f32 %f10, %f9, %f8; mul.f32 %f11, %f10, %f10; fma.rn.f32 %f12, %f7, %f7, %f11; add.s64 %rd19, %rd2, %rd13; add.s64 %rd20, %rd1, %rd13; ld.global.nc.f32 %f13, [%rd20]; ld.global.nc.f32 %f14, [%rd19]; sub.f32 %f15, %f14, %f13; fma.rn.f32 %f16, %f15, %f15, %f12; max.f32 %f41, %f41, %f16; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_2; BB0_3: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f41; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_7; BB0_4: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_6; ld.shared.f32 %f17, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f18, [%r18]; max.f32 %f19, %f17, %f18; st.shared.f32 [%r7], %f19; BB0_6: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_4; BB0_7: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_9; ld.volatile.shared.f32 %f20, [%r7]; ld.volatile.shared.f32 %f21, [%r7+128]; max.f32 %f22, %f20, %f21; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+64]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, 
[%r7+32]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+16]; ld.volatile.shared.f32 %f30, [%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], %f31; ld.volatile.shared.f32 %f32, [%r7+8]; ld.volatile.shared.f32 %f33, [%r7]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r7], %f34; ld.volatile.shared.f32 %f35, [%r7+4]; ld.volatile.shared.f32 %f36, [%r7]; max.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r7], %f37; BB0_9: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_11; ld.shared.f32 %f38, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f39, %f38; mov.b32 %r19, %f39; cvta.to.global.u64 %rd21, %rd7; atom.global.max.s32 %r20, [%rd21], %r19; BB0_11: ret; } ` reducemaxvecdiff2_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<8>; .reg .f32 %f<42>; .reg .b32 %r<23>; .reg .b64 %rd<22>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd8, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd9, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd10, [reducemaxvecdiff2_param_2]; ld.param.u64 %rd6, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd11, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd12, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd7, [reducemaxvecdiff2_param_6]; ld.param.f32 %f41, [reducemaxvecdiff2_param_7]; ld.param.u32 %r10, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd10; cvta.to.global.u64 %rd3, %rd11; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r22, 
%ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_3; cvta.to.global.u64 %rd15, %rd6; BB0_2: mul.wide.s32 %rd13, %r21, 4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f5, [%rd16]; ld.global.nc.f32 %f6, [%rd14]; sub.f32 %f7, %f6, %f5; add.s64 %rd17, %rd4, %rd13; add.s64 %rd18, %rd3, %rd13; ld.global.nc.f32 %f8, [%rd18]; ld.global.nc.f32 %f9, [%rd17]; sub.f32 %f10, %f9, %f8; mul.f32 %f11, %f10, %f10; fma.rn.f32 %f12, %f7, %f7, %f11; add.s64 %rd19, %rd2, %rd13; add.s64 %rd20, %rd1, %rd13; ld.global.nc.f32 %f13, [%rd20]; ld.global.nc.f32 %f14, [%rd19]; sub.f32 %f15, %f14, %f13; fma.rn.f32 %f16, %f15, %f15, %f12; max.f32 %f41, %f41, %f16; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_2; BB0_3: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f41; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_7; BB0_4: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_6; ld.shared.f32 %f17, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f18, [%r18]; max.f32 %f19, %f17, %f18; st.shared.f32 [%r7], %f19; BB0_6: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_4; BB0_7: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_9; ld.volatile.shared.f32 %f20, [%r7]; ld.volatile.shared.f32 %f21, [%r7+128]; max.f32 %f22, %f20, %f21; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+64]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, [%r7+32]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+16]; ld.volatile.shared.f32 %f30, [%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], %f31; 
ld.volatile.shared.f32 %f32, [%r7+8]; ld.volatile.shared.f32 %f33, [%r7]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r7], %f34; ld.volatile.shared.f32 %f35, [%r7+4]; ld.volatile.shared.f32 %f36, [%r7]; max.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r7], %f37; BB0_9: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_11; ld.shared.f32 %f38, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f39, %f38; mov.b32 %r19, %f39; cvta.to.global.u64 %rd21, %rd7; atom.global.max.s32 %r20, [%rd21], %r19; BB0_11: ret; } ` reducemaxvecdiff2_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<8>; .reg .f32 %f<42>; .reg .b32 %r<23>; .reg .b64 %rd<22>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd8, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd9, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd10, [reducemaxvecdiff2_param_2]; ld.param.u64 %rd6, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd11, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd12, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd7, [reducemaxvecdiff2_param_6]; ld.param.f32 %f41, [reducemaxvecdiff2_param_7]; ld.param.u32 %r10, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd10; cvta.to.global.u64 %rd3, %rd11; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_3; cvta.to.global.u64 %rd15, %rd6; BB0_2: mul.wide.s32 %rd13, 
%r21, 4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f5, [%rd16]; ld.global.nc.f32 %f6, [%rd14]; sub.f32 %f7, %f6, %f5; add.s64 %rd17, %rd4, %rd13; add.s64 %rd18, %rd3, %rd13; ld.global.nc.f32 %f8, [%rd18]; ld.global.nc.f32 %f9, [%rd17]; sub.f32 %f10, %f9, %f8; mul.f32 %f11, %f10, %f10; fma.rn.f32 %f12, %f7, %f7, %f11; add.s64 %rd19, %rd2, %rd13; add.s64 %rd20, %rd1, %rd13; ld.global.nc.f32 %f13, [%rd20]; ld.global.nc.f32 %f14, [%rd19]; sub.f32 %f15, %f14, %f13; fma.rn.f32 %f16, %f15, %f15, %f12; max.f32 %f41, %f41, %f16; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_2; BB0_3: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f41; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_7; BB0_4: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_6; ld.shared.f32 %f17, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f18, [%r18]; max.f32 %f19, %f17, %f18; st.shared.f32 [%r7], %f19; BB0_6: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_4; BB0_7: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_9; ld.volatile.shared.f32 %f20, [%r7]; ld.volatile.shared.f32 %f21, [%r7+128]; max.f32 %f22, %f20, %f21; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+64]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, [%r7+32]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+16]; ld.volatile.shared.f32 %f30, [%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], %f31; ld.volatile.shared.f32 %f32, [%r7+8]; ld.volatile.shared.f32 %f33, [%r7]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r7], %f34; ld.volatile.shared.f32 %f35, [%r7+4]; ld.volatile.shared.f32 %f36, [%r7]; max.f32 %f37, %f36, %f35; 
st.volatile.shared.f32 [%r7], %f37; BB0_9: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_11; ld.shared.f32 %f38, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f39, %f38; mov.b32 %r19, %f39; cvta.to.global.u64 %rd21, %rd7; atom.global.max.s32 %r20, [%rd21], %r19; BB0_11: ret; } ` reducemaxvecdiff2_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<8>; .reg .f32 %f<42>; .reg .b32 %r<23>; .reg .b64 %rd<22>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd8, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd9, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd10, [reducemaxvecdiff2_param_2]; ld.param.u64 %rd6, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd11, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd12, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd7, [reducemaxvecdiff2_param_6]; ld.param.f32 %f41, [reducemaxvecdiff2_param_7]; ld.param.u32 %r10, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd10; cvta.to.global.u64 %rd3, %rd11; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_3; cvta.to.global.u64 %rd15, %rd6; BB0_2: mul.wide.s32 %rd13, %r21, 4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f5, [%rd16]; ld.global.nc.f32 %f6, [%rd14]; sub.f32 %f7, %f6, %f5; add.s64 %rd17, %rd4, %rd13; add.s64 %rd18, %rd3, %rd13; ld.global.nc.f32 %f8, [%rd18]; 
ld.global.nc.f32 %f9, [%rd17]; sub.f32 %f10, %f9, %f8; mul.f32 %f11, %f10, %f10; fma.rn.f32 %f12, %f7, %f7, %f11; add.s64 %rd19, %rd2, %rd13; add.s64 %rd20, %rd1, %rd13; ld.global.nc.f32 %f13, [%rd20]; ld.global.nc.f32 %f14, [%rd19]; sub.f32 %f15, %f14, %f13; fma.rn.f32 %f16, %f15, %f15, %f12; max.f32 %f41, %f41, %f16; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_2; BB0_3: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f41; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_7; BB0_4: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_6; ld.shared.f32 %f17, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f18, [%r18]; max.f32 %f19, %f17, %f18; st.shared.f32 [%r7], %f19; BB0_6: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_4; BB0_7: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_9; ld.volatile.shared.f32 %f20, [%r7]; ld.volatile.shared.f32 %f21, [%r7+128]; max.f32 %f22, %f20, %f21; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+64]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, [%r7+32]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+16]; ld.volatile.shared.f32 %f30, [%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], %f31; ld.volatile.shared.f32 %f32, [%r7+8]; ld.volatile.shared.f32 %f33, [%r7]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r7], %f34; ld.volatile.shared.f32 %f35, [%r7+4]; ld.volatile.shared.f32 %f36, [%r7]; max.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r7], %f37; BB0_9: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_11; ld.shared.f32 %f38, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f39, %f38; mov.b32 %r19, %f39; cvta.to.global.u64 %rd21, %rd7; atom.global.max.s32 %r20, [%rd21], %r19; BB0_11: 
ret; } ` reducemaxvecdiff2_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<8>; .reg .f32 %f<42>; .reg .b32 %r<23>; .reg .b64 %rd<22>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd8, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd9, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd10, [reducemaxvecdiff2_param_2]; ld.param.u64 %rd6, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd11, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd12, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd7, [reducemaxvecdiff2_param_6]; ld.param.f32 %f41, [reducemaxvecdiff2_param_7]; ld.param.u32 %r10, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd10; cvta.to.global.u64 %rd3, %rd11; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_3; cvta.to.global.u64 %rd15, %rd6; BB0_2: mul.wide.s32 %rd13, %r21, 4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f5, [%rd16]; ld.global.nc.f32 %f6, [%rd14]; sub.f32 %f7, %f6, %f5; add.s64 %rd17, %rd4, %rd13; add.s64 %rd18, %rd3, %rd13; ld.global.nc.f32 %f8, [%rd18]; ld.global.nc.f32 %f9, [%rd17]; sub.f32 %f10, %f9, %f8; mul.f32 %f11, %f10, %f10; fma.rn.f32 %f12, %f7, %f7, %f11; add.s64 %rd19, %rd2, %rd13; add.s64 %rd20, %rd1, %rd13; ld.global.nc.f32 %f13, [%rd20]; ld.global.nc.f32 %f14, [%rd19]; sub.f32 %f15, %f14, %f13; 
fma.rn.f32 %f16, %f15, %f15, %f12; max.f32 %f41, %f41, %f16; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_2; BB0_3: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f41; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_7; BB0_4: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_6; ld.shared.f32 %f17, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f18, [%r18]; max.f32 %f19, %f17, %f18; st.shared.f32 [%r7], %f19; BB0_6: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_4; BB0_7: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_9; ld.volatile.shared.f32 %f20, [%r7]; ld.volatile.shared.f32 %f21, [%r7+128]; max.f32 %f22, %f20, %f21; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+64]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, [%r7+32]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+16]; ld.volatile.shared.f32 %f30, [%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], %f31; ld.volatile.shared.f32 %f32, [%r7+8]; ld.volatile.shared.f32 %f33, [%r7]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r7], %f34; ld.volatile.shared.f32 %f35, [%r7+4]; ld.volatile.shared.f32 %f36, [%r7]; max.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r7], %f37; BB0_9: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_11; ld.shared.f32 %f38, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f39, %f38; mov.b32 %r19, %f39; cvta.to.global.u64 %rd21, %rd7; atom.global.max.s32 %r20, [%rd21], %r19; BB0_11: ret; } ` reducemaxvecdiff2_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 
reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<8>; .reg .f32 %f<42>; .reg .b32 %r<23>; .reg .b64 %rd<22>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd8, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd9, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd10, [reducemaxvecdiff2_param_2]; ld.param.u64 %rd6, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd11, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd12, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd7, [reducemaxvecdiff2_param_6]; ld.param.f32 %f41, [reducemaxvecdiff2_param_7]; ld.param.u32 %r10, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd10; cvta.to.global.u64 %rd3, %rd11; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_3; cvta.to.global.u64 %rd15, %rd6; BB0_2: mul.wide.s32 %rd13, %r21, 4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f5, [%rd16]; ld.global.nc.f32 %f6, [%rd14]; sub.f32 %f7, %f6, %f5; add.s64 %rd17, %rd4, %rd13; add.s64 %rd18, %rd3, %rd13; ld.global.nc.f32 %f8, [%rd18]; ld.global.nc.f32 %f9, [%rd17]; sub.f32 %f10, %f9, %f8; mul.f32 %f11, %f10, %f10; fma.rn.f32 %f12, %f7, %f7, %f11; add.s64 %rd19, %rd2, %rd13; add.s64 %rd20, %rd1, %rd13; ld.global.nc.f32 %f13, [%rd20]; ld.global.nc.f32 %f14, [%rd19]; sub.f32 %f15, %f14, %f13; fma.rn.f32 %f16, %f15, %f15, %f12; max.f32 %f41, %f41, %f16; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_2; BB0_3: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r7, %r14, %r13; 
st.shared.f32 [%r7], %f41; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_7; BB0_4: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_6; ld.shared.f32 %f17, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f18, [%r18]; max.f32 %f19, %f17, %f18; st.shared.f32 [%r7], %f19; BB0_6: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_4; BB0_7: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_9; ld.volatile.shared.f32 %f20, [%r7]; ld.volatile.shared.f32 %f21, [%r7+128]; max.f32 %f22, %f20, %f21; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+64]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, [%r7+32]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+16]; ld.volatile.shared.f32 %f30, [%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], %f31; ld.volatile.shared.f32 %f32, [%r7+8]; ld.volatile.shared.f32 %f33, [%r7]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r7], %f34; ld.volatile.shared.f32 %f35, [%r7+4]; ld.volatile.shared.f32 %f36, [%r7]; max.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r7], %f37; BB0_9: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_11; ld.shared.f32 %f38, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f39, %f38; mov.b32 %r19, %f39; cvta.to.global.u64 %rd21, %rd7; atom.global.max.s32 %r20, [%rd21], %r19; BB0_11: ret; } ` reducemaxvecdiff2_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 
reducemaxvecdiff2_param_8 ) { .reg .pred %p<8>; .reg .f32 %f<42>; .reg .b32 %r<23>; .reg .b64 %rd<22>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd8, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd9, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd10, [reducemaxvecdiff2_param_2]; ld.param.u64 %rd6, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd11, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd12, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd7, [reducemaxvecdiff2_param_6]; ld.param.f32 %f41, [reducemaxvecdiff2_param_7]; ld.param.u32 %r10, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd10; cvta.to.global.u64 %rd3, %rd11; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_3; cvta.to.global.u64 %rd15, %rd6; BB0_2: mul.wide.s32 %rd13, %r21, 4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f5, [%rd16]; ld.global.nc.f32 %f6, [%rd14]; sub.f32 %f7, %f6, %f5; add.s64 %rd17, %rd4, %rd13; add.s64 %rd18, %rd3, %rd13; ld.global.nc.f32 %f8, [%rd18]; ld.global.nc.f32 %f9, [%rd17]; sub.f32 %f10, %f9, %f8; mul.f32 %f11, %f10, %f10; fma.rn.f32 %f12, %f7, %f7, %f11; add.s64 %rd19, %rd2, %rd13; add.s64 %rd20, %rd1, %rd13; ld.global.nc.f32 %f13, [%rd20]; ld.global.nc.f32 %f14, [%rd19]; sub.f32 %f15, %f14, %f13; fma.rn.f32 %f16, %f15, %f15, %f12; max.f32 %f41, %f41, %f16; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_2; BB0_3: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f41; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_7; BB0_4: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_6; ld.shared.f32 %f17, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 
%r18, %r14, %r16; ld.shared.f32 %f18, [%r18]; max.f32 %f19, %f17, %f18; st.shared.f32 [%r7], %f19; BB0_6: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_4; BB0_7: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_9; ld.volatile.shared.f32 %f20, [%r7]; ld.volatile.shared.f32 %f21, [%r7+128]; max.f32 %f22, %f20, %f21; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+64]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, [%r7+32]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+16]; ld.volatile.shared.f32 %f30, [%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], %f31; ld.volatile.shared.f32 %f32, [%r7+8]; ld.volatile.shared.f32 %f33, [%r7]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r7], %f34; ld.volatile.shared.f32 %f35, [%r7+4]; ld.volatile.shared.f32 %f36, [%r7]; max.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r7], %f37; BB0_9: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_11; ld.shared.f32 %f38, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f39, %f38; mov.b32 %r19, %f39; cvta.to.global.u64 %rd21, %rd7; atom.global.max.s32 %r20, [%rd21], %r19; BB0_11: ret; } ` reducemaxvecdiff2_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<8>; .reg .f32 %f<42>; .reg .b32 %r<23>; .reg .b64 %rd<22>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd8, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd9, 
[reducemaxvecdiff2_param_1]; ld.param.u64 %rd10, [reducemaxvecdiff2_param_2]; ld.param.u64 %rd6, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd11, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd12, [reducemaxvecdiff2_param_5]; ld.param.u64 %rd7, [reducemaxvecdiff2_param_6]; ld.param.f32 %f41, [reducemaxvecdiff2_param_7]; ld.param.u32 %r10, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd10; cvta.to.global.u64 %rd3, %rd11; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_3; cvta.to.global.u64 %rd15, %rd6; BB0_2: mul.wide.s32 %rd13, %r21, 4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f5, [%rd16]; ld.global.nc.f32 %f6, [%rd14]; sub.f32 %f7, %f6, %f5; add.s64 %rd17, %rd4, %rd13; add.s64 %rd18, %rd3, %rd13; ld.global.nc.f32 %f8, [%rd18]; ld.global.nc.f32 %f9, [%rd17]; sub.f32 %f10, %f9, %f8; mul.f32 %f11, %f10, %f10; fma.rn.f32 %f12, %f7, %f7, %f11; add.s64 %rd19, %rd2, %rd13; add.s64 %rd20, %rd1, %rd13; ld.global.nc.f32 %f13, [%rd20]; ld.global.nc.f32 %f14, [%rd19]; sub.f32 %f15, %f14, %f13; fma.rn.f32 %f16, %f15, %f15, %f12; max.f32 %f41, %f41, %f16; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_2; BB0_3: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f41; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_7; BB0_4: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_6; ld.shared.f32 %f17, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f18, [%r18]; max.f32 %f19, %f17, %f18; st.shared.f32 [%r7], %f19; BB0_6: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_4; BB0_7: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_9; 
ld.volatile.shared.f32 %f20, [%r7]; ld.volatile.shared.f32 %f21, [%r7+128]; max.f32 %f22, %f20, %f21; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+64]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, [%r7+32]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+16]; ld.volatile.shared.f32 %f30, [%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], %f31; ld.volatile.shared.f32 %f32, [%r7+8]; ld.volatile.shared.f32 %f33, [%r7]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r7], %f34; ld.volatile.shared.f32 %f35, [%r7+4]; ld.volatile.shared.f32 %f36, [%r7]; max.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r7], %f37; BB0_9: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_11; ld.shared.f32 %f38, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f39, %f38; mov.b32 %r19, %f39; cvta.to.global.u64 %rd21, %rd7; atom.global.max.s32 %r20, [%rd21], %r19; BB0_11: ret; } ` reducemaxvecdiff2_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl reducemaxvecdiff2 .visible .entry reducemaxvecdiff2( .param .u64 reducemaxvecdiff2_param_0, .param .u64 reducemaxvecdiff2_param_1, .param .u64 reducemaxvecdiff2_param_2, .param .u64 reducemaxvecdiff2_param_3, .param .u64 reducemaxvecdiff2_param_4, .param .u64 reducemaxvecdiff2_param_5, .param .u64 reducemaxvecdiff2_param_6, .param .f32 reducemaxvecdiff2_param_7, .param .u32 reducemaxvecdiff2_param_8 ) { .reg .pred %p<8>; .reg .f32 %f<42>; .reg .b32 %r<23>; .reg .b64 %rd<22>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecdiff2E5sdata[2048]; ld.param.u64 %rd8, [reducemaxvecdiff2_param_0]; ld.param.u64 %rd9, [reducemaxvecdiff2_param_1]; ld.param.u64 %rd10, [reducemaxvecdiff2_param_2]; ld.param.u64 %rd6, [reducemaxvecdiff2_param_3]; ld.param.u64 %rd11, [reducemaxvecdiff2_param_4]; ld.param.u64 %rd12, [reducemaxvecdiff2_param_5]; ld.param.u64 
%rd7, [reducemaxvecdiff2_param_6]; ld.param.f32 %f41, [reducemaxvecdiff2_param_7]; ld.param.u32 %r10, [reducemaxvecdiff2_param_8]; cvta.to.global.u64 %rd1, %rd12; cvta.to.global.u64 %rd2, %rd10; cvta.to.global.u64 %rd3, %rd11; cvta.to.global.u64 %rd4, %rd9; cvta.to.global.u64 %rd5, %rd8; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_3; cvta.to.global.u64 %rd15, %rd6; BB0_2: mul.wide.s32 %rd13, %r21, 4; add.s64 %rd14, %rd5, %rd13; add.s64 %rd16, %rd15, %rd13; ld.global.nc.f32 %f5, [%rd16]; ld.global.nc.f32 %f6, [%rd14]; sub.f32 %f7, %f6, %f5; add.s64 %rd17, %rd4, %rd13; add.s64 %rd18, %rd3, %rd13; ld.global.nc.f32 %f8, [%rd18]; ld.global.nc.f32 %f9, [%rd17]; sub.f32 %f10, %f9, %f8; mul.f32 %f11, %f10, %f10; fma.rn.f32 %f12, %f7, %f7, %f11; add.s64 %rd19, %rd2, %rd13; add.s64 %rd20, %rd1, %rd13; ld.global.nc.f32 %f13, [%rd20]; ld.global.nc.f32 %f14, [%rd19]; sub.f32 %f15, %f14, %f13; fma.rn.f32 %f16, %f15, %f15, %f12; max.f32 %f41, %f41, %f16; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_2; BB0_3: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ17reducemaxvecdiff2E5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f41; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_7; BB0_4: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_6; ld.shared.f32 %f17, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f18, [%r18]; max.f32 %f19, %f17, %f18; st.shared.f32 [%r7], %f19; BB0_6: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_4; BB0_7: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_9; ld.volatile.shared.f32 %f20, [%r7]; ld.volatile.shared.f32 %f21, [%r7+128]; max.f32 %f22, %f20, %f21; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+64]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; 
st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, [%r7+32]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+16]; ld.volatile.shared.f32 %f30, [%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], %f31; ld.volatile.shared.f32 %f32, [%r7+8]; ld.volatile.shared.f32 %f33, [%r7]; max.f32 %f34, %f33, %f32; st.volatile.shared.f32 [%r7], %f34; ld.volatile.shared.f32 %f35, [%r7+4]; ld.volatile.shared.f32 %f36, [%r7]; max.f32 %f37, %f36, %f35; st.volatile.shared.f32 [%r7], %f37; BB0_9: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_11; ld.shared.f32 %f38, [_ZZ17reducemaxvecdiff2E5sdata]; abs.f32 %f39, %f38; mov.b32 %r19, %f39; cvta.to.global.u64 %rd21, %rd7; atom.global.max.s32 %r20, [%rd21], %r19; BB0_11: ret; } ` ) mumax3-3.10/cuda/reducemaxvecnorm2.cu000066400000000000000000000005261371432437400175600ustar00rootroot00000000000000#include "reduce.h" #include "atomicf.h" #include "float3.h" #define load_vecnorm2(i) \ pow2(x[i]) + pow2(y[i]) + pow2(z[i]) extern "C" __global__ void reducemaxvecnorm2(float* __restrict__ x, float* __restrict__ y, float* __restrict__ z, float* __restrict__ dst, float initVal, int n) { reduce(load_vecnorm2, fmax, atomicFmaxabs) } mumax3-3.10/cuda/reducemaxvecnorm2_wrapper.go000066400000000000000000001314001371432437400213120ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for reducemaxvecnorm2 kernel var reducemaxvecnorm2_code cu.Function // Stores the arguments for reducemaxvecnorm2 kernel invocation type reducemaxvecnorm2_args_t struct { arg_x unsafe.Pointer arg_y unsafe.Pointer arg_z unsafe.Pointer arg_dst unsafe.Pointer arg_initVal float32 arg_n int argptr [6]unsafe.Pointer sync.Mutex } // Stores the arguments for reducemaxvecnorm2 kernel invocation var reducemaxvecnorm2_args reducemaxvecnorm2_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. reducemaxvecnorm2_args.argptr[0] = unsafe.Pointer(&reducemaxvecnorm2_args.arg_x) reducemaxvecnorm2_args.argptr[1] = unsafe.Pointer(&reducemaxvecnorm2_args.arg_y) reducemaxvecnorm2_args.argptr[2] = unsafe.Pointer(&reducemaxvecnorm2_args.arg_z) reducemaxvecnorm2_args.argptr[3] = unsafe.Pointer(&reducemaxvecnorm2_args.arg_dst) reducemaxvecnorm2_args.argptr[4] = unsafe.Pointer(&reducemaxvecnorm2_args.arg_initVal) reducemaxvecnorm2_args.argptr[5] = unsafe.Pointer(&reducemaxvecnorm2_args.arg_n) } // Wrapper for reducemaxvecnorm2 CUDA kernel, asynchronous. 
func k_reducemaxvecnorm2_async(x unsafe.Pointer, y unsafe.Pointer, z unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) { if Synchronous { // debug Sync() timer.Start("reducemaxvecnorm2") } reducemaxvecnorm2_args.Lock() defer reducemaxvecnorm2_args.Unlock() if reducemaxvecnorm2_code == 0 { reducemaxvecnorm2_code = fatbinLoad(reducemaxvecnorm2_map, "reducemaxvecnorm2") } reducemaxvecnorm2_args.arg_x = x reducemaxvecnorm2_args.arg_y = y reducemaxvecnorm2_args.arg_z = z reducemaxvecnorm2_args.arg_dst = dst reducemaxvecnorm2_args.arg_initVal = initVal reducemaxvecnorm2_args.arg_n = n args := reducemaxvecnorm2_args.argptr[:] cu.LaunchKernel(reducemaxvecnorm2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("reducemaxvecnorm2") } } // maps compute capability on PTX code for reducemaxvecnorm2 kernel. var reducemaxvecnorm2_map = map[int]string{0: "", 30: reducemaxvecnorm2_ptx_30, 32: reducemaxvecnorm2_ptx_32, 35: reducemaxvecnorm2_ptx_35, 37: reducemaxvecnorm2_ptx_37, 50: reducemaxvecnorm2_ptx_50, 52: reducemaxvecnorm2_ptx_52, 53: reducemaxvecnorm2_ptx_53, 60: reducemaxvecnorm2_ptx_60, 61: reducemaxvecnorm2_ptx_61, 62: reducemaxvecnorm2_ptx_62, 70: reducemaxvecnorm2_ptx_70, 72: reducemaxvecnorm2_ptx_72, 75: reducemaxvecnorm2_ptx_75} // reducemaxvecnorm2 PTX code for various compute capabilities. 
const ( reducemaxvecnorm2_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<8>; .reg .f32 %f<36>; .reg .b32 %r<23>; .reg .b64 %rd<13>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd5, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd6, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd7, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd4, [reducemaxvecnorm2_param_3]; ld.param.f32 %f35, [reducemaxvecnorm2_param_4]; ld.param.u32 %r10, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd7; cvta.to.global.u64 %rd2, %rd6; cvta.to.global.u64 %rd3, %rd5; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd8, %r21, 4; add.s64 %rd9, %rd3, %rd8; ld.global.f32 %f5, [%rd9]; add.s64 %rd10, %rd2, %rd8; ld.global.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f6; fma.rn.f32 %f8, %f5, %f5, %f7; add.s64 %rd11, %rd1, %rd8; ld.global.f32 %f9, [%rd11]; fma.rn.f32 %f10, %f9, %f9, %f8; max.f32 %f35, %f35, %f10; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f35; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f11, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f12, [%r18]; max.f32 %f13, %f11, %f12; st.shared.f32 [%r7], %f13; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 
bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f14, [%r7]; ld.volatile.shared.f32 %f15, [%r7+128]; max.f32 %f16, %f14, %f15; st.volatile.shared.f32 [%r7], %f16; ld.volatile.shared.f32 %f17, [%r7+64]; ld.volatile.shared.f32 %f18, [%r7]; max.f32 %f19, %f18, %f17; st.volatile.shared.f32 [%r7], %f19; ld.volatile.shared.f32 %f20, [%r7+32]; ld.volatile.shared.f32 %f21, [%r7]; max.f32 %f22, %f21, %f20; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+16]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, [%r7+8]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+4]; ld.volatile.shared.f32 %f30, [%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], %f31; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f32, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f33, %f32; mov.b32 %r19, %f33; cvta.to.global.u64 %rd12, %rd4; atom.global.max.s32 %r20, [%rd12], %r19; BB0_10: ret; } ` reducemaxvecnorm2_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<8>; .reg .f32 %f<36>; .reg .b32 %r<23>; .reg .b64 %rd<13>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd5, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd6, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd7, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd4, [reducemaxvecnorm2_param_3]; ld.param.f32 %f35, [reducemaxvecnorm2_param_4]; ld.param.u32 %r10, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd7; cvta.to.global.u64 %rd2, %rd6; 
cvta.to.global.u64 %rd3, %rd5; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd8, %r21, 4; add.s64 %rd9, %rd3, %rd8; ld.global.nc.f32 %f5, [%rd9]; add.s64 %rd10, %rd2, %rd8; ld.global.nc.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f6; fma.rn.f32 %f8, %f5, %f5, %f7; add.s64 %rd11, %rd1, %rd8; ld.global.nc.f32 %f9, [%rd11]; fma.rn.f32 %f10, %f9, %f9, %f8; max.f32 %f35, %f35, %f10; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f35; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f11, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f12, [%r18]; max.f32 %f13, %f11, %f12; st.shared.f32 [%r7], %f13; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f14, [%r7]; ld.volatile.shared.f32 %f15, [%r7+128]; max.f32 %f16, %f14, %f15; st.volatile.shared.f32 [%r7], %f16; ld.volatile.shared.f32 %f17, [%r7+64]; ld.volatile.shared.f32 %f18, [%r7]; max.f32 %f19, %f18, %f17; st.volatile.shared.f32 [%r7], %f19; ld.volatile.shared.f32 %f20, [%r7+32]; ld.volatile.shared.f32 %f21, [%r7]; max.f32 %f22, %f21, %f20; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+16]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, [%r7+8]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+4]; ld.volatile.shared.f32 %f30, [%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], 
%f31; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f32, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f33, %f32; mov.b32 %r19, %f33; cvta.to.global.u64 %rd12, %rd4; atom.global.max.s32 %r20, [%rd12], %r19; BB0_10: ret; } ` reducemaxvecnorm2_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<8>; .reg .f32 %f<36>; .reg .b32 %r<23>; .reg .b64 %rd<13>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd5, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd6, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd7, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd4, [reducemaxvecnorm2_param_3]; ld.param.f32 %f35, [reducemaxvecnorm2_param_4]; ld.param.u32 %r10, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd7; cvta.to.global.u64 %rd2, %rd6; cvta.to.global.u64 %rd3, %rd5; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd8, %r21, 4; add.s64 %rd9, %rd3, %rd8; ld.global.nc.f32 %f5, [%rd9]; add.s64 %rd10, %rd2, %rd8; ld.global.nc.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f6; fma.rn.f32 %f8, %f5, %f5, %f7; add.s64 %rd11, %rd1, %rd8; ld.global.nc.f32 %f9, [%rd11]; fma.rn.f32 %f10, %f9, %f9, %f8; max.f32 %f35, %f35, %f10; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f35; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 
%f11, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f12, [%r18]; max.f32 %f13, %f11, %f12; st.shared.f32 [%r7], %f13; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f14, [%r7]; ld.volatile.shared.f32 %f15, [%r7+128]; max.f32 %f16, %f14, %f15; st.volatile.shared.f32 [%r7], %f16; ld.volatile.shared.f32 %f17, [%r7+64]; ld.volatile.shared.f32 %f18, [%r7]; max.f32 %f19, %f18, %f17; st.volatile.shared.f32 [%r7], %f19; ld.volatile.shared.f32 %f20, [%r7+32]; ld.volatile.shared.f32 %f21, [%r7]; max.f32 %f22, %f21, %f20; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+16]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, [%r7+8]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+4]; ld.volatile.shared.f32 %f30, [%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], %f31; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f32, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f33, %f32; mov.b32 %r19, %f33; cvta.to.global.u64 %rd12, %rd4; atom.global.max.s32 %r20, [%rd12], %r19; BB0_10: ret; } ` reducemaxvecnorm2_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<8>; .reg .f32 %f<36>; .reg .b32 %r<23>; .reg .b64 %rd<13>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd5, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd6, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd7, 
[reducemaxvecnorm2_param_2]; ld.param.u64 %rd4, [reducemaxvecnorm2_param_3]; ld.param.f32 %f35, [reducemaxvecnorm2_param_4]; ld.param.u32 %r10, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd7; cvta.to.global.u64 %rd2, %rd6; cvta.to.global.u64 %rd3, %rd5; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd8, %r21, 4; add.s64 %rd9, %rd3, %rd8; ld.global.nc.f32 %f5, [%rd9]; add.s64 %rd10, %rd2, %rd8; ld.global.nc.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f6; fma.rn.f32 %f8, %f5, %f5, %f7; add.s64 %rd11, %rd1, %rd8; ld.global.nc.f32 %f9, [%rd11]; fma.rn.f32 %f10, %f9, %f9, %f8; max.f32 %f35, %f35, %f10; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f35; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f11, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f12, [%r18]; max.f32 %f13, %f11, %f12; st.shared.f32 [%r7], %f13; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f14, [%r7]; ld.volatile.shared.f32 %f15, [%r7+128]; max.f32 %f16, %f14, %f15; st.volatile.shared.f32 [%r7], %f16; ld.volatile.shared.f32 %f17, [%r7+64]; ld.volatile.shared.f32 %f18, [%r7]; max.f32 %f19, %f18, %f17; st.volatile.shared.f32 [%r7], %f19; ld.volatile.shared.f32 %f20, [%r7+32]; ld.volatile.shared.f32 %f21, [%r7]; max.f32 %f22, %f21, %f20; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+16]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, 
[%r7+8]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+4]; ld.volatile.shared.f32 %f30, [%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], %f31; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f32, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f33, %f32; mov.b32 %r19, %f33; cvta.to.global.u64 %rd12, %rd4; atom.global.max.s32 %r20, [%rd12], %r19; BB0_10: ret; } ` reducemaxvecnorm2_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<8>; .reg .f32 %f<36>; .reg .b32 %r<23>; .reg .b64 %rd<13>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd5, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd6, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd7, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd4, [reducemaxvecnorm2_param_3]; ld.param.f32 %f35, [reducemaxvecnorm2_param_4]; ld.param.u32 %r10, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd7; cvta.to.global.u64 %rd2, %rd6; cvta.to.global.u64 %rd3, %rd5; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd8, %r21, 4; add.s64 %rd9, %rd3, %rd8; ld.global.nc.f32 %f5, [%rd9]; add.s64 %rd10, %rd2, %rd8; ld.global.nc.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f6; fma.rn.f32 %f8, %f5, %f5, %f7; add.s64 %rd11, %rd1, %rd8; ld.global.nc.f32 %f9, [%rd11]; fma.rn.f32 %f10, %f9, %f9, %f8; max.f32 %f35, %f35, %f10; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; 
mov.u32 %r14, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f35; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f11, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f12, [%r18]; max.f32 %f13, %f11, %f12; st.shared.f32 [%r7], %f13; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f14, [%r7]; ld.volatile.shared.f32 %f15, [%r7+128]; max.f32 %f16, %f14, %f15; st.volatile.shared.f32 [%r7], %f16; ld.volatile.shared.f32 %f17, [%r7+64]; ld.volatile.shared.f32 %f18, [%r7]; max.f32 %f19, %f18, %f17; st.volatile.shared.f32 [%r7], %f19; ld.volatile.shared.f32 %f20, [%r7+32]; ld.volatile.shared.f32 %f21, [%r7]; max.f32 %f22, %f21, %f20; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+16]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, [%r7+8]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+4]; ld.volatile.shared.f32 %f30, [%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], %f31; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f32, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f33, %f32; mov.b32 %r19, %f33; cvta.to.global.u64 %rd12, %rd4; atom.global.max.s32 %r20, [%rd12], %r19; BB0_10: ret; } ` reducemaxvecnorm2_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<8>; .reg .f32 %f<36>; 
.reg .b32 %r<23>; .reg .b64 %rd<13>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd5, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd6, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd7, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd4, [reducemaxvecnorm2_param_3]; ld.param.f32 %f35, [reducemaxvecnorm2_param_4]; ld.param.u32 %r10, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd7; cvta.to.global.u64 %rd2, %rd6; cvta.to.global.u64 %rd3, %rd5; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd8, %r21, 4; add.s64 %rd9, %rd3, %rd8; ld.global.nc.f32 %f5, [%rd9]; add.s64 %rd10, %rd2, %rd8; ld.global.nc.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f6; fma.rn.f32 %f8, %f5, %f5, %f7; add.s64 %rd11, %rd1, %rd8; ld.global.nc.f32 %f9, [%rd11]; fma.rn.f32 %f10, %f9, %f9, %f8; max.f32 %f35, %f35, %f10; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f35; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f11, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f12, [%r18]; max.f32 %f13, %f11, %f12; st.shared.f32 [%r7], %f13; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f14, [%r7]; ld.volatile.shared.f32 %f15, [%r7+128]; max.f32 %f16, %f14, %f15; st.volatile.shared.f32 [%r7], %f16; ld.volatile.shared.f32 %f17, [%r7+64]; ld.volatile.shared.f32 %f18, [%r7]; max.f32 %f19, %f18, %f17; st.volatile.shared.f32 [%r7], %f19; ld.volatile.shared.f32 %f20, [%r7+32]; ld.volatile.shared.f32 %f21, [%r7]; 
max.f32 %f22, %f21, %f20; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+16]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, [%r7+8]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+4]; ld.volatile.shared.f32 %f30, [%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], %f31; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f32, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f33, %f32; mov.b32 %r19, %f33; cvta.to.global.u64 %rd12, %rd4; atom.global.max.s32 %r20, [%rd12], %r19; BB0_10: ret; } ` reducemaxvecnorm2_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<8>; .reg .f32 %f<36>; .reg .b32 %r<23>; .reg .b64 %rd<13>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd5, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd6, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd7, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd4, [reducemaxvecnorm2_param_3]; ld.param.f32 %f35, [reducemaxvecnorm2_param_4]; ld.param.u32 %r10, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd7; cvta.to.global.u64 %rd2, %rd6; cvta.to.global.u64 %rd3, %rd5; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd8, %r21, 4; add.s64 %rd9, %rd3, %rd8; ld.global.nc.f32 %f5, [%rd9]; add.s64 %rd10, %rd2, %rd8; ld.global.nc.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f6; fma.rn.f32 %f8, 
%f5, %f5, %f7; add.s64 %rd11, %rd1, %rd8; ld.global.nc.f32 %f9, [%rd11]; fma.rn.f32 %f10, %f9, %f9, %f8; max.f32 %f35, %f35, %f10; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f35; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f11, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f12, [%r18]; max.f32 %f13, %f11, %f12; st.shared.f32 [%r7], %f13; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f14, [%r7]; ld.volatile.shared.f32 %f15, [%r7+128]; max.f32 %f16, %f14, %f15; st.volatile.shared.f32 [%r7], %f16; ld.volatile.shared.f32 %f17, [%r7+64]; ld.volatile.shared.f32 %f18, [%r7]; max.f32 %f19, %f18, %f17; st.volatile.shared.f32 [%r7], %f19; ld.volatile.shared.f32 %f20, [%r7+32]; ld.volatile.shared.f32 %f21, [%r7]; max.f32 %f22, %f21, %f20; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+16]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, [%r7+8]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+4]; ld.volatile.shared.f32 %f30, [%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], %f31; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f32, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f33, %f32; mov.b32 %r19, %f33; cvta.to.global.u64 %rd12, %rd4; atom.global.max.s32 %r20, [%rd12], %r19; BB0_10: ret; } ` reducemaxvecnorm2_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, 
.param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<8>; .reg .f32 %f<36>; .reg .b32 %r<23>; .reg .b64 %rd<13>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd5, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd6, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd7, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd4, [reducemaxvecnorm2_param_3]; ld.param.f32 %f35, [reducemaxvecnorm2_param_4]; ld.param.u32 %r10, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd7; cvta.to.global.u64 %rd2, %rd6; cvta.to.global.u64 %rd3, %rd5; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd8, %r21, 4; add.s64 %rd9, %rd3, %rd8; ld.global.nc.f32 %f5, [%rd9]; add.s64 %rd10, %rd2, %rd8; ld.global.nc.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f6; fma.rn.f32 %f8, %f5, %f5, %f7; add.s64 %rd11, %rd1, %rd8; ld.global.nc.f32 %f9, [%rd11]; fma.rn.f32 %f10, %f9, %f9, %f8; max.f32 %f35, %f35, %f10; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f35; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f11, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f12, [%r18]; max.f32 %f13, %f11, %f12; st.shared.f32 [%r7], %f13; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f14, [%r7]; ld.volatile.shared.f32 %f15, [%r7+128]; max.f32 %f16, %f14, %f15; 
st.volatile.shared.f32 [%r7], %f16; ld.volatile.shared.f32 %f17, [%r7+64]; ld.volatile.shared.f32 %f18, [%r7]; max.f32 %f19, %f18, %f17; st.volatile.shared.f32 [%r7], %f19; ld.volatile.shared.f32 %f20, [%r7+32]; ld.volatile.shared.f32 %f21, [%r7]; max.f32 %f22, %f21, %f20; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+16]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, [%r7+8]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+4]; ld.volatile.shared.f32 %f30, [%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], %f31; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f32, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f33, %f32; mov.b32 %r19, %f33; cvta.to.global.u64 %rd12, %rd4; atom.global.max.s32 %r20, [%rd12], %r19; BB0_10: ret; } ` reducemaxvecnorm2_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<8>; .reg .f32 %f<36>; .reg .b32 %r<23>; .reg .b64 %rd<13>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd5, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd6, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd7, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd4, [reducemaxvecnorm2_param_3]; ld.param.f32 %f35, [reducemaxvecnorm2_param_4]; ld.param.u32 %r10, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd7; cvta.to.global.u64 %rd2, %rd6; cvta.to.global.u64 %rd3, %rd5; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, 
%r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd8, %r21, 4; add.s64 %rd9, %rd3, %rd8; ld.global.nc.f32 %f5, [%rd9]; add.s64 %rd10, %rd2, %rd8; ld.global.nc.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f6; fma.rn.f32 %f8, %f5, %f5, %f7; add.s64 %rd11, %rd1, %rd8; ld.global.nc.f32 %f9, [%rd11]; fma.rn.f32 %f10, %f9, %f9, %f8; max.f32 %f35, %f35, %f10; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f35; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f11, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f12, [%r18]; max.f32 %f13, %f11, %f12; st.shared.f32 [%r7], %f13; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f14, [%r7]; ld.volatile.shared.f32 %f15, [%r7+128]; max.f32 %f16, %f14, %f15; st.volatile.shared.f32 [%r7], %f16; ld.volatile.shared.f32 %f17, [%r7+64]; ld.volatile.shared.f32 %f18, [%r7]; max.f32 %f19, %f18, %f17; st.volatile.shared.f32 [%r7], %f19; ld.volatile.shared.f32 %f20, [%r7+32]; ld.volatile.shared.f32 %f21, [%r7]; max.f32 %f22, %f21, %f20; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+16]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, [%r7+8]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+4]; ld.volatile.shared.f32 %f30, [%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], %f31; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f32, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f33, %f32; mov.b32 %r19, %f33; cvta.to.global.u64 %rd12, 
%rd4; atom.global.max.s32 %r20, [%rd12], %r19; BB0_10: ret; } ` reducemaxvecnorm2_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<8>; .reg .f32 %f<36>; .reg .b32 %r<23>; .reg .b64 %rd<13>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd5, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd6, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd7, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd4, [reducemaxvecnorm2_param_3]; ld.param.f32 %f35, [reducemaxvecnorm2_param_4]; ld.param.u32 %r10, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd7; cvta.to.global.u64 %rd2, %rd6; cvta.to.global.u64 %rd3, %rd5; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd8, %r21, 4; add.s64 %rd9, %rd3, %rd8; ld.global.nc.f32 %f5, [%rd9]; add.s64 %rd10, %rd2, %rd8; ld.global.nc.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f6; fma.rn.f32 %f8, %f5, %f5, %f7; add.s64 %rd11, %rd1, %rd8; ld.global.nc.f32 %f9, [%rd11]; fma.rn.f32 %f10, %f9, %f9, %f8; max.f32 %f35, %f35, %f10; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f35; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f11, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f12, [%r18]; max.f32 %f13, %f11, %f12; st.shared.f32 [%r7], %f13; BB0_5: 
bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f14, [%r7]; ld.volatile.shared.f32 %f15, [%r7+128]; max.f32 %f16, %f14, %f15; st.volatile.shared.f32 [%r7], %f16; ld.volatile.shared.f32 %f17, [%r7+64]; ld.volatile.shared.f32 %f18, [%r7]; max.f32 %f19, %f18, %f17; st.volatile.shared.f32 [%r7], %f19; ld.volatile.shared.f32 %f20, [%r7+32]; ld.volatile.shared.f32 %f21, [%r7]; max.f32 %f22, %f21, %f20; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+16]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, [%r7+8]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+4]; ld.volatile.shared.f32 %f30, [%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], %f31; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f32, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f33, %f32; mov.b32 %r19, %f33; cvta.to.global.u64 %rd12, %rd4; atom.global.max.s32 %r20, [%rd12], %r19; BB0_10: ret; } ` reducemaxvecnorm2_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<8>; .reg .f32 %f<36>; .reg .b32 %r<23>; .reg .b64 %rd<13>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd5, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd6, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd7, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd4, [reducemaxvecnorm2_param_3]; ld.param.f32 %f35, [reducemaxvecnorm2_param_4]; ld.param.u32 %r10, [reducemaxvecnorm2_param_5]; 
cvta.to.global.u64 %rd1, %rd7; cvta.to.global.u64 %rd2, %rd6; cvta.to.global.u64 %rd3, %rd5; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd8, %r21, 4; add.s64 %rd9, %rd3, %rd8; ld.global.nc.f32 %f5, [%rd9]; add.s64 %rd10, %rd2, %rd8; ld.global.nc.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f6; fma.rn.f32 %f8, %f5, %f5, %f7; add.s64 %rd11, %rd1, %rd8; ld.global.nc.f32 %f9, [%rd11]; fma.rn.f32 %f10, %f9, %f9, %f8; max.f32 %f35, %f35, %f10; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f35; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f11, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f12, [%r18]; max.f32 %f13, %f11, %f12; st.shared.f32 [%r7], %f13; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f14, [%r7]; ld.volatile.shared.f32 %f15, [%r7+128]; max.f32 %f16, %f14, %f15; st.volatile.shared.f32 [%r7], %f16; ld.volatile.shared.f32 %f17, [%r7+64]; ld.volatile.shared.f32 %f18, [%r7]; max.f32 %f19, %f18, %f17; st.volatile.shared.f32 [%r7], %f19; ld.volatile.shared.f32 %f20, [%r7+32]; ld.volatile.shared.f32 %f21, [%r7]; max.f32 %f22, %f21, %f20; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+16]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, [%r7+8]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+4]; ld.volatile.shared.f32 %f30, 
[%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], %f31; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f32, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f33, %f32; mov.b32 %r19, %f33; cvta.to.global.u64 %rd12, %rd4; atom.global.max.s32 %r20, [%rd12], %r19; BB0_10: ret; } ` reducemaxvecnorm2_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<8>; .reg .f32 %f<36>; .reg .b32 %r<23>; .reg .b64 %rd<13>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd5, [reducemaxvecnorm2_param_0]; ld.param.u64 %rd6, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd7, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd4, [reducemaxvecnorm2_param_3]; ld.param.f32 %f35, [reducemaxvecnorm2_param_4]; ld.param.u32 %r10, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd7; cvta.to.global.u64 %rd2, %rd6; cvta.to.global.u64 %rd3, %rd5; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd8, %r21, 4; add.s64 %rd9, %rd3, %rd8; ld.global.nc.f32 %f5, [%rd9]; add.s64 %rd10, %rd2, %rd8; ld.global.nc.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f6; fma.rn.f32 %f8, %f5, %f5, %f7; add.s64 %rd11, %rd1, %rd8; ld.global.nc.f32 %f9, [%rd11]; fma.rn.f32 %f10, %f9, %f9, %f8; max.f32 %f35, %f35, %f10; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f35; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, 
%r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f11, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f12, [%r18]; max.f32 %f13, %f11, %f12; st.shared.f32 [%r7], %f13; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f14, [%r7]; ld.volatile.shared.f32 %f15, [%r7+128]; max.f32 %f16, %f14, %f15; st.volatile.shared.f32 [%r7], %f16; ld.volatile.shared.f32 %f17, [%r7+64]; ld.volatile.shared.f32 %f18, [%r7]; max.f32 %f19, %f18, %f17; st.volatile.shared.f32 [%r7], %f19; ld.volatile.shared.f32 %f20, [%r7+32]; ld.volatile.shared.f32 %f21, [%r7]; max.f32 %f22, %f21, %f20; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+16]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, [%r7+8]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+4]; ld.volatile.shared.f32 %f30, [%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], %f31; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f32, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f33, %f32; mov.b32 %r19, %f33; cvta.to.global.u64 %rd12, %rd4; atom.global.max.s32 %r20, [%rd12], %r19; BB0_10: ret; } ` reducemaxvecnorm2_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl reducemaxvecnorm2 .visible .entry reducemaxvecnorm2( .param .u64 reducemaxvecnorm2_param_0, .param .u64 reducemaxvecnorm2_param_1, .param .u64 reducemaxvecnorm2_param_2, .param .u64 reducemaxvecnorm2_param_3, .param .f32 reducemaxvecnorm2_param_4, .param .u32 reducemaxvecnorm2_param_5 ) { .reg .pred %p<8>; .reg .f32 %f<36>; .reg .b32 %r<23>; .reg .b64 %rd<13>; // demoted variable .shared .align 4 .b8 _ZZ17reducemaxvecnorm2E5sdata[2048]; ld.param.u64 %rd5, [reducemaxvecnorm2_param_0]; 
ld.param.u64 %rd6, [reducemaxvecnorm2_param_1]; ld.param.u64 %rd7, [reducemaxvecnorm2_param_2]; ld.param.u64 %rd4, [reducemaxvecnorm2_param_3]; ld.param.f32 %f35, [reducemaxvecnorm2_param_4]; ld.param.u32 %r10, [reducemaxvecnorm2_param_5]; cvta.to.global.u64 %rd1, %rd7; cvta.to.global.u64 %rd2, %rd6; cvta.to.global.u64 %rd3, %rd5; mov.u32 %r22, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r21, %r22, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r22; setp.ge.s32 %p1, %r21, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd8, %r21, 4; add.s64 %rd9, %rd3, %rd8; ld.global.nc.f32 %f5, [%rd9]; add.s64 %rd10, %rd2, %rd8; ld.global.nc.f32 %f6, [%rd10]; mul.f32 %f7, %f6, %f6; fma.rn.f32 %f8, %f5, %f5, %f7; add.s64 %rd11, %rd1, %rd8; ld.global.nc.f32 %f9, [%rd11]; fma.rn.f32 %f10, %f9, %f9, %f8; max.f32 %f35, %f35, %f10; add.s32 %r21, %r21, %r4; setp.lt.s32 %p2, %r21, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ17reducemaxvecnorm2E5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f35; bar.sync 0; setp.lt.u32 %p3, %r22, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r22, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f11, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f12, [%r18]; max.f32 %f13, %f11, %f12; st.shared.f32 [%r7], %f13; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r22, 131; mov.u32 %r22, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f14, [%r7]; ld.volatile.shared.f32 %f15, [%r7+128]; max.f32 %f16, %f14, %f15; st.volatile.shared.f32 [%r7], %f16; ld.volatile.shared.f32 %f17, [%r7+64]; ld.volatile.shared.f32 %f18, [%r7]; max.f32 %f19, %f18, %f17; st.volatile.shared.f32 [%r7], %f19; ld.volatile.shared.f32 %f20, [%r7+32]; ld.volatile.shared.f32 %f21, [%r7]; max.f32 %f22, %f21, %f20; st.volatile.shared.f32 [%r7], %f22; ld.volatile.shared.f32 %f23, [%r7+16]; ld.volatile.shared.f32 %f24, [%r7]; max.f32 %f25, %f24, %f23; 
st.volatile.shared.f32 [%r7], %f25; ld.volatile.shared.f32 %f26, [%r7+8]; ld.volatile.shared.f32 %f27, [%r7]; max.f32 %f28, %f27, %f26; st.volatile.shared.f32 [%r7], %f28; ld.volatile.shared.f32 %f29, [%r7+4]; ld.volatile.shared.f32 %f30, [%r7]; max.f32 %f31, %f30, %f29; st.volatile.shared.f32 [%r7], %f31; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f32, [_ZZ17reducemaxvecnorm2E5sdata]; abs.f32 %f33, %f32; mov.b32 %r19, %f33; cvta.to.global.u64 %rd12, %rd4; atom.global.max.s32 %r20, [%rd12], %r19; BB0_10: ret; } ` ) mumax3-3.10/cuda/reducesum.cu000066400000000000000000000003211371432437400161140ustar00rootroot00000000000000#include "reduce.h" #include "sum.h" #define load(i) src[i] extern "C" __global__ void reducesum(float* __restrict__ src, float*__restrict__ dst, float initVal, int n) { reduce(load, sum, atomicAdd) } mumax3-3.10/cuda/reducesum_wrapper.go000066400000000000000000001100401371432437400176520ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for reducesum kernel var reducesum_code cu.Function // Stores the arguments for reducesum kernel invocation type reducesum_args_t struct { arg_src unsafe.Pointer arg_dst unsafe.Pointer arg_initVal float32 arg_n int argptr [4]unsafe.Pointer sync.Mutex } // Stores the arguments for reducesum kernel invocation var reducesum_args reducesum_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. reducesum_args.argptr[0] = unsafe.Pointer(&reducesum_args.arg_src) reducesum_args.argptr[1] = unsafe.Pointer(&reducesum_args.arg_dst) reducesum_args.argptr[2] = unsafe.Pointer(&reducesum_args.arg_initVal) reducesum_args.argptr[3] = unsafe.Pointer(&reducesum_args.arg_n) } // Wrapper for reducesum CUDA kernel, asynchronous. 
func k_reducesum_async(src unsafe.Pointer, dst unsafe.Pointer, initVal float32, n int, cfg *config) { if Synchronous { // debug Sync() timer.Start("reducesum") } reducesum_args.Lock() defer reducesum_args.Unlock() if reducesum_code == 0 { reducesum_code = fatbinLoad(reducesum_map, "reducesum") } reducesum_args.arg_src = src reducesum_args.arg_dst = dst reducesum_args.arg_initVal = initVal reducesum_args.arg_n = n args := reducesum_args.argptr[:] cu.LaunchKernel(reducesum_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("reducesum") } } // maps compute capability on PTX code for reducesum kernel. var reducesum_map = map[int]string{0: "", 30: reducesum_ptx_30, 32: reducesum_ptx_32, 35: reducesum_ptx_35, 37: reducesum_ptx_37, 50: reducesum_ptx_50, 52: reducesum_ptx_52, 53: reducesum_ptx_53, 60: reducesum_ptx_60, 61: reducesum_ptx_61, 62: reducesum_ptx_62, 70: reducesum_ptx_70, 72: reducesum_ptx_72, 75: reducesum_ptx_75} // reducesum PTX code for various compute capabilities. 
const ( reducesum_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<31>; .reg .b32 %r<21>; .reg .b64 %rd<7>; // demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd3, [reducesum_param_0]; ld.param.u64 %rd2, [reducesum_param_1]; ld.param.f32 %f30, [reducesum_param_2]; ld.param.u32 %r10, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r20; setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd1, %rd4; ld.global.f32 %f5, [%rd5]; add.f32 %f30, %f30, %f5; add.s32 %r19, %r19, %r4; setp.lt.s32 %p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ9reducesumE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f30; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f6, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f7, [%r18]; add.f32 %f8, %f6, %f7; st.shared.f32 [%r7], %f8; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r20, 131; mov.u32 %r20, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f9, [%r7]; ld.volatile.shared.f32 %f10, [%r7+128]; add.f32 %f11, %f9, %f10; st.volatile.shared.f32 [%r7], %f11; ld.volatile.shared.f32 %f12, [%r7+64]; ld.volatile.shared.f32 %f13, [%r7]; add.f32 %f14, %f13, %f12; st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+32]; ld.volatile.shared.f32 %f16, [%r7]; add.f32 %f17, %f16, %f15; st.volatile.shared.f32 [%r7], %f17; ld.volatile.shared.f32 %f18, [%r7+16]; ld.volatile.shared.f32 %f19, [%r7]; 
add.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; ld.volatile.shared.f32 %f21, [%r7+8]; ld.volatile.shared.f32 %f22, [%r7]; add.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 %f24, [%r7+4]; ld.volatile.shared.f32 %f25, [%r7]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f27, [_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd6, %rd2; atom.global.add.f32 %f28, [%rd6], %f27; BB0_10: ret; } ` reducesum_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<31>; .reg .b32 %r<21>; .reg .b64 %rd<7>; // demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd3, [reducesum_param_0]; ld.param.u64 %rd2, [reducesum_param_1]; ld.param.f32 %f30, [reducesum_param_2]; ld.param.u32 %r10, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r20; setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd1, %rd4; ld.global.nc.f32 %f5, [%rd5]; add.f32 %f30, %f30, %f5; add.s32 %r19, %r19, %r4; setp.lt.s32 %p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ9reducesumE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f30; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f6, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f7, [%r18]; add.f32 %f8, %f6, %f7; st.shared.f32 [%r7], %f8; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r20, 131; mov.u32 %r20, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, 
%r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f9, [%r7]; ld.volatile.shared.f32 %f10, [%r7+128]; add.f32 %f11, %f9, %f10; st.volatile.shared.f32 [%r7], %f11; ld.volatile.shared.f32 %f12, [%r7+64]; ld.volatile.shared.f32 %f13, [%r7]; add.f32 %f14, %f13, %f12; st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+32]; ld.volatile.shared.f32 %f16, [%r7]; add.f32 %f17, %f16, %f15; st.volatile.shared.f32 [%r7], %f17; ld.volatile.shared.f32 %f18, [%r7+16]; ld.volatile.shared.f32 %f19, [%r7]; add.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; ld.volatile.shared.f32 %f21, [%r7+8]; ld.volatile.shared.f32 %f22, [%r7]; add.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 %f24, [%r7+4]; ld.volatile.shared.f32 %f25, [%r7]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f27, [_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd6, %rd2; atom.global.add.f32 %f28, [%rd6], %f27; BB0_10: ret; } ` reducesum_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<31>; .reg .b32 %r<21>; .reg .b64 %rd<7>; // demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd3, [reducesum_param_0]; ld.param.u64 %rd2, [reducesum_param_1]; ld.param.f32 %f30, [reducesum_param_2]; ld.param.u32 %r10, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r20; setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd1, %rd4; ld.global.nc.f32 %f5, [%rd5]; add.f32 %f30, %f30, %f5; add.s32 %r19, %r19, %r4; setp.lt.s32 %p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 
%r13, %r2, 2; mov.u32 %r14, _ZZ9reducesumE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f30; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f6, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f7, [%r18]; add.f32 %f8, %f6, %f7; st.shared.f32 [%r7], %f8; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r20, 131; mov.u32 %r20, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f9, [%r7]; ld.volatile.shared.f32 %f10, [%r7+128]; add.f32 %f11, %f9, %f10; st.volatile.shared.f32 [%r7], %f11; ld.volatile.shared.f32 %f12, [%r7+64]; ld.volatile.shared.f32 %f13, [%r7]; add.f32 %f14, %f13, %f12; st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+32]; ld.volatile.shared.f32 %f16, [%r7]; add.f32 %f17, %f16, %f15; st.volatile.shared.f32 [%r7], %f17; ld.volatile.shared.f32 %f18, [%r7+16]; ld.volatile.shared.f32 %f19, [%r7]; add.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; ld.volatile.shared.f32 %f21, [%r7+8]; ld.volatile.shared.f32 %f22, [%r7]; add.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 %f24, [%r7+4]; ld.volatile.shared.f32 %f25, [%r7]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f27, [_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd6, %rd2; atom.global.add.f32 %f28, [%rd6], %f27; BB0_10: ret; } ` reducesum_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<31>; .reg .b32 %r<21>; .reg .b64 %rd<7>; // demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd3, [reducesum_param_0]; ld.param.u64 %rd2, [reducesum_param_1]; 
ld.param.f32 %f30, [reducesum_param_2]; ld.param.u32 %r10, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r20; setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd1, %rd4; ld.global.nc.f32 %f5, [%rd5]; add.f32 %f30, %f30, %f5; add.s32 %r19, %r19, %r4; setp.lt.s32 %p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ9reducesumE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f30; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f6, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f7, [%r18]; add.f32 %f8, %f6, %f7; st.shared.f32 [%r7], %f8; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r20, 131; mov.u32 %r20, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f9, [%r7]; ld.volatile.shared.f32 %f10, [%r7+128]; add.f32 %f11, %f9, %f10; st.volatile.shared.f32 [%r7], %f11; ld.volatile.shared.f32 %f12, [%r7+64]; ld.volatile.shared.f32 %f13, [%r7]; add.f32 %f14, %f13, %f12; st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+32]; ld.volatile.shared.f32 %f16, [%r7]; add.f32 %f17, %f16, %f15; st.volatile.shared.f32 [%r7], %f17; ld.volatile.shared.f32 %f18, [%r7+16]; ld.volatile.shared.f32 %f19, [%r7]; add.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; ld.volatile.shared.f32 %f21, [%r7+8]; ld.volatile.shared.f32 %f22, [%r7]; add.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 %f24, [%r7+4]; ld.volatile.shared.f32 %f25, [%r7]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f27, [_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd6, %rd2; 
atom.global.add.f32 %f28, [%rd6], %f27; BB0_10: ret; } ` reducesum_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<31>; .reg .b32 %r<21>; .reg .b64 %rd<7>; // demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd3, [reducesum_param_0]; ld.param.u64 %rd2, [reducesum_param_1]; ld.param.f32 %f30, [reducesum_param_2]; ld.param.u32 %r10, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r20; setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd1, %rd4; ld.global.nc.f32 %f5, [%rd5]; add.f32 %f30, %f30, %f5; add.s32 %r19, %r19, %r4; setp.lt.s32 %p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ9reducesumE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f30; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f6, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f7, [%r18]; add.f32 %f8, %f6, %f7; st.shared.f32 [%r7], %f8; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r20, 131; mov.u32 %r20, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f9, [%r7]; ld.volatile.shared.f32 %f10, [%r7+128]; add.f32 %f11, %f9, %f10; st.volatile.shared.f32 [%r7], %f11; ld.volatile.shared.f32 %f12, [%r7+64]; ld.volatile.shared.f32 %f13, [%r7]; add.f32 %f14, %f13, %f12; st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+32]; ld.volatile.shared.f32 %f16, [%r7]; add.f32 %f17, %f16, %f15; st.volatile.shared.f32 [%r7], %f17; ld.volatile.shared.f32 
%f18, [%r7+16]; ld.volatile.shared.f32 %f19, [%r7]; add.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; ld.volatile.shared.f32 %f21, [%r7+8]; ld.volatile.shared.f32 %f22, [%r7]; add.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 %f24, [%r7+4]; ld.volatile.shared.f32 %f25, [%r7]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f27, [_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd6, %rd2; atom.global.add.f32 %f28, [%rd6], %f27; BB0_10: ret; } ` reducesum_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<31>; .reg .b32 %r<21>; .reg .b64 %rd<7>; // demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd3, [reducesum_param_0]; ld.param.u64 %rd2, [reducesum_param_1]; ld.param.f32 %f30, [reducesum_param_2]; ld.param.u32 %r10, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r20; setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd1, %rd4; ld.global.nc.f32 %f5, [%rd5]; add.f32 %f30, %f30, %f5; add.s32 %r19, %r19, %r4; setp.lt.s32 %p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ9reducesumE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f30; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f6, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f7, [%r18]; add.f32 %f8, %f6, %f7; st.shared.f32 [%r7], %f8; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r20, 131; mov.u32 
%r20, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f9, [%r7]; ld.volatile.shared.f32 %f10, [%r7+128]; add.f32 %f11, %f9, %f10; st.volatile.shared.f32 [%r7], %f11; ld.volatile.shared.f32 %f12, [%r7+64]; ld.volatile.shared.f32 %f13, [%r7]; add.f32 %f14, %f13, %f12; st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+32]; ld.volatile.shared.f32 %f16, [%r7]; add.f32 %f17, %f16, %f15; st.volatile.shared.f32 [%r7], %f17; ld.volatile.shared.f32 %f18, [%r7+16]; ld.volatile.shared.f32 %f19, [%r7]; add.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; ld.volatile.shared.f32 %f21, [%r7+8]; ld.volatile.shared.f32 %f22, [%r7]; add.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 %f24, [%r7+4]; ld.volatile.shared.f32 %f25, [%r7]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f27, [_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd6, %rd2; atom.global.add.f32 %f28, [%rd6], %f27; BB0_10: ret; } ` reducesum_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<31>; .reg .b32 %r<21>; .reg .b64 %rd<7>; // demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd3, [reducesum_param_0]; ld.param.u64 %rd2, [reducesum_param_1]; ld.param.f32 %f30, [reducesum_param_2]; ld.param.u32 %r10, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r20; setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd1, %rd4; ld.global.nc.f32 %f5, [%rd5]; add.f32 %f30, %f30, %f5; add.s32 %r19, %r19, %r4; setp.lt.s32 
%p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ9reducesumE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f30; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f6, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f7, [%r18]; add.f32 %f8, %f6, %f7; st.shared.f32 [%r7], %f8; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r20, 131; mov.u32 %r20, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f9, [%r7]; ld.volatile.shared.f32 %f10, [%r7+128]; add.f32 %f11, %f9, %f10; st.volatile.shared.f32 [%r7], %f11; ld.volatile.shared.f32 %f12, [%r7+64]; ld.volatile.shared.f32 %f13, [%r7]; add.f32 %f14, %f13, %f12; st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+32]; ld.volatile.shared.f32 %f16, [%r7]; add.f32 %f17, %f16, %f15; st.volatile.shared.f32 [%r7], %f17; ld.volatile.shared.f32 %f18, [%r7+16]; ld.volatile.shared.f32 %f19, [%r7]; add.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; ld.volatile.shared.f32 %f21, [%r7+8]; ld.volatile.shared.f32 %f22, [%r7]; add.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 %f24, [%r7+4]; ld.volatile.shared.f32 %f25, [%r7]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f27, [_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd6, %rd2; atom.global.add.f32 %f28, [%rd6], %f27; BB0_10: ret; } ` reducesum_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<31>; .reg .b32 %r<21>; .reg .b64 %rd<7>; // demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd3, 
[reducesum_param_0]; ld.param.u64 %rd2, [reducesum_param_1]; ld.param.f32 %f30, [reducesum_param_2]; ld.param.u32 %r10, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r20; setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd1, %rd4; ld.global.nc.f32 %f5, [%rd5]; add.f32 %f30, %f30, %f5; add.s32 %r19, %r19, %r4; setp.lt.s32 %p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ9reducesumE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f30; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f6, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f7, [%r18]; add.f32 %f8, %f6, %f7; st.shared.f32 [%r7], %f8; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r20, 131; mov.u32 %r20, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f9, [%r7]; ld.volatile.shared.f32 %f10, [%r7+128]; add.f32 %f11, %f9, %f10; st.volatile.shared.f32 [%r7], %f11; ld.volatile.shared.f32 %f12, [%r7+64]; ld.volatile.shared.f32 %f13, [%r7]; add.f32 %f14, %f13, %f12; st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+32]; ld.volatile.shared.f32 %f16, [%r7]; add.f32 %f17, %f16, %f15; st.volatile.shared.f32 [%r7], %f17; ld.volatile.shared.f32 %f18, [%r7+16]; ld.volatile.shared.f32 %f19, [%r7]; add.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; ld.volatile.shared.f32 %f21, [%r7+8]; ld.volatile.shared.f32 %f22, [%r7]; add.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 %f24, [%r7+4]; ld.volatile.shared.f32 %f25, [%r7]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f27, 
[_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd6, %rd2; atom.global.add.f32 %f28, [%rd6], %f27; BB0_10: ret; } ` reducesum_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<31>; .reg .b32 %r<21>; .reg .b64 %rd<7>; // demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd3, [reducesum_param_0]; ld.param.u64 %rd2, [reducesum_param_1]; ld.param.f32 %f30, [reducesum_param_2]; ld.param.u32 %r10, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r20; setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd1, %rd4; ld.global.nc.f32 %f5, [%rd5]; add.f32 %f30, %f30, %f5; add.s32 %r19, %r19, %r4; setp.lt.s32 %p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ9reducesumE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f30; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f6, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f7, [%r18]; add.f32 %f8, %f6, %f7; st.shared.f32 [%r7], %f8; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r20, 131; mov.u32 %r20, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f9, [%r7]; ld.volatile.shared.f32 %f10, [%r7+128]; add.f32 %f11, %f9, %f10; st.volatile.shared.f32 [%r7], %f11; ld.volatile.shared.f32 %f12, [%r7+64]; ld.volatile.shared.f32 %f13, [%r7]; add.f32 %f14, %f13, %f12; st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+32]; ld.volatile.shared.f32 %f16, [%r7]; add.f32 %f17, %f16, %f15; 
st.volatile.shared.f32 [%r7], %f17; ld.volatile.shared.f32 %f18, [%r7+16]; ld.volatile.shared.f32 %f19, [%r7]; add.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; ld.volatile.shared.f32 %f21, [%r7+8]; ld.volatile.shared.f32 %f22, [%r7]; add.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 %f24, [%r7+4]; ld.volatile.shared.f32 %f25, [%r7]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f27, [_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd6, %rd2; atom.global.add.f32 %f28, [%rd6], %f27; BB0_10: ret; } ` reducesum_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<31>; .reg .b32 %r<21>; .reg .b64 %rd<7>; // demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd3, [reducesum_param_0]; ld.param.u64 %rd2, [reducesum_param_1]; ld.param.f32 %f30, [reducesum_param_2]; ld.param.u32 %r10, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r20; setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd1, %rd4; ld.global.nc.f32 %f5, [%rd5]; add.f32 %f30, %f30, %f5; add.s32 %r19, %r19, %r4; setp.lt.s32 %p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ9reducesumE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f30; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f6, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f7, [%r18]; add.f32 %f8, %f6, %f7; st.shared.f32 [%r7], 
%f8; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r20, 131; mov.u32 %r20, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f9, [%r7]; ld.volatile.shared.f32 %f10, [%r7+128]; add.f32 %f11, %f9, %f10; st.volatile.shared.f32 [%r7], %f11; ld.volatile.shared.f32 %f12, [%r7+64]; ld.volatile.shared.f32 %f13, [%r7]; add.f32 %f14, %f13, %f12; st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+32]; ld.volatile.shared.f32 %f16, [%r7]; add.f32 %f17, %f16, %f15; st.volatile.shared.f32 [%r7], %f17; ld.volatile.shared.f32 %f18, [%r7+16]; ld.volatile.shared.f32 %f19, [%r7]; add.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; ld.volatile.shared.f32 %f21, [%r7+8]; ld.volatile.shared.f32 %f22, [%r7]; add.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 %f24, [%r7+4]; ld.volatile.shared.f32 %f25, [%r7]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f27, [_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd6, %rd2; atom.global.add.f32 %f28, [%rd6], %f27; BB0_10: ret; } ` reducesum_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<31>; .reg .b32 %r<21>; .reg .b64 %rd<7>; // demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd3, [reducesum_param_0]; ld.param.u64 %rd2, [reducesum_param_1]; ld.param.f32 %f30, [reducesum_param_2]; ld.param.u32 %r10, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r20; setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd1, %rd4; ld.global.nc.f32 %f5, [%rd5]; 
add.f32 %f30, %f30, %f5; add.s32 %r19, %r19, %r4; setp.lt.s32 %p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ9reducesumE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f30; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f6, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f7, [%r18]; add.f32 %f8, %f6, %f7; st.shared.f32 [%r7], %f8; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r20, 131; mov.u32 %r20, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f9, [%r7]; ld.volatile.shared.f32 %f10, [%r7+128]; add.f32 %f11, %f9, %f10; st.volatile.shared.f32 [%r7], %f11; ld.volatile.shared.f32 %f12, [%r7+64]; ld.volatile.shared.f32 %f13, [%r7]; add.f32 %f14, %f13, %f12; st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+32]; ld.volatile.shared.f32 %f16, [%r7]; add.f32 %f17, %f16, %f15; st.volatile.shared.f32 [%r7], %f17; ld.volatile.shared.f32 %f18, [%r7+16]; ld.volatile.shared.f32 %f19, [%r7]; add.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; ld.volatile.shared.f32 %f21, [%r7+8]; ld.volatile.shared.f32 %f22, [%r7]; add.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 %f24, [%r7+4]; ld.volatile.shared.f32 %f25, [%r7]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f27, [_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd6, %rd2; atom.global.add.f32 %f28, [%rd6], %f27; BB0_10: ret; } ` reducesum_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<31>; .reg .b32 %r<21>; .reg .b64 %rd<7>; // demoted variable .shared .align 4 .b8 
_ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd3, [reducesum_param_0]; ld.param.u64 %rd2, [reducesum_param_1]; ld.param.f32 %f30, [reducesum_param_2]; ld.param.u32 %r10, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r20; setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd1, %rd4; ld.global.nc.f32 %f5, [%rd5]; add.f32 %f30, %f30, %f5; add.s32 %r19, %r19, %r4; setp.lt.s32 %p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ9reducesumE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f30; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f6, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f7, [%r18]; add.f32 %f8, %f6, %f7; st.shared.f32 [%r7], %f8; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r20, 131; mov.u32 %r20, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f9, [%r7]; ld.volatile.shared.f32 %f10, [%r7+128]; add.f32 %f11, %f9, %f10; st.volatile.shared.f32 [%r7], %f11; ld.volatile.shared.f32 %f12, [%r7+64]; ld.volatile.shared.f32 %f13, [%r7]; add.f32 %f14, %f13, %f12; st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+32]; ld.volatile.shared.f32 %f16, [%r7]; add.f32 %f17, %f16, %f15; st.volatile.shared.f32 [%r7], %f17; ld.volatile.shared.f32 %f18, [%r7+16]; ld.volatile.shared.f32 %f19, [%r7]; add.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; ld.volatile.shared.f32 %f21, [%r7+8]; ld.volatile.shared.f32 %f22, [%r7]; add.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 %f24, [%r7+4]; ld.volatile.shared.f32 %f25, [%r7]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; BB0_8: setp.ne.s32 %p7, 
%r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f27, [_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd6, %rd2; atom.global.add.f32 %f28, [%rd6], %f27; BB0_10: ret; } ` reducesum_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl reducesum .visible .entry reducesum( .param .u64 reducesum_param_0, .param .u64 reducesum_param_1, .param .f32 reducesum_param_2, .param .u32 reducesum_param_3 ) { .reg .pred %p<8>; .reg .f32 %f<31>; .reg .b32 %r<21>; .reg .b64 %rd<7>; // demoted variable .shared .align 4 .b8 _ZZ9reducesumE5sdata[2048]; ld.param.u64 %rd3, [reducesum_param_0]; ld.param.u64 %rd2, [reducesum_param_1]; ld.param.f32 %f30, [reducesum_param_2]; ld.param.u32 %r10, [reducesum_param_3]; cvta.to.global.u64 %rd1, %rd3; mov.u32 %r20, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r2, %tid.x; mad.lo.s32 %r19, %r20, %r11, %r2; mov.u32 %r12, %nctaid.x; mul.lo.s32 %r4, %r12, %r20; setp.ge.s32 %p1, %r19, %r10; @%p1 bra BB0_2; BB0_1: mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd1, %rd4; ld.global.nc.f32 %f5, [%rd5]; add.f32 %f30, %f30, %f5; add.s32 %r19, %r19, %r4; setp.lt.s32 %p2, %r19, %r10; @%p2 bra BB0_1; BB0_2: shl.b32 %r13, %r2, 2; mov.u32 %r14, _ZZ9reducesumE5sdata; add.s32 %r7, %r14, %r13; st.shared.f32 [%r7], %f30; bar.sync 0; setp.lt.u32 %p3, %r20, 66; @%p3 bra BB0_6; BB0_3: shr.u32 %r9, %r20, 1; setp.ge.u32 %p4, %r2, %r9; @%p4 bra BB0_5; ld.shared.f32 %f6, [%r7]; add.s32 %r15, %r9, %r2; shl.b32 %r16, %r15, 2; add.s32 %r18, %r14, %r16; ld.shared.f32 %f7, [%r18]; add.f32 %f8, %f6, %f7; st.shared.f32 [%r7], %f8; BB0_5: bar.sync 0; setp.gt.u32 %p5, %r20, 131; mov.u32 %r20, %r9; @%p5 bra BB0_3; BB0_6: setp.gt.s32 %p6, %r2, 31; @%p6 bra BB0_8; ld.volatile.shared.f32 %f9, [%r7]; ld.volatile.shared.f32 %f10, [%r7+128]; add.f32 %f11, %f9, %f10; st.volatile.shared.f32 [%r7], %f11; ld.volatile.shared.f32 %f12, [%r7+64]; ld.volatile.shared.f32 %f13, [%r7]; add.f32 %f14, %f13, %f12; st.volatile.shared.f32 [%r7], %f14; ld.volatile.shared.f32 %f15, [%r7+32]; ld.volatile.shared.f32 
%f16, [%r7]; add.f32 %f17, %f16, %f15; st.volatile.shared.f32 [%r7], %f17; ld.volatile.shared.f32 %f18, [%r7+16]; ld.volatile.shared.f32 %f19, [%r7]; add.f32 %f20, %f19, %f18; st.volatile.shared.f32 [%r7], %f20; ld.volatile.shared.f32 %f21, [%r7+8]; ld.volatile.shared.f32 %f22, [%r7]; add.f32 %f23, %f22, %f21; st.volatile.shared.f32 [%r7], %f23; ld.volatile.shared.f32 %f24, [%r7+4]; ld.volatile.shared.f32 %f25, [%r7]; add.f32 %f26, %f25, %f24; st.volatile.shared.f32 [%r7], %f26; BB0_8: setp.ne.s32 %p7, %r2, 0; @%p7 bra BB0_10; ld.shared.f32 %f27, [_ZZ9reducesumE5sdata]; cvta.to.global.u64 %rd6, %rd2; atom.global.add.f32 %f28, [%rd6], %f27; BB0_10: ret; } ` ) mumax3-3.10/cuda/region.go000066400000000000000000000024221371432437400154050ustar00rootroot00000000000000package cuda import ( "unsafe" "github.com/mumax/3/data" "github.com/mumax/3/util" ) // dst += LUT[region], for vectors. Used to add terms to excitation. func RegionAddV(dst *data.Slice, lut LUTPtrs, regions *Bytes) { util.Argument(dst.NComp() == 3) N := dst.Len() cfg := make1DConf(N) k_regionaddv_async(dst.DevPtr(X), dst.DevPtr(Y), dst.DevPtr(Z), lut[X], lut[Y], lut[Z], regions.Ptr, N, cfg) } // dst += LUT[region], for scalar. Used to add terms to scalar excitation. func RegionAddS(dst *data.Slice, lut LUTPtr, regions *Bytes) { util.Argument(dst.NComp() == 1) N := dst.Len() cfg := make1DConf(N) k_regionadds_async(dst.DevPtr(0), unsafe.Pointer(lut), regions.Ptr, N, cfg) } // decode the regions+LUT pair into an uncompressed array func RegionDecode(dst *data.Slice, lut LUTPtr, regions *Bytes) { N := dst.Len() cfg := make1DConf(N) k_regiondecode_async(dst.DevPtr(0), unsafe.Pointer(lut), regions.Ptr, N, cfg) } // select the part of src within the specified region, set 0's everywhere else. 
func RegionSelect(dst, src *data.Slice, regions *Bytes, region byte) { util.Argument(dst.NComp() == src.NComp()) N := dst.Len() cfg := make1DConf(N) for c := 0; c < dst.NComp(); c++ { k_regionselect_async(dst.DevPtr(c), src.DevPtr(c), regions.Ptr, region, N, cfg) } } mumax3-3.10/cuda/regionadds.cu000066400000000000000000000005551371432437400162500ustar00rootroot00000000000000#include <stdint.h> // add region-based scalar to dst: // dst[i] += LUT[region[i]] extern "C" __global__ void regionadds(float* __restrict__ dst, float* __restrict__ LUT, uint8_t* regions, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { uint8_t r = regions[i]; dst[i] += LUT[r]; } } mumax3-3.10/cuda/regionadds_wrapper.go000066400000000000000000000437331371432437400200130ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for regionadds kernel var regionadds_code cu.Function // Stores the arguments for regionadds kernel invocation type regionadds_args_t struct { arg_dst unsafe.Pointer arg_LUT unsafe.Pointer arg_regions unsafe.Pointer arg_N int argptr [4]unsafe.Pointer sync.Mutex } // Stores the arguments for regionadds kernel invocation var regionadds_args regionadds_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. regionadds_args.argptr[0] = unsafe.Pointer(&regionadds_args.arg_dst) regionadds_args.argptr[1] = unsafe.Pointer(&regionadds_args.arg_LUT) regionadds_args.argptr[2] = unsafe.Pointer(&regionadds_args.arg_regions) regionadds_args.argptr[3] = unsafe.Pointer(&regionadds_args.arg_N) } // Wrapper for regionadds CUDA kernel, asynchronous.
func k_regionadds_async(dst unsafe.Pointer, LUT unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("regionadds") } regionadds_args.Lock() defer regionadds_args.Unlock() if regionadds_code == 0 { regionadds_code = fatbinLoad(regionadds_map, "regionadds") } regionadds_args.arg_dst = dst regionadds_args.arg_LUT = LUT regionadds_args.arg_regions = regions regionadds_args.arg_N = N args := regionadds_args.argptr[:] cu.LaunchKernel(regionadds_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("regionadds") } } // maps compute capability on PTX code for regionadds kernel. var regionadds_map = map[int]string{0: "", 30: regionadds_ptx_30, 32: regionadds_ptx_32, 35: regionadds_ptx_35, 37: regionadds_ptx_37, 50: regionadds_ptx_50, 52: regionadds_ptx_52, 53: regionadds_ptx_53, 60: regionadds_ptx_60, 61: regionadds_ptx_61, 62: regionadds_ptx_62, 70: regionadds_ptx_70, 72: regionadds_ptx_72, 75: regionadds_ptx_75} // regionadds PTX code for various compute capabilities. 
const ( regionadds_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<4>; .reg .b32 %r<10>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd2; ld.global.u8 %r9, [%rd6]; mul.wide.u32 %rd8, %r9, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; BB0_2: ret; } ` regionadds_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 
%r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; BB0_2: ret; } ` regionadds_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; BB0_2: ret; } ` regionadds_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, 
%nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; BB0_2: ret; } ` regionadds_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; BB0_2: ret; } ` regionadds_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param 
.u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; BB0_2: ret; } ` regionadds_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, 
%r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; BB0_2: ret; } ` regionadds_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; BB0_2: ret; } ` regionadds_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, 
%r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; BB0_2: ret; } ` regionadds_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; BB0_2: ret; } ` regionadds_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; 
.reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; BB0_2: ret; } ` regionadds_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 
[%rd12], %f3; BB0_2: ret; } ` regionadds_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl regionadds .visible .entry regionadds( .param .u64 regionadds_param_0, .param .u64 regionadds_param_1, .param .u64 regionadds_param_2, .param .u32 regionadds_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<4>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionadds_param_0]; ld.param.u64 %rd2, [regionadds_param_1]; ld.param.u64 %rd3, [regionadds_param_2]; ld.param.u32 %r2, [regionadds_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; ld.global.f32 %f1, [%rd12]; ld.global.nc.f32 %f2, [%rd9]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd12], %f3; BB0_2: ret; } ` ) mumax3-3.10/cuda/regionaddv.cu000066400000000000000000000010461371432437400162470ustar00rootroot00000000000000#include <stdint.h> // add region-based vector to dst: // dst[i] += LUT[region[i]] extern "C" __global__ void regionaddv(float* __restrict__ dstx, float* __restrict__ dsty, float* __restrict__ dstz, float* __restrict__ LUTx, float* __restrict__ LUTy, float* __restrict__ LUTz, uint8_t* regions, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { uint8_t r = regions[i]; dstx[i] += LUTx[r]; dsty[i] += LUTy[r]; dstz[i] += LUTz[r]; } } mumax3-3.10/cuda/regionaddv_wrapper.go000066400000000000000000000715261371432437400200170ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE.
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for regionaddv kernel var regionaddv_code cu.Function // Stores the arguments for regionaddv kernel invocation type regionaddv_args_t struct { arg_dstx unsafe.Pointer arg_dsty unsafe.Pointer arg_dstz unsafe.Pointer arg_LUTx unsafe.Pointer arg_LUTy unsafe.Pointer arg_LUTz unsafe.Pointer arg_regions unsafe.Pointer arg_N int argptr [8]unsafe.Pointer sync.Mutex } // Stores the arguments for regionaddv kernel invocation var regionaddv_args regionaddv_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. regionaddv_args.argptr[0] = unsafe.Pointer(®ionaddv_args.arg_dstx) regionaddv_args.argptr[1] = unsafe.Pointer(®ionaddv_args.arg_dsty) regionaddv_args.argptr[2] = unsafe.Pointer(®ionaddv_args.arg_dstz) regionaddv_args.argptr[3] = unsafe.Pointer(®ionaddv_args.arg_LUTx) regionaddv_args.argptr[4] = unsafe.Pointer(®ionaddv_args.arg_LUTy) regionaddv_args.argptr[5] = unsafe.Pointer(®ionaddv_args.arg_LUTz) regionaddv_args.argptr[6] = unsafe.Pointer(®ionaddv_args.arg_regions) regionaddv_args.argptr[7] = unsafe.Pointer(®ionaddv_args.arg_N) } // Wrapper for regionaddv CUDA kernel, asynchronous. 
func k_regionaddv_async(dstx unsafe.Pointer, dsty unsafe.Pointer, dstz unsafe.Pointer, LUTx unsafe.Pointer, LUTy unsafe.Pointer, LUTz unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("regionaddv") } regionaddv_args.Lock() defer regionaddv_args.Unlock() if regionaddv_code == 0 { regionaddv_code = fatbinLoad(regionaddv_map, "regionaddv") } regionaddv_args.arg_dstx = dstx regionaddv_args.arg_dsty = dsty regionaddv_args.arg_dstz = dstz regionaddv_args.arg_LUTx = LUTx regionaddv_args.arg_LUTy = LUTy regionaddv_args.arg_LUTz = LUTz regionaddv_args.arg_regions = regions regionaddv_args.arg_N = N args := regionaddv_args.argptr[:] cu.LaunchKernel(regionaddv_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("regionaddv") } } // maps compute capability on PTX code for regionaddv kernel. var regionaddv_map = map[int]string{0: "", 30: regionaddv_ptx_30, 32: regionaddv_ptx_32, 35: regionaddv_ptx_35, 37: regionaddv_ptx_37, 50: regionaddv_ptx_50, 52: regionaddv_ptx_52, 53: regionaddv_ptx_53, 60: regionaddv_ptx_60, 61: regionaddv_ptx_61, 62: regionaddv_ptx_62, 70: regionaddv_ptx_70, 72: regionaddv_ptx_72, 75: regionaddv_ptx_75} // regionaddv PTX code for various compute capabilities. 
const ( regionaddv_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 regionaddv_param_0, .param .u64 regionaddv_param_1, .param .u64 regionaddv_param_2, .param .u64 regionaddv_param_3, .param .u64 regionaddv_param_4, .param .u64 regionaddv_param_5, .param .u64 regionaddv_param_6, .param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .f32 %f<10>; .reg .b32 %r<10>; .reg .b64 %rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, [regionaddv_param_2]; ld.param.u64 %rd4, [regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; cvta.to.global.u64 %rd11, %rd4; ld.global.u8 %r9, [%rd10]; mul.wide.u32 %rd12, %r9, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; BB0_2: ret; } ` regionaddv_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 
regionaddv_param_0, .param .u64 regionaddv_param_1, .param .u64 regionaddv_param_2, .param .u64 regionaddv_param_3, .param .u64 regionaddv_param_4, .param .u64 regionaddv_param_5, .param .u64 regionaddv_param_6, .param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; .reg .b64 %rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, [regionaddv_param_2]; ld.param.u64 %rd4, [regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; BB0_2: ret; } ` regionaddv_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 regionaddv_param_0, .param .u64 regionaddv_param_1, .param 
.u64 regionaddv_param_2, .param .u64 regionaddv_param_3, .param .u64 regionaddv_param_4, .param .u64 regionaddv_param_5, .param .u64 regionaddv_param_6, .param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; .reg .b64 %rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, [regionaddv_param_2]; ld.param.u64 %rd4, [regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; BB0_2: ret; } ` regionaddv_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 regionaddv_param_0, .param .u64 regionaddv_param_1, .param .u64 regionaddv_param_2, .param .u64 regionaddv_param_3, 
.param .u64 regionaddv_param_4, .param .u64 regionaddv_param_5, .param .u64 regionaddv_param_6, .param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; .reg .b64 %rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, [regionaddv_param_2]; ld.param.u64 %rd4, [regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; BB0_2: ret; } ` regionaddv_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 regionaddv_param_0, .param .u64 regionaddv_param_1, .param .u64 regionaddv_param_2, .param .u64 regionaddv_param_3, .param .u64 regionaddv_param_4, .param .u64 
regionaddv_param_5, .param .u64 regionaddv_param_6, .param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; .reg .b64 %rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, [regionaddv_param_2]; ld.param.u64 %rd4, [regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; BB0_2: ret; } ` regionaddv_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 regionaddv_param_0, .param .u64 regionaddv_param_1, .param .u64 regionaddv_param_2, .param .u64 regionaddv_param_3, .param .u64 regionaddv_param_4, .param .u64 regionaddv_param_5, .param .u64 regionaddv_param_6, .param 
.u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; .reg .b64 %rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, [regionaddv_param_2]; ld.param.u64 %rd4, [regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; BB0_2: ret; } ` regionaddv_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 regionaddv_param_0, .param .u64 regionaddv_param_1, .param .u64 regionaddv_param_2, .param .u64 regionaddv_param_3, .param .u64 regionaddv_param_4, .param .u64 regionaddv_param_5, .param .u64 regionaddv_param_6, .param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .b16 
%rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; .reg .b64 %rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, [regionaddv_param_2]; ld.param.u64 %rd4, [regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; BB0_2: ret; } ` regionaddv_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 regionaddv_param_0, .param .u64 regionaddv_param_1, .param .u64 regionaddv_param_2, .param .u64 regionaddv_param_3, .param .u64 regionaddv_param_4, .param .u64 regionaddv_param_5, .param .u64 regionaddv_param_6, .param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; .reg .b64 
%rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, [regionaddv_param_2]; ld.param.u64 %rd4, [regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; BB0_2: ret; } ` regionaddv_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 regionaddv_param_0, .param .u64 regionaddv_param_1, .param .u64 regionaddv_param_2, .param .u64 regionaddv_param_3, .param .u64 regionaddv_param_4, .param .u64 regionaddv_param_5, .param .u64 regionaddv_param_6, .param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; .reg .b64 %rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; 
ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, [regionaddv_param_2]; ld.param.u64 %rd4, [regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; BB0_2: ret; } ` regionaddv_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 regionaddv_param_0, .param .u64 regionaddv_param_1, .param .u64 regionaddv_param_2, .param .u64 regionaddv_param_3, .param .u64 regionaddv_param_4, .param .u64 regionaddv_param_5, .param .u64 regionaddv_param_6, .param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; .reg .b64 %rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, 
[regionaddv_param_2]; ld.param.u64 %rd4, [regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; BB0_2: ret; } ` regionaddv_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 regionaddv_param_0, .param .u64 regionaddv_param_1, .param .u64 regionaddv_param_2, .param .u64 regionaddv_param_3, .param .u64 regionaddv_param_4, .param .u64 regionaddv_param_5, .param .u64 regionaddv_param_6, .param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; .reg .b64 %rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, [regionaddv_param_2]; ld.param.u64 %rd4, 
[regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; BB0_2: ret; } ` regionaddv_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 regionaddv_param_0, .param .u64 regionaddv_param_1, .param .u64 regionaddv_param_2, .param .u64 regionaddv_param_3, .param .u64 regionaddv_param_4, .param .u64 regionaddv_param_5, .param .u64 regionaddv_param_6, .param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; .reg .b64 %rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, [regionaddv_param_2]; ld.param.u64 %rd4, [regionaddv_param_3]; ld.param.u64 %rd5, 
[regionaddv_param_4]; ld.param.u64 %rd6, [regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; BB0_2: ret; } ` regionaddv_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl regionaddv .visible .entry regionaddv( .param .u64 regionaddv_param_0, .param .u64 regionaddv_param_1, .param .u64 regionaddv_param_2, .param .u64 regionaddv_param_3, .param .u64 regionaddv_param_4, .param .u64 regionaddv_param_5, .param .u64 regionaddv_param_6, .param .u32 regionaddv_param_7 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<10>; .reg .b32 %r<11>; .reg .b64 %rd<25>; ld.param.u64 %rd1, [regionaddv_param_0]; ld.param.u64 %rd2, [regionaddv_param_1]; ld.param.u64 %rd3, [regionaddv_param_2]; ld.param.u64 %rd4, [regionaddv_param_3]; ld.param.u64 %rd5, [regionaddv_param_4]; ld.param.u64 %rd6, 
[regionaddv_param_5]; ld.param.u64 %rd7, [regionaddv_param_6]; ld.param.u32 %r2, [regionaddv_param_7]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd8, %rd7; cvt.s64.s32 %rd9, %r1; add.s64 %rd10, %rd8, %rd9; ld.global.nc.u8 %rs1, [%rd10]; cvta.to.global.u64 %rd11, %rd4; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd12, %r10, 4; add.s64 %rd13, %rd11, %rd12; cvta.to.global.u64 %rd14, %rd1; mul.wide.s32 %rd15, %r1, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f1, [%rd16]; ld.global.nc.f32 %f2, [%rd13]; add.f32 %f3, %f2, %f1; st.global.f32 [%rd16], %f3; cvta.to.global.u64 %rd17, %rd5; add.s64 %rd18, %rd17, %rd12; cvta.to.global.u64 %rd19, %rd2; add.s64 %rd20, %rd19, %rd15; ld.global.f32 %f4, [%rd20]; ld.global.nc.f32 %f5, [%rd18]; add.f32 %f6, %f5, %f4; st.global.f32 [%rd20], %f6; cvta.to.global.u64 %rd21, %rd6; add.s64 %rd22, %rd21, %rd12; cvta.to.global.u64 %rd23, %rd3; add.s64 %rd24, %rd23, %rd15; ld.global.f32 %f7, [%rd24]; ld.global.nc.f32 %f8, [%rd22]; add.f32 %f9, %f8, %f7; st.global.f32 [%rd24], %f9; BB0_2: ret; } ` ) mumax3-3.10/cuda/regiondecode.cu000066400000000000000000000005231371432437400165530ustar00rootroot00000000000000#include // decode the regions+LUT pair into an uncompressed array extern "C" __global__ void regiondecode(float* __restrict__ dst, float* __restrict__ LUT, uint8_t* regions, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { dst[i] = LUT[regions[i]]; } } mumax3-3.10/cuda/regiondecode_wrapper.go000066400000000000000000000432141371432437400203150ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for regiondecode kernel var regiondecode_code cu.Function // Stores the arguments for regiondecode kernel invocation type regiondecode_args_t struct { arg_dst unsafe.Pointer arg_LUT unsafe.Pointer arg_regions unsafe.Pointer arg_N int argptr [4]unsafe.Pointer sync.Mutex } // Stores the arguments for regiondecode kernel invocation var regiondecode_args regiondecode_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. regiondecode_args.argptr[0] = unsafe.Pointer(®iondecode_args.arg_dst) regiondecode_args.argptr[1] = unsafe.Pointer(®iondecode_args.arg_LUT) regiondecode_args.argptr[2] = unsafe.Pointer(®iondecode_args.arg_regions) regiondecode_args.argptr[3] = unsafe.Pointer(®iondecode_args.arg_N) } // Wrapper for regiondecode CUDA kernel, asynchronous. func k_regiondecode_async(dst unsafe.Pointer, LUT unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("regiondecode") } regiondecode_args.Lock() defer regiondecode_args.Unlock() if regiondecode_code == 0 { regiondecode_code = fatbinLoad(regiondecode_map, "regiondecode") } regiondecode_args.arg_dst = dst regiondecode_args.arg_LUT = LUT regiondecode_args.arg_regions = regions regiondecode_args.arg_N = N args := regiondecode_args.argptr[:] cu.LaunchKernel(regiondecode_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("regiondecode") } } // maps compute capability on PTX code for regiondecode kernel. 
var regiondecode_map = map[int]string{0: "", 30: regiondecode_ptx_30, 32: regiondecode_ptx_32, 35: regiondecode_ptx_35, 37: regiondecode_ptx_37, 50: regiondecode_ptx_50, 52: regiondecode_ptx_52, 53: regiondecode_ptx_53, 60: regiondecode_ptx_60, 61: regiondecode_ptx_61, 62: regiondecode_ptx_62, 70: regiondecode_ptx_70, 72: regiondecode_ptx_72, 75: regiondecode_ptx_75} // regiondecode PTX code for various compute capabilities. const ( regiondecode_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .f32 %f<2>; .reg .b32 %r<10>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; cvta.to.global.u64 %rd7, %rd2; ld.global.u8 %r9, [%rd6]; mul.wide.u32 %rd8, %r9, 4; add.s64 %rd9, %rd7, %rd8; ld.global.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; BB0_2: ret; } ` regiondecode_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 
%r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; BB0_2: ret; } ` regiondecode_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; BB0_2: ret; } ` regiondecode_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 
regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; BB0_2: ret; } ` regiondecode_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 
%rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; BB0_2: ret; } ` regiondecode_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; BB0_2: ret; } ` regiondecode_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, 
%rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; BB0_2: ret; } ` regiondecode_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; BB0_2: ret; } ` regiondecode_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, 
[regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; BB0_2: ret; } ` regiondecode_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; BB0_2: ret; } ` regiondecode_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 
regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; BB0_2: ret; } ` regiondecode_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; 
cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; BB0_2: ret; } ` regiondecode_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl regiondecode .visible .entry regiondecode( .param .u64 regiondecode_param_0, .param .u64 regiondecode_param_1, .param .u64 regiondecode_param_2, .param .u32 regiondecode_param_3 ) { .reg .pred %p<2>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regiondecode_param_0]; ld.param.u64 %rd2, [regiondecode_param_1]; ld.param.u64 %rd3, [regiondecode_param_2]; ld.param.u32 %r2, [regiondecode_param_3]; mov.u32 %r3, %ctaid.y; mov.u32 %r4, %nctaid.x; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r4, %r3, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_2; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd2; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f1; BB0_2: ret; } ` ) mumax3-3.10/cuda/regionselect.cu000066400000000000000000000004731371432437400166130ustar00rootroot00000000000000#include extern "C" __global__ void regionselect(float* __restrict__ dst, float* __restrict__ src, uint8_t* regions, uint8_t region, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { dst[i] = (regions[i] == region? src[i]: 0.0f); } } mumax3-3.10/cuda/regionselect_wrapper.go000066400000000000000000000463611371432437400203570ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for regionselect kernel var regionselect_code cu.Function // Stores the arguments for regionselect kernel invocation type regionselect_args_t struct { arg_dst unsafe.Pointer arg_src unsafe.Pointer arg_regions unsafe.Pointer arg_region byte arg_N int argptr [5]unsafe.Pointer sync.Mutex } // Stores the arguments for regionselect kernel invocation var regionselect_args regionselect_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. regionselect_args.argptr[0] = unsafe.Pointer(®ionselect_args.arg_dst) regionselect_args.argptr[1] = unsafe.Pointer(®ionselect_args.arg_src) regionselect_args.argptr[2] = unsafe.Pointer(®ionselect_args.arg_regions) regionselect_args.argptr[3] = unsafe.Pointer(®ionselect_args.arg_region) regionselect_args.argptr[4] = unsafe.Pointer(®ionselect_args.arg_N) } // Wrapper for regionselect CUDA kernel, asynchronous. func k_regionselect_async(dst unsafe.Pointer, src unsafe.Pointer, regions unsafe.Pointer, region byte, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("regionselect") } regionselect_args.Lock() defer regionselect_args.Unlock() if regionselect_code == 0 { regionselect_code = fatbinLoad(regionselect_map, "regionselect") } regionselect_args.arg_dst = dst regionselect_args.arg_src = src regionselect_args.arg_regions = regions regionselect_args.arg_region = region regionselect_args.arg_N = N args := regionselect_args.argptr[:] cu.LaunchKernel(regionselect_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("regionselect") } } // maps compute capability on PTX code for regionselect kernel. 
var regionselect_map = map[int]string{0: "", 30: regionselect_ptx_30, 32: regionselect_ptx_32, 35: regionselect_ptx_35, 37: regionselect_ptx_37, 50: regionselect_ptx_50, 52: regionselect_ptx_52, 53: regionselect_ptx_53, 60: regionselect_ptx_60, 61: regionselect_ptx_61, 62: regionselect_ptx_62, 70: regionselect_ptx_70, 72: regionselect_ptx_72, 75: regionselect_ptx_75} // regionselect PTX code for various compute capabilities. const ( regionselect_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 %rs<3>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionselect_param_0]; ld.param.u64 %rd2, [regionselect_param_1]; ld.param.u64 %rd3, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; ld.param.u8 %rs1, [regionselect_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.u8 %rs2, [%rd6]; mov.f32 %f4, 0f00000000; setp.ne.s16 %p2, %rs2, %rs1; @%p2 bra BB0_3; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.f32 %f4, [%rd9]; BB0_3: cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; BB0_4: ret; } ` regionselect_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 
%rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionselect_param_0]; ld.param.u64 %rd2, [regionselect_param_1]; ld.param.u64 %rd3, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; ld.param.u8 %rs1, [regionselect_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs2, [%rd6]; mov.f32 %f4, 0f00000000; setp.ne.s16 %p2, %rs2, %rs1; @%p2 bra BB0_3; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; BB0_3: cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; BB0_4: ret; } ` regionselect_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 %rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionselect_param_0]; ld.param.u64 %rd2, [regionselect_param_1]; ld.param.u64 %rd3, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; ld.param.u8 %rs1, [regionselect_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs2, [%rd6]; mov.f32 %f4, 0f00000000; setp.ne.s16 %p2, %rs2, %rs1; @%p2 bra BB0_3; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, 
%rd8; ld.global.nc.f32 %f4, [%rd9]; BB0_3: cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; BB0_4: ret; } ` regionselect_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 %rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionselect_param_0]; ld.param.u64 %rd2, [regionselect_param_1]; ld.param.u64 %rd3, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; ld.param.u8 %rs1, [regionselect_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs2, [%rd6]; mov.f32 %f4, 0f00000000; setp.ne.s16 %p2, %rs2, %rs1; @%p2 bra BB0_3; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; BB0_3: cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; BB0_4: ret; } ` regionselect_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 %rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionselect_param_0]; ld.param.u64 %rd2, [regionselect_param_1]; ld.param.u64 %rd3, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; ld.param.u8 %rs1, 
[regionselect_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs2, [%rd6]; mov.f32 %f4, 0f00000000; setp.ne.s16 %p2, %rs2, %rs1; @%p2 bra BB0_3; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; BB0_3: cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; BB0_4: ret; } ` regionselect_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 %rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionselect_param_0]; ld.param.u64 %rd2, [regionselect_param_1]; ld.param.u64 %rd3, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; ld.param.u8 %rs1, [regionselect_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs2, [%rd6]; mov.f32 %f4, 0f00000000; setp.ne.s16 %p2, %rs2, %rs1; @%p2 bra BB0_3; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; BB0_3: cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; BB0_4: ret; } ` regionselect_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // 
.globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 %rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionselect_param_0]; ld.param.u64 %rd2, [regionselect_param_1]; ld.param.u64 %rd3, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; ld.param.u8 %rs1, [regionselect_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs2, [%rd6]; mov.f32 %f4, 0f00000000; setp.ne.s16 %p2, %rs2, %rs1; @%p2 bra BB0_3; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; BB0_3: cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; BB0_4: ret; } ` regionselect_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 %rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionselect_param_0]; ld.param.u64 %rd2, [regionselect_param_1]; ld.param.u64 %rd3, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; ld.param.u8 %rs1, [regionselect_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; 
cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs2, [%rd6]; mov.f32 %f4, 0f00000000; setp.ne.s16 %p2, %rs2, %rs1; @%p2 bra BB0_3; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; BB0_3: cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; BB0_4: ret; } ` regionselect_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 %rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionselect_param_0]; ld.param.u64 %rd2, [regionselect_param_1]; ld.param.u64 %rd3, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; ld.param.u8 %rs1, [regionselect_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs2, [%rd6]; mov.f32 %f4, 0f00000000; setp.ne.s16 %p2, %rs2, %rs1; @%p2 bra BB0_3; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; BB0_3: cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; BB0_4: ret; } ` regionselect_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg 
.b16 %rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionselect_param_0]; ld.param.u64 %rd2, [regionselect_param_1]; ld.param.u64 %rd3, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; ld.param.u8 %rs1, [regionselect_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs2, [%rd6]; mov.f32 %f4, 0f00000000; setp.ne.s16 %p2, %rs2, %rs1; @%p2 bra BB0_3; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; BB0_3: cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; BB0_4: ret; } ` regionselect_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 %rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionselect_param_0]; ld.param.u64 %rd2, [regionselect_param_1]; ld.param.u64 %rd3, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; ld.param.u8 %rs1, [regionselect_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs2, [%rd6]; mov.f32 %f4, 0f00000000; setp.ne.s16 %p2, %rs2, %rs1; @%p2 bra BB0_3; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, 
%rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; BB0_3: cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; BB0_4: ret; } ` regionselect_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 %rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionselect_param_0]; ld.param.u64 %rd2, [regionselect_param_1]; ld.param.u64 %rd3, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; ld.param.u8 %rs1, [regionselect_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs2, [%rd6]; mov.f32 %f4, 0f00000000; setp.ne.s16 %p2, %rs2, %rs1; @%p2 bra BB0_3; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; BB0_3: cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; BB0_4: ret; } ` regionselect_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl regionselect .visible .entry regionselect( .param .u64 regionselect_param_0, .param .u64 regionselect_param_1, .param .u64 regionselect_param_2, .param .u8 regionselect_param_3, .param .u32 regionselect_param_4 ) { .reg .pred %p<3>; .reg .b16 %rs<4>; .reg .f32 %f<5>; .reg .b32 %r<9>; .reg .b64 %rd<13>; ld.param.u64 %rd1, [regionselect_param_0]; ld.param.u64 %rd2, [regionselect_param_1]; ld.param.u64 %rd3, [regionselect_param_2]; ld.param.u32 %r2, [regionselect_param_4]; ld.param.u8 %rs1, 
[regionselect_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_4; cvta.to.global.u64 %rd4, %rd3; cvt.s64.s32 %rd5, %r1; add.s64 %rd6, %rd4, %rd5; ld.global.nc.u8 %rs2, [%rd6]; mov.f32 %f4, 0f00000000; setp.ne.s16 %p2, %rs2, %rs1; @%p2 bra BB0_3; cvta.to.global.u64 %rd7, %rd2; mul.wide.s32 %rd8, %r1, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f4, [%rd9]; BB0_3: cvta.to.global.u64 %rd10, %rd1; mul.wide.s32 %rd11, %r1, 4; add.s64 %rd12, %rd10, %rd11; st.global.f32 [%rd12], %f4; BB0_4: ret; } ` ) mumax3-3.10/cuda/resize.cu000066400000000000000000000014321371432437400154250ustar00rootroot00000000000000 // Select and resize one layer for interactive output extern "C" __global__ void resize(float* __restrict__ dst, int Dx, int Dy, int Dz, float* __restrict__ src, int Sx, int Sy, int Sz, int layer, int scalex, int scaley) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; if (ix 0 && scaley > 0) cfg := make3DConf(dstsize) k_resize_async(dst.DevPtr(0), dstsize[X], dstsize[Y], dstsize[Z], src.DevPtr(0), srcsize[X], srcsize[Y], srcsize[Z], layer, scalex, scaley, cfg) } mumax3-3.10/cuda/resize_wrapper.go000066400000000000000000001726111371432437400171730ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for resize kernel var resize_code cu.Function // Stores the arguments for resize kernel invocation type resize_args_t struct { arg_dst unsafe.Pointer arg_Dx int arg_Dy int arg_Dz int arg_src unsafe.Pointer arg_Sx int arg_Sy int arg_Sz int arg_layer int arg_scalex int arg_scaley int argptr [11]unsafe.Pointer sync.Mutex } // Stores the arguments for resize kernel invocation var resize_args resize_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. resize_args.argptr[0] = unsafe.Pointer(&resize_args.arg_dst) resize_args.argptr[1] = unsafe.Pointer(&resize_args.arg_Dx) resize_args.argptr[2] = unsafe.Pointer(&resize_args.arg_Dy) resize_args.argptr[3] = unsafe.Pointer(&resize_args.arg_Dz) resize_args.argptr[4] = unsafe.Pointer(&resize_args.arg_src) resize_args.argptr[5] = unsafe.Pointer(&resize_args.arg_Sx) resize_args.argptr[6] = unsafe.Pointer(&resize_args.arg_Sy) resize_args.argptr[7] = unsafe.Pointer(&resize_args.arg_Sz) resize_args.argptr[8] = unsafe.Pointer(&resize_args.arg_layer) resize_args.argptr[9] = unsafe.Pointer(&resize_args.arg_scalex) resize_args.argptr[10] = unsafe.Pointer(&resize_args.arg_scaley) } // Wrapper for resize CUDA kernel, asynchronous. 
func k_resize_async(dst unsafe.Pointer, Dx int, Dy int, Dz int, src unsafe.Pointer, Sx int, Sy int, Sz int, layer int, scalex int, scaley int, cfg *config) { if Synchronous { // debug Sync() timer.Start("resize") } resize_args.Lock() defer resize_args.Unlock() if resize_code == 0 { resize_code = fatbinLoad(resize_map, "resize") } resize_args.arg_dst = dst resize_args.arg_Dx = Dx resize_args.arg_Dy = Dy resize_args.arg_Dz = Dz resize_args.arg_src = src resize_args.arg_Sx = Sx resize_args.arg_Sy = Sy resize_args.arg_Sz = Sz resize_args.arg_layer = layer resize_args.arg_scalex = scalex resize_args.arg_scaley = scaley args := resize_args.argptr[:] cu.LaunchKernel(resize_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("resize") } } // maps compute capability on PTX code for resize kernel. var resize_map = map[int]string{0: "", 30: resize_ptx_30, 32: resize_ptx_32, 35: resize_ptx_35, 37: resize_ptx_37, 50: resize_ptx_50, 52: resize_ptx_52, 53: resize_ptx_53, 60: resize_ptx_60, 61: resize_ptx_61, 62: resize_ptx_62, 70: resize_ptx_70, 72: resize_ptx_72, 75: resize_ptx_75} // resize PTX code for various compute capabilities. 
const ( resize_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; .reg .f32 %f<85>; .reg .b32 %r<88>; .reg .b64 %rd<21>; ld.param.u64 %rd4, [resize_param_0]; ld.param.u32 %r17, [resize_param_1]; ld.param.u32 %r23, [resize_param_2]; ld.param.u64 %rd5, [resize_param_4]; ld.param.u32 %r18, [resize_param_5]; ld.param.u32 %r19, [resize_param_6]; ld.param.u32 %r20, [resize_param_8]; ld.param.u32 %r21, [resize_param_9]; ld.param.u32 %r22, [resize_param_10]; mov.u32 %r24, %ctaid.x; mov.u32 %r25, %ntid.x; mov.u32 %r26, %tid.x; mad.lo.s32 %r27, %r25, %r24, %r26; mov.u32 %r28, %ntid.y; mov.u32 %r29, %ctaid.y; mov.u32 %r30, %tid.y; mad.lo.s32 %r31, %r28, %r29, %r30; setp.ge.s32 %p1, %r27, %r17; setp.ge.s32 %p2, %r31, %r23; or.pred %p3, %p1, %p2; @%p3 bra BB0_30; mov.f32 %f71, 0f00000000; setp.lt.s32 %p4, %r22, 1; mov.f32 %f72, %f71; @%p4 bra BB0_29; mov.f32 %f71, 0f00000000; mov.u32 %r82, 0; mov.f32 %f72, %f71; BB0_3: mad.lo.s32 %r2, %r31, %r22, %r82; setp.lt.s32 %p5, %r21, 1; @%p5 bra BB0_28; mad.lo.s32 %r41, %r20, %r19, %r2; mul.lo.s32 %r3, %r41, %r18; and.b32 %r40, %r21, 3; mov.u32 %r83, 0; mov.f32 %f48, 0f00000000; setp.eq.s32 %p6, %r40, 0; @%p6 bra BB0_5; setp.eq.s32 %p7, %r40, 1; @%p7 bra BB0_13; setp.eq.s32 %p8, %r40, 2; @%p8 bra BB0_10; setp.ge.s32 %p9, %r2, %r19; mul.lo.s32 %r47, %r27, %r21; setp.ge.s32 %p10, %r47, %r18; mov.u32 %r83, 1; or.pred %p11, %p9, %p10; @%p11 bra BB0_10; mad.lo.s32 %r53, %r27, %r21, %r3; cvta.to.global.u64 %rd6, %rd5; mul.wide.s32 %rd7, %r53, 4; add.s64 %rd8, %rd6, %rd7; ld.global.f32 %f49, [%rd8]; add.f32 %f72, %f72, %f49; add.f32 %f71, %f71, 0f3F800000; BB0_10: mad.lo.s32 
%r5, %r27, %r21, %r83; setp.ge.s32 %p12, %r5, %r18; setp.ge.s32 %p13, %r2, %r19; or.pred %p14, %p13, %p12; @%p14 bra BB0_12; add.s32 %r58, %r5, %r3; cvta.to.global.u64 %rd9, %rd5; mul.wide.s32 %rd10, %r58, 4; add.s64 %rd11, %rd9, %rd10; ld.global.f32 %f50, [%rd11]; add.f32 %f72, %f72, %f50; add.f32 %f71, %f71, 0f3F800000; BB0_12: add.s32 %r83, %r83, 1; BB0_13: mad.lo.s32 %r8, %r27, %r21, %r83; setp.ge.s32 %p15, %r8, %r18; setp.ge.s32 %p16, %r2, %r19; or.pred %p17, %p16, %p15; @%p17 bra BB0_14; add.s32 %r63, %r8, %r3; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r63, 4; add.s64 %rd14, %rd12, %rd13; ld.global.f32 %f51, [%rd14]; add.f32 %f68, %f72, %f51; add.f32 %f67, %f71, 0f3F800000; bra.uni BB0_16; BB0_5: mov.f32 %f67, %f71; mov.f32 %f68, %f72; mov.f32 %f71, %f48; mov.f32 %f72, %f48; bra.uni BB0_17; BB0_14: mov.f32 %f68, %f72; mov.f32 %f67, %f71; BB0_16: add.s32 %r83, %r83, 1; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_17: setp.lt.u32 %p18, %r21, 4; @%p18 bra BB0_28; mad.lo.s32 %r86, %r27, %r21, %r83; mad.lo.s32 %r69, %r18, %r41, %r86; cvta.to.global.u64 %rd15, %rd5; mul.wide.s32 %rd16, %r69, 4; add.s64 %rd20, %rd15, %rd16; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_19: setp.ge.s32 %p19, %r86, %r18; setp.ge.s32 %p20, %r2, %r19; or.pred %p21, %p20, %p19; @%p21 bra BB0_21; ld.global.f32 %f52, [%rd20]; add.f32 %f72, %f72, %f52; add.f32 %f71, %f71, 0f3F800000; BB0_21: add.s32 %r70, %r86, 1; setp.ge.s32 %p22, %r70, %r18; or.pred %p24, %p20, %p22; @%p24 bra BB0_23; ld.global.f32 %f53, [%rd20+4]; add.f32 %f72, %f72, %f53; add.f32 %f71, %f71, 0f3F800000; BB0_23: add.s32 %r71, %r86, 2; setp.ge.s32 %p25, %r71, %r18; or.pred %p27, %p20, %p25; @%p27 bra BB0_25; ld.global.f32 %f54, [%rd20+8]; add.f32 %f72, %f72, %f54; add.f32 %f71, %f71, 0f3F800000; BB0_25: add.s32 %r72, %r86, 3; setp.ge.s32 %p28, %r72, %r18; or.pred %p30, %p20, %p28; @%p30 bra BB0_27; ld.global.f32 %f55, [%rd20+12]; add.f32 %f72, %f72, %f55; add.f32 %f71, %f71, 0f3F800000; BB0_27: add.s32 %r86, %r86, 
4; add.s32 %r83, %r83, 4; setp.lt.s32 %p31, %r83, %r21; add.s64 %rd20, %rd20, 16; @%p31 bra BB0_19; BB0_28: add.s32 %r82, %r82, 1; setp.lt.s32 %p32, %r82, %r22; @%p32 bra BB0_3; BB0_29: mad.lo.s32 %r81, %r31, %r17, %r27; cvta.to.global.u64 %rd17, %rd4; mul.wide.s32 %rd18, %r81, 4; add.s64 %rd19, %rd17, %rd18; div.rn.f32 %f56, %f72, %f71; st.global.f32 [%rd19], %f56; BB0_30: ret; } ` resize_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; .reg .f32 %f<85>; .reg .b32 %r<58>; .reg .b64 %rd<18>; ld.param.u64 %rd5, [resize_param_0]; ld.param.u32 %r23, [resize_param_1]; ld.param.u32 %r29, [resize_param_2]; ld.param.u64 %rd6, [resize_param_4]; ld.param.u32 %r24, [resize_param_5]; ld.param.u32 %r25, [resize_param_6]; ld.param.u32 %r26, [resize_param_8]; ld.param.u32 %r27, [resize_param_9]; ld.param.u32 %r28, [resize_param_10]; cvta.to.global.u64 %rd1, %rd6; mov.u32 %r30, %ntid.x; mov.u32 %r31, %ctaid.x; mov.u32 %r32, %tid.x; mad.lo.s32 %r1, %r30, %r31, %r32; mov.u32 %r33, %ntid.y; mov.u32 %r34, %ctaid.y; mov.u32 %r35, %tid.y; mad.lo.s32 %r2, %r33, %r34, %r35; setp.ge.s32 %p1, %r1, %r23; setp.ge.s32 %p2, %r2, %r29; or.pred %p3, %p1, %p2; @%p3 bra BB0_30; mov.f32 %f71, 0f00000000; setp.lt.s32 %p4, %r28, 1; mov.f32 %f72, %f71; @%p4 bra BB0_29; mul.lo.s32 %r3, %r2, %r28; mul.lo.s32 %r4, %r1, %r27; mul.lo.s32 %r5, %r26, %r25; and.b32 %r6, %r27, 3; mov.f32 %f71, 0f00000000; mov.u32 %r52, 0; mov.f32 %f72, %f71; BB0_3: add.s32 %r8, %r52, %r3; setp.lt.s32 %p5, %r27, 1; @%p5 bra BB0_28; add.s32 %r40, %r8, %r5; mul.lo.s32 %r9, %r40, %r24; mov.u32 %r53, 0; mov.f32 %f48, 0f00000000; setp.eq.s32 %p6, %r6, 0; @%p6 
bra BB0_5; setp.eq.s32 %p7, %r6, 1; @%p7 bra BB0_13; setp.eq.s32 %p8, %r6, 2; @%p8 bra BB0_10; setp.ge.s32 %p9, %r8, %r25; setp.ge.s32 %p10, %r4, %r24; mov.u32 %r53, 1; or.pred %p11, %p9, %p10; @%p11 bra BB0_10; add.s32 %r43, %r4, %r9; mul.wide.s32 %rd7, %r43, 4; add.s64 %rd8, %rd1, %rd7; ld.global.nc.f32 %f49, [%rd8]; add.f32 %f72, %f72, %f49; add.f32 %f71, %f71, 0f3F800000; BB0_10: add.s32 %r11, %r53, %r4; setp.ge.s32 %p12, %r11, %r24; setp.ge.s32 %p13, %r8, %r25; or.pred %p14, %p13, %p12; @%p14 bra BB0_12; add.s32 %r44, %r11, %r9; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd1, %rd9; ld.global.nc.f32 %f50, [%rd10]; add.f32 %f72, %f72, %f50; add.f32 %f71, %f71, 0f3F800000; BB0_12: add.s32 %r53, %r53, 1; BB0_13: add.s32 %r14, %r53, %r4; setp.ge.s32 %p15, %r14, %r24; setp.ge.s32 %p16, %r8, %r25; or.pred %p17, %p16, %p15; @%p17 bra BB0_14; add.s32 %r45, %r14, %r9; mul.wide.s32 %rd11, %r45, 4; add.s64 %rd12, %rd1, %rd11; ld.global.nc.f32 %f51, [%rd12]; add.f32 %f68, %f72, %f51; add.f32 %f67, %f71, 0f3F800000; bra.uni BB0_16; BB0_5: mov.f32 %f67, %f71; mov.f32 %f68, %f72; mov.f32 %f71, %f48; mov.f32 %f72, %f48; bra.uni BB0_17; BB0_14: mov.f32 %f68, %f72; mov.f32 %f67, %f71; BB0_16: add.s32 %r53, %r53, 1; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_17: setp.lt.u32 %p18, %r27, 4; @%p18 bra BB0_28; add.s32 %r56, %r4, %r53; mad.lo.s32 %r47, %r24, %r40, %r56; mul.wide.s32 %rd13, %r47, 4; add.s64 %rd17, %rd1, %rd13; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_19: setp.ge.s32 %p19, %r56, %r24; setp.ge.s32 %p20, %r8, %r25; or.pred %p21, %p20, %p19; @%p21 bra BB0_21; ld.global.nc.f32 %f52, [%rd17]; add.f32 %f72, %f72, %f52; add.f32 %f71, %f71, 0f3F800000; BB0_21: add.s32 %r48, %r56, 1; setp.ge.s32 %p22, %r48, %r24; or.pred %p24, %p20, %p22; @%p24 bra BB0_23; ld.global.nc.f32 %f53, [%rd17+4]; add.f32 %f72, %f72, %f53; add.f32 %f71, %f71, 0f3F800000; BB0_23: add.s32 %r49, %r56, 2; setp.ge.s32 %p25, %r49, %r24; or.pred %p27, %p20, %p25; @%p27 bra BB0_25; ld.global.nc.f32 %f54, 
[%rd17+8]; add.f32 %f72, %f72, %f54; add.f32 %f71, %f71, 0f3F800000; BB0_25: add.s32 %r50, %r56, 3; setp.ge.s32 %p28, %r50, %r24; or.pred %p30, %p20, %p28; @%p30 bra BB0_27; ld.global.nc.f32 %f55, [%rd17+12]; add.f32 %f72, %f72, %f55; add.f32 %f71, %f71, 0f3F800000; BB0_27: add.s32 %r56, %r56, 4; add.s32 %r53, %r53, 4; setp.lt.s32 %p31, %r53, %r27; add.s64 %rd17, %rd17, 16; @%p31 bra BB0_19; BB0_28: add.s32 %r52, %r52, 1; setp.lt.s32 %p32, %r52, %r28; @%p32 bra BB0_3; BB0_29: cvta.to.global.u64 %rd14, %rd5; mad.lo.s32 %r51, %r2, %r23, %r1; mul.wide.s32 %rd15, %r51, 4; add.s64 %rd16, %rd14, %rd15; div.rn.f32 %f56, %f72, %f71; st.global.f32 [%rd16], %f56; BB0_30: ret; } ` resize_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; .reg .f32 %f<85>; .reg .b32 %r<58>; .reg .b64 %rd<18>; ld.param.u64 %rd5, [resize_param_0]; ld.param.u32 %r23, [resize_param_1]; ld.param.u32 %r29, [resize_param_2]; ld.param.u64 %rd6, [resize_param_4]; ld.param.u32 %r24, [resize_param_5]; ld.param.u32 %r25, [resize_param_6]; ld.param.u32 %r26, [resize_param_8]; ld.param.u32 %r27, [resize_param_9]; ld.param.u32 %r28, [resize_param_10]; cvta.to.global.u64 %rd1, %rd6; mov.u32 %r30, %ntid.x; mov.u32 %r31, %ctaid.x; mov.u32 %r32, %tid.x; mad.lo.s32 %r1, %r30, %r31, %r32; mov.u32 %r33, %ntid.y; mov.u32 %r34, %ctaid.y; mov.u32 %r35, %tid.y; mad.lo.s32 %r2, %r33, %r34, %r35; setp.ge.s32 %p1, %r1, %r23; setp.ge.s32 %p2, %r2, %r29; or.pred %p3, %p1, %p2; @%p3 bra BB0_30; mov.f32 %f71, 0f00000000; setp.lt.s32 %p4, %r28, 1; mov.f32 %f72, %f71; @%p4 bra BB0_29; mul.lo.s32 %r3, %r2, %r28; mul.lo.s32 %r4, %r1, %r27; mul.lo.s32 %r5, %r26, 
%r25; and.b32 %r6, %r27, 3; mov.f32 %f71, 0f00000000; mov.u32 %r52, 0; mov.f32 %f72, %f71; BB0_3: add.s32 %r8, %r52, %r3; setp.lt.s32 %p5, %r27, 1; @%p5 bra BB0_28; add.s32 %r40, %r8, %r5; mul.lo.s32 %r9, %r40, %r24; mov.u32 %r53, 0; mov.f32 %f48, 0f00000000; setp.eq.s32 %p6, %r6, 0; @%p6 bra BB0_5; setp.eq.s32 %p7, %r6, 1; @%p7 bra BB0_13; setp.eq.s32 %p8, %r6, 2; @%p8 bra BB0_10; setp.ge.s32 %p9, %r8, %r25; setp.ge.s32 %p10, %r4, %r24; mov.u32 %r53, 1; or.pred %p11, %p9, %p10; @%p11 bra BB0_10; add.s32 %r43, %r4, %r9; mul.wide.s32 %rd7, %r43, 4; add.s64 %rd8, %rd1, %rd7; ld.global.nc.f32 %f49, [%rd8]; add.f32 %f72, %f72, %f49; add.f32 %f71, %f71, 0f3F800000; BB0_10: add.s32 %r11, %r53, %r4; setp.ge.s32 %p12, %r11, %r24; setp.ge.s32 %p13, %r8, %r25; or.pred %p14, %p13, %p12; @%p14 bra BB0_12; add.s32 %r44, %r11, %r9; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd1, %rd9; ld.global.nc.f32 %f50, [%rd10]; add.f32 %f72, %f72, %f50; add.f32 %f71, %f71, 0f3F800000; BB0_12: add.s32 %r53, %r53, 1; BB0_13: add.s32 %r14, %r53, %r4; setp.ge.s32 %p15, %r14, %r24; setp.ge.s32 %p16, %r8, %r25; or.pred %p17, %p16, %p15; @%p17 bra BB0_14; add.s32 %r45, %r14, %r9; mul.wide.s32 %rd11, %r45, 4; add.s64 %rd12, %rd1, %rd11; ld.global.nc.f32 %f51, [%rd12]; add.f32 %f68, %f72, %f51; add.f32 %f67, %f71, 0f3F800000; bra.uni BB0_16; BB0_5: mov.f32 %f67, %f71; mov.f32 %f68, %f72; mov.f32 %f71, %f48; mov.f32 %f72, %f48; bra.uni BB0_17; BB0_14: mov.f32 %f68, %f72; mov.f32 %f67, %f71; BB0_16: add.s32 %r53, %r53, 1; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_17: setp.lt.u32 %p18, %r27, 4; @%p18 bra BB0_28; add.s32 %r56, %r4, %r53; mad.lo.s32 %r47, %r24, %r40, %r56; mul.wide.s32 %rd13, %r47, 4; add.s64 %rd17, %rd1, %rd13; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_19: setp.ge.s32 %p19, %r56, %r24; setp.ge.s32 %p20, %r8, %r25; or.pred %p21, %p20, %p19; @%p21 bra BB0_21; ld.global.nc.f32 %f52, [%rd17]; add.f32 %f72, %f72, %f52; add.f32 %f71, %f71, 0f3F800000; BB0_21: add.s32 %r48, %r56, 1; 
setp.ge.s32 %p22, %r48, %r24; or.pred %p24, %p20, %p22; @%p24 bra BB0_23; ld.global.nc.f32 %f53, [%rd17+4]; add.f32 %f72, %f72, %f53; add.f32 %f71, %f71, 0f3F800000; BB0_23: add.s32 %r49, %r56, 2; setp.ge.s32 %p25, %r49, %r24; or.pred %p27, %p20, %p25; @%p27 bra BB0_25; ld.global.nc.f32 %f54, [%rd17+8]; add.f32 %f72, %f72, %f54; add.f32 %f71, %f71, 0f3F800000; BB0_25: add.s32 %r50, %r56, 3; setp.ge.s32 %p28, %r50, %r24; or.pred %p30, %p20, %p28; @%p30 bra BB0_27; ld.global.nc.f32 %f55, [%rd17+12]; add.f32 %f72, %f72, %f55; add.f32 %f71, %f71, 0f3F800000; BB0_27: add.s32 %r56, %r56, 4; add.s32 %r53, %r53, 4; setp.lt.s32 %p31, %r53, %r27; add.s64 %rd17, %rd17, 16; @%p31 bra BB0_19; BB0_28: add.s32 %r52, %r52, 1; setp.lt.s32 %p32, %r52, %r28; @%p32 bra BB0_3; BB0_29: cvta.to.global.u64 %rd14, %rd5; mad.lo.s32 %r51, %r2, %r23, %r1; mul.wide.s32 %rd15, %r51, 4; add.s64 %rd16, %rd14, %rd15; div.rn.f32 %f56, %f72, %f71; st.global.f32 [%rd16], %f56; BB0_30: ret; } ` resize_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; .reg .f32 %f<85>; .reg .b32 %r<58>; .reg .b64 %rd<18>; ld.param.u64 %rd5, [resize_param_0]; ld.param.u32 %r23, [resize_param_1]; ld.param.u32 %r29, [resize_param_2]; ld.param.u64 %rd6, [resize_param_4]; ld.param.u32 %r24, [resize_param_5]; ld.param.u32 %r25, [resize_param_6]; ld.param.u32 %r26, [resize_param_8]; ld.param.u32 %r27, [resize_param_9]; ld.param.u32 %r28, [resize_param_10]; cvta.to.global.u64 %rd1, %rd6; mov.u32 %r30, %ntid.x; mov.u32 %r31, %ctaid.x; mov.u32 %r32, %tid.x; mad.lo.s32 %r1, %r30, %r31, %r32; mov.u32 %r33, %ntid.y; mov.u32 %r34, %ctaid.y; mov.u32 %r35, %tid.y; 
mad.lo.s32 %r2, %r33, %r34, %r35; setp.ge.s32 %p1, %r1, %r23; setp.ge.s32 %p2, %r2, %r29; or.pred %p3, %p1, %p2; @%p3 bra BB0_30; mov.f32 %f71, 0f00000000; setp.lt.s32 %p4, %r28, 1; mov.f32 %f72, %f71; @%p4 bra BB0_29; mul.lo.s32 %r3, %r2, %r28; mul.lo.s32 %r4, %r1, %r27; mul.lo.s32 %r5, %r26, %r25; and.b32 %r6, %r27, 3; mov.f32 %f71, 0f00000000; mov.u32 %r52, 0; mov.f32 %f72, %f71; BB0_3: add.s32 %r8, %r52, %r3; setp.lt.s32 %p5, %r27, 1; @%p5 bra BB0_28; add.s32 %r40, %r8, %r5; mul.lo.s32 %r9, %r40, %r24; mov.u32 %r53, 0; mov.f32 %f48, 0f00000000; setp.eq.s32 %p6, %r6, 0; @%p6 bra BB0_5; setp.eq.s32 %p7, %r6, 1; @%p7 bra BB0_13; setp.eq.s32 %p8, %r6, 2; @%p8 bra BB0_10; setp.ge.s32 %p9, %r8, %r25; setp.ge.s32 %p10, %r4, %r24; mov.u32 %r53, 1; or.pred %p11, %p9, %p10; @%p11 bra BB0_10; add.s32 %r43, %r4, %r9; mul.wide.s32 %rd7, %r43, 4; add.s64 %rd8, %rd1, %rd7; ld.global.nc.f32 %f49, [%rd8]; add.f32 %f72, %f72, %f49; add.f32 %f71, %f71, 0f3F800000; BB0_10: add.s32 %r11, %r53, %r4; setp.ge.s32 %p12, %r11, %r24; setp.ge.s32 %p13, %r8, %r25; or.pred %p14, %p13, %p12; @%p14 bra BB0_12; add.s32 %r44, %r11, %r9; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd1, %rd9; ld.global.nc.f32 %f50, [%rd10]; add.f32 %f72, %f72, %f50; add.f32 %f71, %f71, 0f3F800000; BB0_12: add.s32 %r53, %r53, 1; BB0_13: add.s32 %r14, %r53, %r4; setp.ge.s32 %p15, %r14, %r24; setp.ge.s32 %p16, %r8, %r25; or.pred %p17, %p16, %p15; @%p17 bra BB0_14; add.s32 %r45, %r14, %r9; mul.wide.s32 %rd11, %r45, 4; add.s64 %rd12, %rd1, %rd11; ld.global.nc.f32 %f51, [%rd12]; add.f32 %f68, %f72, %f51; add.f32 %f67, %f71, 0f3F800000; bra.uni BB0_16; BB0_5: mov.f32 %f67, %f71; mov.f32 %f68, %f72; mov.f32 %f71, %f48; mov.f32 %f72, %f48; bra.uni BB0_17; BB0_14: mov.f32 %f68, %f72; mov.f32 %f67, %f71; BB0_16: add.s32 %r53, %r53, 1; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_17: setp.lt.u32 %p18, %r27, 4; @%p18 bra BB0_28; add.s32 %r56, %r4, %r53; mad.lo.s32 %r47, %r24, %r40, %r56; mul.wide.s32 %rd13, %r47, 4; add.s64 
%rd17, %rd1, %rd13; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_19: setp.ge.s32 %p19, %r56, %r24; setp.ge.s32 %p20, %r8, %r25; or.pred %p21, %p20, %p19; @%p21 bra BB0_21; ld.global.nc.f32 %f52, [%rd17]; add.f32 %f72, %f72, %f52; add.f32 %f71, %f71, 0f3F800000; BB0_21: add.s32 %r48, %r56, 1; setp.ge.s32 %p22, %r48, %r24; or.pred %p24, %p20, %p22; @%p24 bra BB0_23; ld.global.nc.f32 %f53, [%rd17+4]; add.f32 %f72, %f72, %f53; add.f32 %f71, %f71, 0f3F800000; BB0_23: add.s32 %r49, %r56, 2; setp.ge.s32 %p25, %r49, %r24; or.pred %p27, %p20, %p25; @%p27 bra BB0_25; ld.global.nc.f32 %f54, [%rd17+8]; add.f32 %f72, %f72, %f54; add.f32 %f71, %f71, 0f3F800000; BB0_25: add.s32 %r50, %r56, 3; setp.ge.s32 %p28, %r50, %r24; or.pred %p30, %p20, %p28; @%p30 bra BB0_27; ld.global.nc.f32 %f55, [%rd17+12]; add.f32 %f72, %f72, %f55; add.f32 %f71, %f71, 0f3F800000; BB0_27: add.s32 %r56, %r56, 4; add.s32 %r53, %r53, 4; setp.lt.s32 %p31, %r53, %r27; add.s64 %rd17, %rd17, 16; @%p31 bra BB0_19; BB0_28: add.s32 %r52, %r52, 1; setp.lt.s32 %p32, %r52, %r28; @%p32 bra BB0_3; BB0_29: cvta.to.global.u64 %rd14, %rd5; mad.lo.s32 %r51, %r2, %r23, %r1; mul.wide.s32 %rd15, %r51, 4; add.s64 %rd16, %rd14, %rd15; div.rn.f32 %f56, %f72, %f71; st.global.f32 [%rd16], %f56; BB0_30: ret; } ` resize_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; .reg .f32 %f<85>; .reg .b32 %r<58>; .reg .b64 %rd<18>; ld.param.u64 %rd5, [resize_param_0]; ld.param.u32 %r23, [resize_param_1]; ld.param.u32 %r29, [resize_param_2]; ld.param.u64 %rd6, [resize_param_4]; ld.param.u32 %r24, [resize_param_5]; ld.param.u32 %r25, [resize_param_6]; ld.param.u32 %r26, 
[resize_param_8]; ld.param.u32 %r27, [resize_param_9]; ld.param.u32 %r28, [resize_param_10]; cvta.to.global.u64 %rd1, %rd6; mov.u32 %r30, %ntid.x; mov.u32 %r31, %ctaid.x; mov.u32 %r32, %tid.x; mad.lo.s32 %r1, %r30, %r31, %r32; mov.u32 %r33, %ntid.y; mov.u32 %r34, %ctaid.y; mov.u32 %r35, %tid.y; mad.lo.s32 %r2, %r33, %r34, %r35; setp.ge.s32 %p1, %r1, %r23; setp.ge.s32 %p2, %r2, %r29; or.pred %p3, %p1, %p2; @%p3 bra BB0_30; mov.f32 %f71, 0f00000000; setp.lt.s32 %p4, %r28, 1; mov.f32 %f72, %f71; @%p4 bra BB0_29; mul.lo.s32 %r3, %r2, %r28; mul.lo.s32 %r4, %r1, %r27; mul.lo.s32 %r5, %r26, %r25; and.b32 %r6, %r27, 3; mov.f32 %f71, 0f00000000; mov.u32 %r52, 0; mov.f32 %f72, %f71; BB0_3: add.s32 %r8, %r52, %r3; setp.lt.s32 %p5, %r27, 1; @%p5 bra BB0_28; add.s32 %r40, %r8, %r5; mul.lo.s32 %r9, %r40, %r24; mov.u32 %r53, 0; mov.f32 %f48, 0f00000000; setp.eq.s32 %p6, %r6, 0; @%p6 bra BB0_5; setp.eq.s32 %p7, %r6, 1; @%p7 bra BB0_13; setp.eq.s32 %p8, %r6, 2; @%p8 bra BB0_10; setp.ge.s32 %p9, %r8, %r25; setp.ge.s32 %p10, %r4, %r24; mov.u32 %r53, 1; or.pred %p11, %p9, %p10; @%p11 bra BB0_10; add.s32 %r43, %r4, %r9; mul.wide.s32 %rd7, %r43, 4; add.s64 %rd8, %rd1, %rd7; ld.global.nc.f32 %f49, [%rd8]; add.f32 %f72, %f72, %f49; add.f32 %f71, %f71, 0f3F800000; BB0_10: add.s32 %r11, %r53, %r4; setp.ge.s32 %p12, %r11, %r24; setp.ge.s32 %p13, %r8, %r25; or.pred %p14, %p13, %p12; @%p14 bra BB0_12; add.s32 %r44, %r11, %r9; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd1, %rd9; ld.global.nc.f32 %f50, [%rd10]; add.f32 %f72, %f72, %f50; add.f32 %f71, %f71, 0f3F800000; BB0_12: add.s32 %r53, %r53, 1; BB0_13: add.s32 %r14, %r53, %r4; setp.ge.s32 %p15, %r14, %r24; setp.ge.s32 %p16, %r8, %r25; or.pred %p17, %p16, %p15; @%p17 bra BB0_14; add.s32 %r45, %r14, %r9; mul.wide.s32 %rd11, %r45, 4; add.s64 %rd12, %rd1, %rd11; ld.global.nc.f32 %f51, [%rd12]; add.f32 %f68, %f72, %f51; add.f32 %f67, %f71, 0f3F800000; bra.uni BB0_16; BB0_5: mov.f32 %f67, %f71; mov.f32 %f68, %f72; mov.f32 %f71, %f48; mov.f32 
%f72, %f48; bra.uni BB0_17; BB0_14: mov.f32 %f68, %f72; mov.f32 %f67, %f71; BB0_16: add.s32 %r53, %r53, 1; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_17: setp.lt.u32 %p18, %r27, 4; @%p18 bra BB0_28; add.s32 %r56, %r4, %r53; mad.lo.s32 %r47, %r24, %r40, %r56; mul.wide.s32 %rd13, %r47, 4; add.s64 %rd17, %rd1, %rd13; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_19: setp.ge.s32 %p19, %r56, %r24; setp.ge.s32 %p20, %r8, %r25; or.pred %p21, %p20, %p19; @%p21 bra BB0_21; ld.global.nc.f32 %f52, [%rd17]; add.f32 %f72, %f72, %f52; add.f32 %f71, %f71, 0f3F800000; BB0_21: add.s32 %r48, %r56, 1; setp.ge.s32 %p22, %r48, %r24; or.pred %p24, %p20, %p22; @%p24 bra BB0_23; ld.global.nc.f32 %f53, [%rd17+4]; add.f32 %f72, %f72, %f53; add.f32 %f71, %f71, 0f3F800000; BB0_23: add.s32 %r49, %r56, 2; setp.ge.s32 %p25, %r49, %r24; or.pred %p27, %p20, %p25; @%p27 bra BB0_25; ld.global.nc.f32 %f54, [%rd17+8]; add.f32 %f72, %f72, %f54; add.f32 %f71, %f71, 0f3F800000; BB0_25: add.s32 %r50, %r56, 3; setp.ge.s32 %p28, %r50, %r24; or.pred %p30, %p20, %p28; @%p30 bra BB0_27; ld.global.nc.f32 %f55, [%rd17+12]; add.f32 %f72, %f72, %f55; add.f32 %f71, %f71, 0f3F800000; BB0_27: add.s32 %r56, %r56, 4; add.s32 %r53, %r53, 4; setp.lt.s32 %p31, %r53, %r27; add.s64 %rd17, %rd17, 16; @%p31 bra BB0_19; BB0_28: add.s32 %r52, %r52, 1; setp.lt.s32 %p32, %r52, %r28; @%p32 bra BB0_3; BB0_29: cvta.to.global.u64 %rd14, %rd5; mad.lo.s32 %r51, %r2, %r23, %r1; mul.wide.s32 %rd15, %r51, 4; add.s64 %rd16, %rd14, %rd15; div.rn.f32 %f56, %f72, %f71; st.global.f32 [%rd16], %f56; BB0_30: ret; } ` resize_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; 
.reg .f32 %f<85>; .reg .b32 %r<58>; .reg .b64 %rd<18>; ld.param.u64 %rd5, [resize_param_0]; ld.param.u32 %r23, [resize_param_1]; ld.param.u32 %r29, [resize_param_2]; ld.param.u64 %rd6, [resize_param_4]; ld.param.u32 %r24, [resize_param_5]; ld.param.u32 %r25, [resize_param_6]; ld.param.u32 %r26, [resize_param_8]; ld.param.u32 %r27, [resize_param_9]; ld.param.u32 %r28, [resize_param_10]; cvta.to.global.u64 %rd1, %rd6; mov.u32 %r30, %ntid.x; mov.u32 %r31, %ctaid.x; mov.u32 %r32, %tid.x; mad.lo.s32 %r1, %r30, %r31, %r32; mov.u32 %r33, %ntid.y; mov.u32 %r34, %ctaid.y; mov.u32 %r35, %tid.y; mad.lo.s32 %r2, %r33, %r34, %r35; setp.ge.s32 %p1, %r1, %r23; setp.ge.s32 %p2, %r2, %r29; or.pred %p3, %p1, %p2; @%p3 bra BB0_30; mov.f32 %f71, 0f00000000; setp.lt.s32 %p4, %r28, 1; mov.f32 %f72, %f71; @%p4 bra BB0_29; mul.lo.s32 %r3, %r2, %r28; mul.lo.s32 %r4, %r1, %r27; mul.lo.s32 %r5, %r26, %r25; and.b32 %r6, %r27, 3; mov.f32 %f71, 0f00000000; mov.u32 %r52, 0; mov.f32 %f72, %f71; BB0_3: add.s32 %r8, %r52, %r3; setp.lt.s32 %p5, %r27, 1; @%p5 bra BB0_28; add.s32 %r40, %r8, %r5; mul.lo.s32 %r9, %r40, %r24; mov.u32 %r53, 0; mov.f32 %f48, 0f00000000; setp.eq.s32 %p6, %r6, 0; @%p6 bra BB0_5; setp.eq.s32 %p7, %r6, 1; @%p7 bra BB0_13; setp.eq.s32 %p8, %r6, 2; @%p8 bra BB0_10; setp.ge.s32 %p9, %r8, %r25; setp.ge.s32 %p10, %r4, %r24; mov.u32 %r53, 1; or.pred %p11, %p9, %p10; @%p11 bra BB0_10; add.s32 %r43, %r4, %r9; mul.wide.s32 %rd7, %r43, 4; add.s64 %rd8, %rd1, %rd7; ld.global.nc.f32 %f49, [%rd8]; add.f32 %f72, %f72, %f49; add.f32 %f71, %f71, 0f3F800000; BB0_10: add.s32 %r11, %r53, %r4; setp.ge.s32 %p12, %r11, %r24; setp.ge.s32 %p13, %r8, %r25; or.pred %p14, %p13, %p12; @%p14 bra BB0_12; add.s32 %r44, %r11, %r9; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd1, %rd9; ld.global.nc.f32 %f50, [%rd10]; add.f32 %f72, %f72, %f50; add.f32 %f71, %f71, 0f3F800000; BB0_12: add.s32 %r53, %r53, 1; BB0_13: add.s32 %r14, %r53, %r4; setp.ge.s32 %p15, %r14, %r24; setp.ge.s32 %p16, %r8, %r25; or.pred %p17, 
%p16, %p15; @%p17 bra BB0_14; add.s32 %r45, %r14, %r9; mul.wide.s32 %rd11, %r45, 4; add.s64 %rd12, %rd1, %rd11; ld.global.nc.f32 %f51, [%rd12]; add.f32 %f68, %f72, %f51; add.f32 %f67, %f71, 0f3F800000; bra.uni BB0_16; BB0_5: mov.f32 %f67, %f71; mov.f32 %f68, %f72; mov.f32 %f71, %f48; mov.f32 %f72, %f48; bra.uni BB0_17; BB0_14: mov.f32 %f68, %f72; mov.f32 %f67, %f71; BB0_16: add.s32 %r53, %r53, 1; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_17: setp.lt.u32 %p18, %r27, 4; @%p18 bra BB0_28; add.s32 %r56, %r4, %r53; mad.lo.s32 %r47, %r24, %r40, %r56; mul.wide.s32 %rd13, %r47, 4; add.s64 %rd17, %rd1, %rd13; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_19: setp.ge.s32 %p19, %r56, %r24; setp.ge.s32 %p20, %r8, %r25; or.pred %p21, %p20, %p19; @%p21 bra BB0_21; ld.global.nc.f32 %f52, [%rd17]; add.f32 %f72, %f72, %f52; add.f32 %f71, %f71, 0f3F800000; BB0_21: add.s32 %r48, %r56, 1; setp.ge.s32 %p22, %r48, %r24; or.pred %p24, %p20, %p22; @%p24 bra BB0_23; ld.global.nc.f32 %f53, [%rd17+4]; add.f32 %f72, %f72, %f53; add.f32 %f71, %f71, 0f3F800000; BB0_23: add.s32 %r49, %r56, 2; setp.ge.s32 %p25, %r49, %r24; or.pred %p27, %p20, %p25; @%p27 bra BB0_25; ld.global.nc.f32 %f54, [%rd17+8]; add.f32 %f72, %f72, %f54; add.f32 %f71, %f71, 0f3F800000; BB0_25: add.s32 %r50, %r56, 3; setp.ge.s32 %p28, %r50, %r24; or.pred %p30, %p20, %p28; @%p30 bra BB0_27; ld.global.nc.f32 %f55, [%rd17+12]; add.f32 %f72, %f72, %f55; add.f32 %f71, %f71, 0f3F800000; BB0_27: add.s32 %r56, %r56, 4; add.s32 %r53, %r53, 4; setp.lt.s32 %p31, %r53, %r27; add.s64 %rd17, %rd17, 16; @%p31 bra BB0_19; BB0_28: add.s32 %r52, %r52, 1; setp.lt.s32 %p32, %r52, %r28; @%p32 bra BB0_3; BB0_29: cvta.to.global.u64 %rd14, %rd5; mad.lo.s32 %r51, %r2, %r23, %r1; mul.wide.s32 %rd15, %r51, 4; add.s64 %rd16, %rd14, %rd15; div.rn.f32 %f56, %f72, %f71; st.global.f32 [%rd16], %f56; BB0_30: ret; } ` resize_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 
resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; .reg .f32 %f<85>; .reg .b32 %r<58>; .reg .b64 %rd<18>; ld.param.u64 %rd5, [resize_param_0]; ld.param.u32 %r23, [resize_param_1]; ld.param.u32 %r29, [resize_param_2]; ld.param.u64 %rd6, [resize_param_4]; ld.param.u32 %r24, [resize_param_5]; ld.param.u32 %r25, [resize_param_6]; ld.param.u32 %r26, [resize_param_8]; ld.param.u32 %r27, [resize_param_9]; ld.param.u32 %r28, [resize_param_10]; cvta.to.global.u64 %rd1, %rd6; mov.u32 %r30, %ntid.x; mov.u32 %r31, %ctaid.x; mov.u32 %r32, %tid.x; mad.lo.s32 %r1, %r30, %r31, %r32; mov.u32 %r33, %ntid.y; mov.u32 %r34, %ctaid.y; mov.u32 %r35, %tid.y; mad.lo.s32 %r2, %r33, %r34, %r35; setp.ge.s32 %p1, %r1, %r23; setp.ge.s32 %p2, %r2, %r29; or.pred %p3, %p1, %p2; @%p3 bra BB0_30; mov.f32 %f71, 0f00000000; setp.lt.s32 %p4, %r28, 1; mov.f32 %f72, %f71; @%p4 bra BB0_29; mul.lo.s32 %r3, %r2, %r28; mul.lo.s32 %r4, %r1, %r27; mul.lo.s32 %r5, %r26, %r25; and.b32 %r6, %r27, 3; mov.f32 %f71, 0f00000000; mov.u32 %r52, 0; mov.f32 %f72, %f71; BB0_3: add.s32 %r8, %r52, %r3; setp.lt.s32 %p5, %r27, 1; @%p5 bra BB0_28; add.s32 %r40, %r8, %r5; mul.lo.s32 %r9, %r40, %r24; mov.u32 %r53, 0; mov.f32 %f48, 0f00000000; setp.eq.s32 %p6, %r6, 0; @%p6 bra BB0_5; setp.eq.s32 %p7, %r6, 1; @%p7 bra BB0_13; setp.eq.s32 %p8, %r6, 2; @%p8 bra BB0_10; setp.ge.s32 %p9, %r8, %r25; setp.ge.s32 %p10, %r4, %r24; mov.u32 %r53, 1; or.pred %p11, %p9, %p10; @%p11 bra BB0_10; add.s32 %r43, %r4, %r9; mul.wide.s32 %rd7, %r43, 4; add.s64 %rd8, %rd1, %rd7; ld.global.nc.f32 %f49, [%rd8]; add.f32 %f72, %f72, %f49; add.f32 %f71, %f71, 0f3F800000; BB0_10: add.s32 %r11, %r53, %r4; setp.ge.s32 %p12, %r11, %r24; setp.ge.s32 %p13, %r8, %r25; or.pred %p14, %p13, %p12; @%p14 bra BB0_12; add.s32 %r44, 
%r11, %r9; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd1, %rd9; ld.global.nc.f32 %f50, [%rd10]; add.f32 %f72, %f72, %f50; add.f32 %f71, %f71, 0f3F800000; BB0_12: add.s32 %r53, %r53, 1; BB0_13: add.s32 %r14, %r53, %r4; setp.ge.s32 %p15, %r14, %r24; setp.ge.s32 %p16, %r8, %r25; or.pred %p17, %p16, %p15; @%p17 bra BB0_14; add.s32 %r45, %r14, %r9; mul.wide.s32 %rd11, %r45, 4; add.s64 %rd12, %rd1, %rd11; ld.global.nc.f32 %f51, [%rd12]; add.f32 %f68, %f72, %f51; add.f32 %f67, %f71, 0f3F800000; bra.uni BB0_16; BB0_5: mov.f32 %f67, %f71; mov.f32 %f68, %f72; mov.f32 %f71, %f48; mov.f32 %f72, %f48; bra.uni BB0_17; BB0_14: mov.f32 %f68, %f72; mov.f32 %f67, %f71; BB0_16: add.s32 %r53, %r53, 1; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_17: setp.lt.u32 %p18, %r27, 4; @%p18 bra BB0_28; add.s32 %r56, %r4, %r53; mad.lo.s32 %r47, %r24, %r40, %r56; mul.wide.s32 %rd13, %r47, 4; add.s64 %rd17, %rd1, %rd13; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_19: setp.ge.s32 %p19, %r56, %r24; setp.ge.s32 %p20, %r8, %r25; or.pred %p21, %p20, %p19; @%p21 bra BB0_21; ld.global.nc.f32 %f52, [%rd17]; add.f32 %f72, %f72, %f52; add.f32 %f71, %f71, 0f3F800000; BB0_21: add.s32 %r48, %r56, 1; setp.ge.s32 %p22, %r48, %r24; or.pred %p24, %p20, %p22; @%p24 bra BB0_23; ld.global.nc.f32 %f53, [%rd17+4]; add.f32 %f72, %f72, %f53; add.f32 %f71, %f71, 0f3F800000; BB0_23: add.s32 %r49, %r56, 2; setp.ge.s32 %p25, %r49, %r24; or.pred %p27, %p20, %p25; @%p27 bra BB0_25; ld.global.nc.f32 %f54, [%rd17+8]; add.f32 %f72, %f72, %f54; add.f32 %f71, %f71, 0f3F800000; BB0_25: add.s32 %r50, %r56, 3; setp.ge.s32 %p28, %r50, %r24; or.pred %p30, %p20, %p28; @%p30 bra BB0_27; ld.global.nc.f32 %f55, [%rd17+12]; add.f32 %f72, %f72, %f55; add.f32 %f71, %f71, 0f3F800000; BB0_27: add.s32 %r56, %r56, 4; add.s32 %r53, %r53, 4; setp.lt.s32 %p31, %r53, %r27; add.s64 %rd17, %rd17, 16; @%p31 bra BB0_19; BB0_28: add.s32 %r52, %r52, 1; setp.lt.s32 %p32, %r52, %r28; @%p32 bra BB0_3; BB0_29: cvta.to.global.u64 %rd14, %rd5; mad.lo.s32 %r51, 
%r2, %r23, %r1; mul.wide.s32 %rd15, %r51, 4; add.s64 %rd16, %rd14, %rd15; div.rn.f32 %f56, %f72, %f71; st.global.f32 [%rd16], %f56; BB0_30: ret; } ` resize_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; .reg .f32 %f<85>; .reg .b32 %r<58>; .reg .b64 %rd<18>; ld.param.u64 %rd5, [resize_param_0]; ld.param.u32 %r23, [resize_param_1]; ld.param.u32 %r29, [resize_param_2]; ld.param.u64 %rd6, [resize_param_4]; ld.param.u32 %r24, [resize_param_5]; ld.param.u32 %r25, [resize_param_6]; ld.param.u32 %r26, [resize_param_8]; ld.param.u32 %r27, [resize_param_9]; ld.param.u32 %r28, [resize_param_10]; cvta.to.global.u64 %rd1, %rd6; mov.u32 %r30, %ntid.x; mov.u32 %r31, %ctaid.x; mov.u32 %r32, %tid.x; mad.lo.s32 %r1, %r30, %r31, %r32; mov.u32 %r33, %ntid.y; mov.u32 %r34, %ctaid.y; mov.u32 %r35, %tid.y; mad.lo.s32 %r2, %r33, %r34, %r35; setp.ge.s32 %p1, %r1, %r23; setp.ge.s32 %p2, %r2, %r29; or.pred %p3, %p1, %p2; @%p3 bra BB0_30; mov.f32 %f71, 0f00000000; setp.lt.s32 %p4, %r28, 1; mov.f32 %f72, %f71; @%p4 bra BB0_29; mul.lo.s32 %r3, %r2, %r28; mul.lo.s32 %r4, %r1, %r27; mul.lo.s32 %r5, %r26, %r25; and.b32 %r6, %r27, 3; mov.f32 %f71, 0f00000000; mov.u32 %r52, 0; mov.f32 %f72, %f71; BB0_3: add.s32 %r8, %r52, %r3; setp.lt.s32 %p5, %r27, 1; @%p5 bra BB0_28; add.s32 %r40, %r8, %r5; mul.lo.s32 %r9, %r40, %r24; mov.u32 %r53, 0; mov.f32 %f48, 0f00000000; setp.eq.s32 %p6, %r6, 0; @%p6 bra BB0_5; setp.eq.s32 %p7, %r6, 1; @%p7 bra BB0_13; setp.eq.s32 %p8, %r6, 2; @%p8 bra BB0_10; setp.ge.s32 %p9, %r8, %r25; setp.ge.s32 %p10, %r4, %r24; mov.u32 %r53, 1; or.pred %p11, %p9, %p10; @%p11 bra BB0_10; add.s32 %r43, %r4, %r9; 
mul.wide.s32 %rd7, %r43, 4; add.s64 %rd8, %rd1, %rd7; ld.global.nc.f32 %f49, [%rd8]; add.f32 %f72, %f72, %f49; add.f32 %f71, %f71, 0f3F800000; BB0_10: add.s32 %r11, %r53, %r4; setp.ge.s32 %p12, %r11, %r24; setp.ge.s32 %p13, %r8, %r25; or.pred %p14, %p13, %p12; @%p14 bra BB0_12; add.s32 %r44, %r11, %r9; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd1, %rd9; ld.global.nc.f32 %f50, [%rd10]; add.f32 %f72, %f72, %f50; add.f32 %f71, %f71, 0f3F800000; BB0_12: add.s32 %r53, %r53, 1; BB0_13: add.s32 %r14, %r53, %r4; setp.ge.s32 %p15, %r14, %r24; setp.ge.s32 %p16, %r8, %r25; or.pred %p17, %p16, %p15; @%p17 bra BB0_14; add.s32 %r45, %r14, %r9; mul.wide.s32 %rd11, %r45, 4; add.s64 %rd12, %rd1, %rd11; ld.global.nc.f32 %f51, [%rd12]; add.f32 %f68, %f72, %f51; add.f32 %f67, %f71, 0f3F800000; bra.uni BB0_16; BB0_5: mov.f32 %f67, %f71; mov.f32 %f68, %f72; mov.f32 %f71, %f48; mov.f32 %f72, %f48; bra.uni BB0_17; BB0_14: mov.f32 %f68, %f72; mov.f32 %f67, %f71; BB0_16: add.s32 %r53, %r53, 1; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_17: setp.lt.u32 %p18, %r27, 4; @%p18 bra BB0_28; add.s32 %r56, %r4, %r53; mad.lo.s32 %r47, %r24, %r40, %r56; mul.wide.s32 %rd13, %r47, 4; add.s64 %rd17, %rd1, %rd13; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_19: setp.ge.s32 %p19, %r56, %r24; setp.ge.s32 %p20, %r8, %r25; or.pred %p21, %p20, %p19; @%p21 bra BB0_21; ld.global.nc.f32 %f52, [%rd17]; add.f32 %f72, %f72, %f52; add.f32 %f71, %f71, 0f3F800000; BB0_21: add.s32 %r48, %r56, 1; setp.ge.s32 %p22, %r48, %r24; or.pred %p24, %p20, %p22; @%p24 bra BB0_23; ld.global.nc.f32 %f53, [%rd17+4]; add.f32 %f72, %f72, %f53; add.f32 %f71, %f71, 0f3F800000; BB0_23: add.s32 %r49, %r56, 2; setp.ge.s32 %p25, %r49, %r24; or.pred %p27, %p20, %p25; @%p27 bra BB0_25; ld.global.nc.f32 %f54, [%rd17+8]; add.f32 %f72, %f72, %f54; add.f32 %f71, %f71, 0f3F800000; BB0_25: add.s32 %r50, %r56, 3; setp.ge.s32 %p28, %r50, %r24; or.pred %p30, %p20, %p28; @%p30 bra BB0_27; ld.global.nc.f32 %f55, [%rd17+12]; add.f32 %f72, %f72, %f55; 
add.f32 %f71, %f71, 0f3F800000; BB0_27: add.s32 %r56, %r56, 4; add.s32 %r53, %r53, 4; setp.lt.s32 %p31, %r53, %r27; add.s64 %rd17, %rd17, 16; @%p31 bra BB0_19; BB0_28: add.s32 %r52, %r52, 1; setp.lt.s32 %p32, %r52, %r28; @%p32 bra BB0_3; BB0_29: cvta.to.global.u64 %rd14, %rd5; mad.lo.s32 %r51, %r2, %r23, %r1; mul.wide.s32 %rd15, %r51, 4; add.s64 %rd16, %rd14, %rd15; div.rn.f32 %f56, %f72, %f71; st.global.f32 [%rd16], %f56; BB0_30: ret; } ` resize_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; .reg .f32 %f<85>; .reg .b32 %r<58>; .reg .b64 %rd<18>; ld.param.u64 %rd5, [resize_param_0]; ld.param.u32 %r23, [resize_param_1]; ld.param.u32 %r29, [resize_param_2]; ld.param.u64 %rd6, [resize_param_4]; ld.param.u32 %r24, [resize_param_5]; ld.param.u32 %r25, [resize_param_6]; ld.param.u32 %r26, [resize_param_8]; ld.param.u32 %r27, [resize_param_9]; ld.param.u32 %r28, [resize_param_10]; cvta.to.global.u64 %rd1, %rd6; mov.u32 %r30, %ntid.x; mov.u32 %r31, %ctaid.x; mov.u32 %r32, %tid.x; mad.lo.s32 %r1, %r30, %r31, %r32; mov.u32 %r33, %ntid.y; mov.u32 %r34, %ctaid.y; mov.u32 %r35, %tid.y; mad.lo.s32 %r2, %r33, %r34, %r35; setp.ge.s32 %p1, %r1, %r23; setp.ge.s32 %p2, %r2, %r29; or.pred %p3, %p1, %p2; @%p3 bra BB0_30; mov.f32 %f71, 0f00000000; setp.lt.s32 %p4, %r28, 1; mov.f32 %f72, %f71; @%p4 bra BB0_29; mul.lo.s32 %r3, %r2, %r28; mul.lo.s32 %r4, %r1, %r27; mul.lo.s32 %r5, %r26, %r25; and.b32 %r6, %r27, 3; mov.f32 %f71, 0f00000000; mov.u32 %r52, 0; mov.f32 %f72, %f71; BB0_3: add.s32 %r8, %r52, %r3; setp.lt.s32 %p5, %r27, 1; @%p5 bra BB0_28; add.s32 %r40, %r8, %r5; mul.lo.s32 %r9, %r40, %r24; mov.u32 %r53, 0; 
mov.f32 %f48, 0f00000000; setp.eq.s32 %p6, %r6, 0; @%p6 bra BB0_5; setp.eq.s32 %p7, %r6, 1; @%p7 bra BB0_13; setp.eq.s32 %p8, %r6, 2; @%p8 bra BB0_10; setp.ge.s32 %p9, %r8, %r25; setp.ge.s32 %p10, %r4, %r24; mov.u32 %r53, 1; or.pred %p11, %p9, %p10; @%p11 bra BB0_10; add.s32 %r43, %r4, %r9; mul.wide.s32 %rd7, %r43, 4; add.s64 %rd8, %rd1, %rd7; ld.global.nc.f32 %f49, [%rd8]; add.f32 %f72, %f72, %f49; add.f32 %f71, %f71, 0f3F800000; BB0_10: add.s32 %r11, %r53, %r4; setp.ge.s32 %p12, %r11, %r24; setp.ge.s32 %p13, %r8, %r25; or.pred %p14, %p13, %p12; @%p14 bra BB0_12; add.s32 %r44, %r11, %r9; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd1, %rd9; ld.global.nc.f32 %f50, [%rd10]; add.f32 %f72, %f72, %f50; add.f32 %f71, %f71, 0f3F800000; BB0_12: add.s32 %r53, %r53, 1; BB0_13: add.s32 %r14, %r53, %r4; setp.ge.s32 %p15, %r14, %r24; setp.ge.s32 %p16, %r8, %r25; or.pred %p17, %p16, %p15; @%p17 bra BB0_14; add.s32 %r45, %r14, %r9; mul.wide.s32 %rd11, %r45, 4; add.s64 %rd12, %rd1, %rd11; ld.global.nc.f32 %f51, [%rd12]; add.f32 %f68, %f72, %f51; add.f32 %f67, %f71, 0f3F800000; bra.uni BB0_16; BB0_5: mov.f32 %f67, %f71; mov.f32 %f68, %f72; mov.f32 %f71, %f48; mov.f32 %f72, %f48; bra.uni BB0_17; BB0_14: mov.f32 %f68, %f72; mov.f32 %f67, %f71; BB0_16: add.s32 %r53, %r53, 1; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_17: setp.lt.u32 %p18, %r27, 4; @%p18 bra BB0_28; add.s32 %r56, %r4, %r53; mad.lo.s32 %r47, %r24, %r40, %r56; mul.wide.s32 %rd13, %r47, 4; add.s64 %rd17, %rd1, %rd13; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_19: setp.ge.s32 %p19, %r56, %r24; setp.ge.s32 %p20, %r8, %r25; or.pred %p21, %p20, %p19; @%p21 bra BB0_21; ld.global.nc.f32 %f52, [%rd17]; add.f32 %f72, %f72, %f52; add.f32 %f71, %f71, 0f3F800000; BB0_21: add.s32 %r48, %r56, 1; setp.ge.s32 %p22, %r48, %r24; or.pred %p24, %p20, %p22; @%p24 bra BB0_23; ld.global.nc.f32 %f53, [%rd17+4]; add.f32 %f72, %f72, %f53; add.f32 %f71, %f71, 0f3F800000; BB0_23: add.s32 %r49, %r56, 2; setp.ge.s32 %p25, %r49, %r24; or.pred 
%p27, %p20, %p25; @%p27 bra BB0_25; ld.global.nc.f32 %f54, [%rd17+8]; add.f32 %f72, %f72, %f54; add.f32 %f71, %f71, 0f3F800000; BB0_25: add.s32 %r50, %r56, 3; setp.ge.s32 %p28, %r50, %r24; or.pred %p30, %p20, %p28; @%p30 bra BB0_27; ld.global.nc.f32 %f55, [%rd17+12]; add.f32 %f72, %f72, %f55; add.f32 %f71, %f71, 0f3F800000; BB0_27: add.s32 %r56, %r56, 4; add.s32 %r53, %r53, 4; setp.lt.s32 %p31, %r53, %r27; add.s64 %rd17, %rd17, 16; @%p31 bra BB0_19; BB0_28: add.s32 %r52, %r52, 1; setp.lt.s32 %p32, %r52, %r28; @%p32 bra BB0_3; BB0_29: cvta.to.global.u64 %rd14, %rd5; mad.lo.s32 %r51, %r2, %r23, %r1; mul.wide.s32 %rd15, %r51, 4; add.s64 %rd16, %rd14, %rd15; div.rn.f32 %f56, %f72, %f71; st.global.f32 [%rd16], %f56; BB0_30: ret; } ` resize_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; .reg .f32 %f<85>; .reg .b32 %r<58>; .reg .b64 %rd<18>; ld.param.u64 %rd5, [resize_param_0]; ld.param.u32 %r23, [resize_param_1]; ld.param.u32 %r29, [resize_param_2]; ld.param.u64 %rd6, [resize_param_4]; ld.param.u32 %r24, [resize_param_5]; ld.param.u32 %r25, [resize_param_6]; ld.param.u32 %r26, [resize_param_8]; ld.param.u32 %r27, [resize_param_9]; ld.param.u32 %r28, [resize_param_10]; cvta.to.global.u64 %rd1, %rd6; mov.u32 %r30, %ntid.x; mov.u32 %r31, %ctaid.x; mov.u32 %r32, %tid.x; mad.lo.s32 %r1, %r30, %r31, %r32; mov.u32 %r33, %ntid.y; mov.u32 %r34, %ctaid.y; mov.u32 %r35, %tid.y; mad.lo.s32 %r2, %r33, %r34, %r35; setp.ge.s32 %p1, %r1, %r23; setp.ge.s32 %p2, %r2, %r29; or.pred %p3, %p1, %p2; @%p3 bra BB0_30; mov.f32 %f71, 0f00000000; setp.lt.s32 %p4, %r28, 1; mov.f32 %f72, %f71; @%p4 bra BB0_29; mul.lo.s32 %r3, 
%r2, %r28; mul.lo.s32 %r4, %r1, %r27; mul.lo.s32 %r5, %r26, %r25; and.b32 %r6, %r27, 3; mov.f32 %f71, 0f00000000; mov.u32 %r52, 0; mov.f32 %f72, %f71; BB0_3: add.s32 %r8, %r52, %r3; setp.lt.s32 %p5, %r27, 1; @%p5 bra BB0_28; add.s32 %r40, %r8, %r5; mul.lo.s32 %r9, %r40, %r24; mov.u32 %r53, 0; mov.f32 %f48, 0f00000000; setp.eq.s32 %p6, %r6, 0; @%p6 bra BB0_5; setp.eq.s32 %p7, %r6, 1; @%p7 bra BB0_13; setp.eq.s32 %p8, %r6, 2; @%p8 bra BB0_10; setp.ge.s32 %p9, %r8, %r25; setp.ge.s32 %p10, %r4, %r24; mov.u32 %r53, 1; or.pred %p11, %p9, %p10; @%p11 bra BB0_10; add.s32 %r43, %r4, %r9; mul.wide.s32 %rd7, %r43, 4; add.s64 %rd8, %rd1, %rd7; ld.global.nc.f32 %f49, [%rd8]; add.f32 %f72, %f72, %f49; add.f32 %f71, %f71, 0f3F800000; BB0_10: add.s32 %r11, %r53, %r4; setp.ge.s32 %p12, %r11, %r24; setp.ge.s32 %p13, %r8, %r25; or.pred %p14, %p13, %p12; @%p14 bra BB0_12; add.s32 %r44, %r11, %r9; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd1, %rd9; ld.global.nc.f32 %f50, [%rd10]; add.f32 %f72, %f72, %f50; add.f32 %f71, %f71, 0f3F800000; BB0_12: add.s32 %r53, %r53, 1; BB0_13: add.s32 %r14, %r53, %r4; setp.ge.s32 %p15, %r14, %r24; setp.ge.s32 %p16, %r8, %r25; or.pred %p17, %p16, %p15; @%p17 bra BB0_14; add.s32 %r45, %r14, %r9; mul.wide.s32 %rd11, %r45, 4; add.s64 %rd12, %rd1, %rd11; ld.global.nc.f32 %f51, [%rd12]; add.f32 %f68, %f72, %f51; add.f32 %f67, %f71, 0f3F800000; bra.uni BB0_16; BB0_5: mov.f32 %f67, %f71; mov.f32 %f68, %f72; mov.f32 %f71, %f48; mov.f32 %f72, %f48; bra.uni BB0_17; BB0_14: mov.f32 %f68, %f72; mov.f32 %f67, %f71; BB0_16: add.s32 %r53, %r53, 1; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_17: setp.lt.u32 %p18, %r27, 4; @%p18 bra BB0_28; add.s32 %r56, %r4, %r53; mad.lo.s32 %r47, %r24, %r40, %r56; mul.wide.s32 %rd13, %r47, 4; add.s64 %rd17, %rd1, %rd13; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_19: setp.ge.s32 %p19, %r56, %r24; setp.ge.s32 %p20, %r8, %r25; or.pred %p21, %p20, %p19; @%p21 bra BB0_21; ld.global.nc.f32 %f52, [%rd17]; add.f32 %f72, %f72, %f52; add.f32 
%f71, %f71, 0f3F800000; BB0_21: add.s32 %r48, %r56, 1; setp.ge.s32 %p22, %r48, %r24; or.pred %p24, %p20, %p22; @%p24 bra BB0_23; ld.global.nc.f32 %f53, [%rd17+4]; add.f32 %f72, %f72, %f53; add.f32 %f71, %f71, 0f3F800000; BB0_23: add.s32 %r49, %r56, 2; setp.ge.s32 %p25, %r49, %r24; or.pred %p27, %p20, %p25; @%p27 bra BB0_25; ld.global.nc.f32 %f54, [%rd17+8]; add.f32 %f72, %f72, %f54; add.f32 %f71, %f71, 0f3F800000; BB0_25: add.s32 %r50, %r56, 3; setp.ge.s32 %p28, %r50, %r24; or.pred %p30, %p20, %p28; @%p30 bra BB0_27; ld.global.nc.f32 %f55, [%rd17+12]; add.f32 %f72, %f72, %f55; add.f32 %f71, %f71, 0f3F800000; BB0_27: add.s32 %r56, %r56, 4; add.s32 %r53, %r53, 4; setp.lt.s32 %p31, %r53, %r27; add.s64 %rd17, %rd17, 16; @%p31 bra BB0_19; BB0_28: add.s32 %r52, %r52, 1; setp.lt.s32 %p32, %r52, %r28; @%p32 bra BB0_3; BB0_29: cvta.to.global.u64 %rd14, %rd5; mad.lo.s32 %r51, %r2, %r23, %r1; mul.wide.s32 %rd15, %r51, 4; add.s64 %rd16, %rd14, %rd15; div.rn.f32 %f56, %f72, %f71; st.global.f32 [%rd16], %f56; BB0_30: ret; } ` resize_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; .reg .f32 %f<85>; .reg .b32 %r<58>; .reg .b64 %rd<18>; ld.param.u64 %rd5, [resize_param_0]; ld.param.u32 %r23, [resize_param_1]; ld.param.u32 %r29, [resize_param_2]; ld.param.u64 %rd6, [resize_param_4]; ld.param.u32 %r24, [resize_param_5]; ld.param.u32 %r25, [resize_param_6]; ld.param.u32 %r26, [resize_param_8]; ld.param.u32 %r27, [resize_param_9]; ld.param.u32 %r28, [resize_param_10]; cvta.to.global.u64 %rd1, %rd6; mov.u32 %r30, %ntid.x; mov.u32 %r31, %ctaid.x; mov.u32 %r32, %tid.x; mad.lo.s32 %r1, %r30, %r31, %r32; mov.u32 %r33, 
%ntid.y; mov.u32 %r34, %ctaid.y; mov.u32 %r35, %tid.y; mad.lo.s32 %r2, %r33, %r34, %r35; setp.ge.s32 %p1, %r1, %r23; setp.ge.s32 %p2, %r2, %r29; or.pred %p3, %p1, %p2; @%p3 bra BB0_30; mov.f32 %f71, 0f00000000; setp.lt.s32 %p4, %r28, 1; mov.f32 %f72, %f71; @%p4 bra BB0_29; mul.lo.s32 %r3, %r2, %r28; mul.lo.s32 %r4, %r1, %r27; mul.lo.s32 %r5, %r26, %r25; and.b32 %r6, %r27, 3; mov.f32 %f71, 0f00000000; mov.u32 %r52, 0; mov.f32 %f72, %f71; BB0_3: add.s32 %r8, %r52, %r3; setp.lt.s32 %p5, %r27, 1; @%p5 bra BB0_28; add.s32 %r40, %r8, %r5; mul.lo.s32 %r9, %r40, %r24; mov.u32 %r53, 0; mov.f32 %f48, 0f00000000; setp.eq.s32 %p6, %r6, 0; @%p6 bra BB0_5; setp.eq.s32 %p7, %r6, 1; @%p7 bra BB0_13; setp.eq.s32 %p8, %r6, 2; @%p8 bra BB0_10; setp.ge.s32 %p9, %r8, %r25; setp.ge.s32 %p10, %r4, %r24; mov.u32 %r53, 1; or.pred %p11, %p9, %p10; @%p11 bra BB0_10; add.s32 %r43, %r4, %r9; mul.wide.s32 %rd7, %r43, 4; add.s64 %rd8, %rd1, %rd7; ld.global.nc.f32 %f49, [%rd8]; add.f32 %f72, %f72, %f49; add.f32 %f71, %f71, 0f3F800000; BB0_10: add.s32 %r11, %r53, %r4; setp.ge.s32 %p12, %r11, %r24; setp.ge.s32 %p13, %r8, %r25; or.pred %p14, %p13, %p12; @%p14 bra BB0_12; add.s32 %r44, %r11, %r9; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd1, %rd9; ld.global.nc.f32 %f50, [%rd10]; add.f32 %f72, %f72, %f50; add.f32 %f71, %f71, 0f3F800000; BB0_12: add.s32 %r53, %r53, 1; BB0_13: add.s32 %r14, %r53, %r4; setp.ge.s32 %p15, %r14, %r24; setp.ge.s32 %p16, %r8, %r25; or.pred %p17, %p16, %p15; @%p17 bra BB0_14; add.s32 %r45, %r14, %r9; mul.wide.s32 %rd11, %r45, 4; add.s64 %rd12, %rd1, %rd11; ld.global.nc.f32 %f51, [%rd12]; add.f32 %f68, %f72, %f51; add.f32 %f67, %f71, 0f3F800000; bra.uni BB0_16; BB0_5: mov.f32 %f67, %f71; mov.f32 %f68, %f72; mov.f32 %f71, %f48; mov.f32 %f72, %f48; bra.uni BB0_17; BB0_14: mov.f32 %f68, %f72; mov.f32 %f67, %f71; BB0_16: add.s32 %r53, %r53, 1; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_17: setp.lt.u32 %p18, %r27, 4; @%p18 bra BB0_28; add.s32 %r56, %r4, %r53; mad.lo.s32 %r47, 
%r24, %r40, %r56; mul.wide.s32 %rd13, %r47, 4; add.s64 %rd17, %rd1, %rd13; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_19: setp.ge.s32 %p19, %r56, %r24; setp.ge.s32 %p20, %r8, %r25; or.pred %p21, %p20, %p19; @%p21 bra BB0_21; ld.global.nc.f32 %f52, [%rd17]; add.f32 %f72, %f72, %f52; add.f32 %f71, %f71, 0f3F800000; BB0_21: add.s32 %r48, %r56, 1; setp.ge.s32 %p22, %r48, %r24; or.pred %p24, %p20, %p22; @%p24 bra BB0_23; ld.global.nc.f32 %f53, [%rd17+4]; add.f32 %f72, %f72, %f53; add.f32 %f71, %f71, 0f3F800000; BB0_23: add.s32 %r49, %r56, 2; setp.ge.s32 %p25, %r49, %r24; or.pred %p27, %p20, %p25; @%p27 bra BB0_25; ld.global.nc.f32 %f54, [%rd17+8]; add.f32 %f72, %f72, %f54; add.f32 %f71, %f71, 0f3F800000; BB0_25: add.s32 %r50, %r56, 3; setp.ge.s32 %p28, %r50, %r24; or.pred %p30, %p20, %p28; @%p30 bra BB0_27; ld.global.nc.f32 %f55, [%rd17+12]; add.f32 %f72, %f72, %f55; add.f32 %f71, %f71, 0f3F800000; BB0_27: add.s32 %r56, %r56, 4; add.s32 %r53, %r53, 4; setp.lt.s32 %p31, %r53, %r27; add.s64 %rd17, %rd17, 16; @%p31 bra BB0_19; BB0_28: add.s32 %r52, %r52, 1; setp.lt.s32 %p32, %r52, %r28; @%p32 bra BB0_3; BB0_29: cvta.to.global.u64 %rd14, %rd5; mad.lo.s32 %r51, %r2, %r23, %r1; mul.wide.s32 %rd15, %r51, 4; add.s64 %rd16, %rd14, %rd15; div.rn.f32 %f56, %f72, %f71; st.global.f32 [%rd16], %f56; BB0_30: ret; } ` resize_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, .param .u32 resize_param_10 ) { .reg .pred %p<33>; .reg .f32 %f<85>; .reg .b32 %r<58>; .reg .b64 %rd<18>; ld.param.u64 %rd5, [resize_param_0]; ld.param.u32 %r23, [resize_param_1]; ld.param.u32 %r29, [resize_param_2]; ld.param.u64 %rd6, [resize_param_4]; ld.param.u32 %r24, [resize_param_5]; 
ld.param.u32 %r25, [resize_param_6]; ld.param.u32 %r26, [resize_param_8]; ld.param.u32 %r27, [resize_param_9]; ld.param.u32 %r28, [resize_param_10]; cvta.to.global.u64 %rd1, %rd6; mov.u32 %r30, %ntid.x; mov.u32 %r31, %ctaid.x; mov.u32 %r32, %tid.x; mad.lo.s32 %r1, %r30, %r31, %r32; mov.u32 %r33, %ntid.y; mov.u32 %r34, %ctaid.y; mov.u32 %r35, %tid.y; mad.lo.s32 %r2, %r33, %r34, %r35; setp.ge.s32 %p1, %r1, %r23; setp.ge.s32 %p2, %r2, %r29; or.pred %p3, %p1, %p2; @%p3 bra BB0_30; mov.f32 %f71, 0f00000000; setp.lt.s32 %p4, %r28, 1; mov.f32 %f72, %f71; @%p4 bra BB0_29; mul.lo.s32 %r3, %r2, %r28; mul.lo.s32 %r4, %r1, %r27; mul.lo.s32 %r5, %r26, %r25; and.b32 %r6, %r27, 3; mov.f32 %f71, 0f00000000; mov.u32 %r52, 0; mov.f32 %f72, %f71; BB0_3: add.s32 %r8, %r52, %r3; setp.lt.s32 %p5, %r27, 1; @%p5 bra BB0_28; add.s32 %r40, %r8, %r5; mul.lo.s32 %r9, %r40, %r24; mov.u32 %r53, 0; mov.f32 %f48, 0f00000000; setp.eq.s32 %p6, %r6, 0; @%p6 bra BB0_5; setp.eq.s32 %p7, %r6, 1; @%p7 bra BB0_13; setp.eq.s32 %p8, %r6, 2; @%p8 bra BB0_10; setp.ge.s32 %p9, %r8, %r25; setp.ge.s32 %p10, %r4, %r24; mov.u32 %r53, 1; or.pred %p11, %p9, %p10; @%p11 bra BB0_10; add.s32 %r43, %r4, %r9; mul.wide.s32 %rd7, %r43, 4; add.s64 %rd8, %rd1, %rd7; ld.global.nc.f32 %f49, [%rd8]; add.f32 %f72, %f72, %f49; add.f32 %f71, %f71, 0f3F800000; BB0_10: add.s32 %r11, %r53, %r4; setp.ge.s32 %p12, %r11, %r24; setp.ge.s32 %p13, %r8, %r25; or.pred %p14, %p13, %p12; @%p14 bra BB0_12; add.s32 %r44, %r11, %r9; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd1, %rd9; ld.global.nc.f32 %f50, [%rd10]; add.f32 %f72, %f72, %f50; add.f32 %f71, %f71, 0f3F800000; BB0_12: add.s32 %r53, %r53, 1; BB0_13: add.s32 %r14, %r53, %r4; setp.ge.s32 %p15, %r14, %r24; setp.ge.s32 %p16, %r8, %r25; or.pred %p17, %p16, %p15; @%p17 bra BB0_14; add.s32 %r45, %r14, %r9; mul.wide.s32 %rd11, %r45, 4; add.s64 %rd12, %rd1, %rd11; ld.global.nc.f32 %f51, [%rd12]; add.f32 %f68, %f72, %f51; add.f32 %f67, %f71, 0f3F800000; bra.uni BB0_16; BB0_5: mov.f32 %f67, 
%f71; mov.f32 %f68, %f72; mov.f32 %f71, %f48; mov.f32 %f72, %f48; bra.uni BB0_17; BB0_14: mov.f32 %f68, %f72; mov.f32 %f67, %f71; BB0_16: add.s32 %r53, %r53, 1; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_17: setp.lt.u32 %p18, %r27, 4; @%p18 bra BB0_28; add.s32 %r56, %r4, %r53; mad.lo.s32 %r47, %r24, %r40, %r56; mul.wide.s32 %rd13, %r47, 4; add.s64 %rd17, %rd1, %rd13; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_19: setp.ge.s32 %p19, %r56, %r24; setp.ge.s32 %p20, %r8, %r25; or.pred %p21, %p20, %p19; @%p21 bra BB0_21; ld.global.nc.f32 %f52, [%rd17]; add.f32 %f72, %f72, %f52; add.f32 %f71, %f71, 0f3F800000; BB0_21: add.s32 %r48, %r56, 1; setp.ge.s32 %p22, %r48, %r24; or.pred %p24, %p20, %p22; @%p24 bra BB0_23; ld.global.nc.f32 %f53, [%rd17+4]; add.f32 %f72, %f72, %f53; add.f32 %f71, %f71, 0f3F800000; BB0_23: add.s32 %r49, %r56, 2; setp.ge.s32 %p25, %r49, %r24; or.pred %p27, %p20, %p25; @%p27 bra BB0_25; ld.global.nc.f32 %f54, [%rd17+8]; add.f32 %f72, %f72, %f54; add.f32 %f71, %f71, 0f3F800000; BB0_25: add.s32 %r50, %r56, 3; setp.ge.s32 %p28, %r50, %r24; or.pred %p30, %p20, %p28; @%p30 bra BB0_27; ld.global.nc.f32 %f55, [%rd17+12]; add.f32 %f72, %f72, %f55; add.f32 %f71, %f71, 0f3F800000; BB0_27: add.s32 %r56, %r56, 4; add.s32 %r53, %r53, 4; setp.lt.s32 %p31, %r53, %r27; add.s64 %rd17, %rd17, 16; @%p31 bra BB0_19; BB0_28: add.s32 %r52, %r52, 1; setp.lt.s32 %p32, %r52, %r28; @%p32 bra BB0_3; BB0_29: cvta.to.global.u64 %rd14, %rd5; mad.lo.s32 %r51, %r2, %r23, %r1; mul.wide.s32 %rd15, %r51, 4; add.s64 %rd16, %rd14, %rd15; div.rn.f32 %f56, %f72, %f71; st.global.f32 [%rd16], %f56; BB0_30: ret; } ` resize_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl resize .visible .entry resize( .param .u64 resize_param_0, .param .u32 resize_param_1, .param .u32 resize_param_2, .param .u32 resize_param_3, .param .u64 resize_param_4, .param .u32 resize_param_5, .param .u32 resize_param_6, .param .u32 resize_param_7, .param .u32 resize_param_8, .param .u32 resize_param_9, 
.param .u32 resize_param_10 ) { .reg .pred %p<33>; .reg .f32 %f<85>; .reg .b32 %r<58>; .reg .b64 %rd<18>; ld.param.u64 %rd5, [resize_param_0]; ld.param.u32 %r23, [resize_param_1]; ld.param.u32 %r29, [resize_param_2]; ld.param.u64 %rd6, [resize_param_4]; ld.param.u32 %r24, [resize_param_5]; ld.param.u32 %r25, [resize_param_6]; ld.param.u32 %r26, [resize_param_8]; ld.param.u32 %r27, [resize_param_9]; ld.param.u32 %r28, [resize_param_10]; cvta.to.global.u64 %rd1, %rd6; mov.u32 %r30, %ntid.x; mov.u32 %r31, %ctaid.x; mov.u32 %r32, %tid.x; mad.lo.s32 %r1, %r30, %r31, %r32; mov.u32 %r33, %ntid.y; mov.u32 %r34, %ctaid.y; mov.u32 %r35, %tid.y; mad.lo.s32 %r2, %r33, %r34, %r35; setp.ge.s32 %p1, %r1, %r23; setp.ge.s32 %p2, %r2, %r29; or.pred %p3, %p1, %p2; @%p3 bra BB0_30; mov.f32 %f71, 0f00000000; setp.lt.s32 %p4, %r28, 1; mov.f32 %f72, %f71; @%p4 bra BB0_29; mul.lo.s32 %r3, %r2, %r28; mul.lo.s32 %r4, %r1, %r27; mul.lo.s32 %r5, %r26, %r25; and.b32 %r6, %r27, 3; mov.f32 %f71, 0f00000000; mov.u32 %r52, 0; mov.f32 %f72, %f71; BB0_3: add.s32 %r8, %r52, %r3; setp.lt.s32 %p5, %r27, 1; @%p5 bra BB0_28; add.s32 %r40, %r8, %r5; mul.lo.s32 %r9, %r40, %r24; mov.u32 %r53, 0; mov.f32 %f48, 0f00000000; setp.eq.s32 %p6, %r6, 0; @%p6 bra BB0_5; setp.eq.s32 %p7, %r6, 1; @%p7 bra BB0_13; setp.eq.s32 %p8, %r6, 2; @%p8 bra BB0_10; setp.ge.s32 %p9, %r8, %r25; setp.ge.s32 %p10, %r4, %r24; mov.u32 %r53, 1; or.pred %p11, %p9, %p10; @%p11 bra BB0_10; add.s32 %r43, %r4, %r9; mul.wide.s32 %rd7, %r43, 4; add.s64 %rd8, %rd1, %rd7; ld.global.nc.f32 %f49, [%rd8]; add.f32 %f72, %f72, %f49; add.f32 %f71, %f71, 0f3F800000; BB0_10: add.s32 %r11, %r53, %r4; setp.ge.s32 %p12, %r11, %r24; setp.ge.s32 %p13, %r8, %r25; or.pred %p14, %p13, %p12; @%p14 bra BB0_12; add.s32 %r44, %r11, %r9; mul.wide.s32 %rd9, %r44, 4; add.s64 %rd10, %rd1, %rd9; ld.global.nc.f32 %f50, [%rd10]; add.f32 %f72, %f72, %f50; add.f32 %f71, %f71, 0f3F800000; BB0_12: add.s32 %r53, %r53, 1; BB0_13: add.s32 %r14, %r53, %r4; setp.ge.s32 %p15, 
%r14, %r24; setp.ge.s32 %p16, %r8, %r25; or.pred %p17, %p16, %p15; @%p17 bra BB0_14; add.s32 %r45, %r14, %r9; mul.wide.s32 %rd11, %r45, 4; add.s64 %rd12, %rd1, %rd11; ld.global.nc.f32 %f51, [%rd12]; add.f32 %f68, %f72, %f51; add.f32 %f67, %f71, 0f3F800000; bra.uni BB0_16; BB0_5: mov.f32 %f67, %f71; mov.f32 %f68, %f72; mov.f32 %f71, %f48; mov.f32 %f72, %f48; bra.uni BB0_17; BB0_14: mov.f32 %f68, %f72; mov.f32 %f67, %f71; BB0_16: add.s32 %r53, %r53, 1; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_17: setp.lt.u32 %p18, %r27, 4; @%p18 bra BB0_28; add.s32 %r56, %r4, %r53; mad.lo.s32 %r47, %r24, %r40, %r56; mul.wide.s32 %rd13, %r47, 4; add.s64 %rd17, %rd1, %rd13; mov.f32 %f71, %f67; mov.f32 %f72, %f68; BB0_19: setp.ge.s32 %p19, %r56, %r24; setp.ge.s32 %p20, %r8, %r25; or.pred %p21, %p20, %p19; @%p21 bra BB0_21; ld.global.nc.f32 %f52, [%rd17]; add.f32 %f72, %f72, %f52; add.f32 %f71, %f71, 0f3F800000; BB0_21: add.s32 %r48, %r56, 1; setp.ge.s32 %p22, %r48, %r24; or.pred %p24, %p20, %p22; @%p24 bra BB0_23; ld.global.nc.f32 %f53, [%rd17+4]; add.f32 %f72, %f72, %f53; add.f32 %f71, %f71, 0f3F800000; BB0_23: add.s32 %r49, %r56, 2; setp.ge.s32 %p25, %r49, %r24; or.pred %p27, %p20, %p25; @%p27 bra BB0_25; ld.global.nc.f32 %f54, [%rd17+8]; add.f32 %f72, %f72, %f54; add.f32 %f71, %f71, 0f3F800000; BB0_25: add.s32 %r50, %r56, 3; setp.ge.s32 %p28, %r50, %r24; or.pred %p30, %p20, %p28; @%p30 bra BB0_27; ld.global.nc.f32 %f55, [%rd17+12]; add.f32 %f72, %f72, %f55; add.f32 %f71, %f71, 0f3F800000; BB0_27: add.s32 %r56, %r56, 4; add.s32 %r53, %r53, 4; setp.lt.s32 %p31, %r53, %r27; add.s64 %rd17, %rd17, 16; @%p31 bra BB0_19; BB0_28: add.s32 %r52, %r52, 1; setp.lt.s32 %p32, %r52, %r28; @%p32 bra BB0_3; BB0_29: cvta.to.global.u64 %rd14, %rd5; mad.lo.s32 %r51, %r2, %r23, %r1; mul.wide.s32 %rd15, %r51, 4; add.s64 %rd16, %rd14, %rd15; div.rn.f32 %f56, %f72, %f71; st.global.f32 [%rd16], %f56; BB0_30: ret; } ` ) 
mumax3-3.10/cuda/shift.go000066400000000000000000000027441371432437400152460ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/data" "github.com/mumax/3/util" ) // shift dst by shx cells (positive or negative) along X-axis. // new edge value is clampL at left edge or clampR at right edge. func ShiftX(dst, src *data.Slice, shiftX int, clampL, clampR float32) { util.Argument(dst.NComp() == 1 && src.NComp() == 1) util.Assert(dst.Len() == src.Len()) N := dst.Size() cfg := make3DConf(N) k_shiftx_async(dst.DevPtr(0), src.DevPtr(0), N[X], N[Y], N[Z], shiftX, clampL, clampR, cfg) } func ShiftY(dst, src *data.Slice, shiftY int, clampL, clampR float32) { util.Argument(dst.NComp() == 1 && src.NComp() == 1) util.Assert(dst.Len() == src.Len()) N := dst.Size() cfg := make3DConf(N) k_shifty_async(dst.DevPtr(0), src.DevPtr(0), N[X], N[Y], N[Z], shiftY, clampL, clampR, cfg) } func ShiftZ(dst, src *data.Slice, shiftZ int, clampL, clampR float32) { util.Argument(dst.NComp() == 1 && src.NComp() == 1) util.Assert(dst.Len() == src.Len()) N := dst.Size() cfg := make3DConf(N) k_shiftz_async(dst.DevPtr(0), src.DevPtr(0), N[X], N[Y], N[Z], shiftZ, clampL, clampR, cfg) } // Like Shift, but for bytes func ShiftBytes(dst, src *Bytes, m *data.Mesh, shiftX int, clamp byte) { N := m.Size() cfg := make3DConf(N) k_shiftbytes_async(dst.Ptr, src.Ptr, N[X], N[Y], N[Z], shiftX, clamp, cfg) } func ShiftBytesY(dst, src *Bytes, m *data.Mesh, shiftY int, clamp byte) { N := m.Size() cfg := make3DConf(N) k_shiftbytesy_async(dst.Ptr, src.Ptr, N[X], N[Y], N[Z], shiftY, clamp, cfg) } mumax3-3.10/cuda/shiftbytes.cu000066400000000000000000000013671371432437400163170ustar00rootroot00000000000000#include #include "stencil.h" // shift dst by shx cells (positive or negative) along X-axis. // new edge value is clampL at left edge or clampR at right edge. 
// Byte-valued shift along the X axis.
//
// Each thread handles one cell (ix, iy, iz) of an Nx x Ny x Nz grid and
// writes dst(ix, iy, iz) = src(ix - shx, iy, iz). shx may be positive or
// negative. When the source index ix - shx falls outside [0, Nx), the cell
// receives `clamp` instead.
//
// idx() is the linear-index helper pulled in from stencil.h.
extern "C" __global__ void
shiftbytes(uint8_t* __restrict__  dst, uint8_t* __restrict__  src,
           int Nx,  int Ny,  int Nz, int shx, uint8_t clamp)
{
    const int ix = blockIdx.x * blockDim.x + threadIdx.x;
    const int iy = blockIdx.y * blockDim.y + threadIdx.y;
    const int iz = blockIdx.z * blockDim.z + threadIdx.z;

    // Threads that fall outside the grid have nothing to write.
    if (ix >= Nx || iy >= Ny || iz >= Nz) {
        return;
    }

    // Source column for this cell; only read src when it lies inside the grid.
    const int srcX = ix - shx;
    const bool inside = (srcX >= 0) && (srcX < Nx);

    dst[idx(ix, iy, iz)] = inside ? src[idx(srcX, iy, iz)] : clamp;
}
func k_shiftbytes_async(dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, Nz int, shx int, clamp byte, cfg *config) { if Synchronous { // debug Sync() timer.Start("shiftbytes") } shiftbytes_args.Lock() defer shiftbytes_args.Unlock() if shiftbytes_code == 0 { shiftbytes_code = fatbinLoad(shiftbytes_map, "shiftbytes") } shiftbytes_args.arg_dst = dst shiftbytes_args.arg_src = src shiftbytes_args.arg_Nx = Nx shiftbytes_args.arg_Ny = Ny shiftbytes_args.arg_Nz = Nz shiftbytes_args.arg_shx = shx shiftbytes_args.arg_clamp = clamp args := shiftbytes_args.argptr[:] cu.LaunchKernel(shiftbytes_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("shiftbytes") } } // maps compute capability on PTX code for shiftbytes kernel. var shiftbytes_map = map[int]string{0: "", 30: shiftbytes_ptx_30, 32: shiftbytes_ptx_32, 35: shiftbytes_ptx_35, 37: shiftbytes_ptx_37, 50: shiftbytes_ptx_50, 52: shiftbytes_ptx_52, 53: shiftbytes_ptx_53, 60: shiftbytes_ptx_60, 61: shiftbytes_ptx_61, 62: shiftbytes_ptx_62, 70: shiftbytes_ptx_70, 72: shiftbytes_ptx_72, 75: shiftbytes_ptx_75} // shiftbytes PTX code for various compute capabilities. 
const ( shiftbytes_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; ld.param.u8 %rs4, [shiftbytes_param_6]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 %rd6, %rd1; add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd7, %r21; add.s64 %rd8, %rd6, %rd7; st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` shiftbytes_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg 
.b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; ld.param.u8 %rs4, [shiftbytes_param_6]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.nc.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 %rd6, %rd1; add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd7, %r21; add.s64 %rd8, %rd6, %rd7; st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` shiftbytes_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; ld.param.u8 %rs4, [shiftbytes_param_6]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; 
mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.nc.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 %rd6, %rd1; add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd7, %r21; add.s64 %rd8, %rd6, %rd7; st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` shiftbytes_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; ld.param.u8 %rs4, [shiftbytes_param_6]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; 
sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.nc.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 %rd6, %rd1; add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd7, %r21; add.s64 %rd8, %rd6, %rd7; st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` shiftbytes_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; ld.param.u8 %rs4, [shiftbytes_param_6]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.nc.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 %rd6, %rd1; add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd7, 
%r21; add.s64 %rd8, %rd6, %rd7; st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` shiftbytes_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; ld.param.u8 %rs4, [shiftbytes_param_6]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.nc.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 %rd6, %rd1; add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd7, %r21; add.s64 %rd8, %rd6, %rd7; st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` shiftbytes_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param 
.u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; ld.param.u8 %rs4, [shiftbytes_param_6]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.nc.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 %rd6, %rd1; add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd7, %r21; add.s64 %rd8, %rd6, %rd7; st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` shiftbytes_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; ld.param.u8 %rs4, [shiftbytes_param_6]; mov.u32 
%r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.nc.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 %rd6, %rd1; add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd7, %r21; add.s64 %rd8, %rd6, %rd7; st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` shiftbytes_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; ld.param.u8 %rs4, [shiftbytes_param_6]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 
%p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.nc.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 %rd6, %rd1; add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd7, %r21; add.s64 %rd8, %rd6, %rd7; st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` shiftbytes_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; ld.param.u8 %rs4, [shiftbytes_param_6]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.nc.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 
%rd6, %rd1; add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd7, %r21; add.s64 %rd8, %rd6, %rd7; st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` shiftbytes_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; ld.param.u8 %rs4, [shiftbytes_param_6]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.nc.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 %rd6, %rd1; add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd7, %r21; add.s64 %rd8, %rd6, %rd7; st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` shiftbytes_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 
shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, [shiftbytes_param_5]; ld.param.u8 %rs4, [shiftbytes_param_6]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.nc.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 %rd6, %rd1; add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd7, %r21; add.s64 %rd8, %rd6, %rd7; st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` shiftbytes_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl shiftbytes .visible .entry shiftbytes( .param .u64 shiftbytes_param_0, .param .u64 shiftbytes_param_1, .param .u32 shiftbytes_param_2, .param .u32 shiftbytes_param_3, .param .u32 shiftbytes_param_4, .param .u32 shiftbytes_param_5, .param .u8 shiftbytes_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytes_param_0]; ld.param.u64 %rd2, [shiftbytes_param_1]; ld.param.u32 %r6, [shiftbytes_param_2]; ld.param.u32 %r7, [shiftbytes_param_3]; ld.param.u32 %r9, [shiftbytes_param_4]; ld.param.u32 %r8, 
[shiftbytes_param_5]; ld.param.u8 %rs4, [shiftbytes_param_6]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; sub.s32 %r4, %r1, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r6; or.pred %p8, %p6, %p7; mad.lo.s32 %r19, %r3, %r7, %r2; mul.lo.s32 %r5, %r19, %r6; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r20, %r5, %r4; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.nc.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 %rd6, %rd1; add.s32 %r21, %r5, %r1; cvt.s64.s32 %rd7, %r21; add.s64 %rd8, %rd6, %rd7; st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` ) mumax3-3.10/cuda/shiftbytesy.cu000066400000000000000000000012671371432437400165070ustar00rootroot00000000000000#include #include "stencil.h" // shift dst by shy cells (positive or negative) along Y-axis. extern "C" __global__ void shiftbytesy(uint8_t* __restrict__ dst, uint8_t* __restrict__ src, int Nx, int Ny, int Nz, int shy, uint8_t clamp) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int iz = blockIdx.z * blockDim.z + threadIdx.z; if(ix < Nx && iy < Ny && iz < Nz) { int iy2 = iy-shy; uint8_t newval; if (iy2 < 0 || iy2 >= Ny) { newval = clamp; } else { newval = src[idx(ix, iy2, iz)]; } dst[idx(ix, iy, iz)] = newval; } } mumax3-3.10/cuda/shiftbytesy_wrapper.go000066400000000000000000000617401371432437400202470ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for shiftbytesy kernel var shiftbytesy_code cu.Function // Stores the arguments for shiftbytesy kernel invocation type shiftbytesy_args_t struct { arg_dst unsafe.Pointer arg_src unsafe.Pointer arg_Nx int arg_Ny int arg_Nz int arg_shy int arg_clamp byte argptr [7]unsafe.Pointer sync.Mutex } // Stores the arguments for shiftbytesy kernel invocation var shiftbytesy_args shiftbytesy_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. shiftbytesy_args.argptr[0] = unsafe.Pointer(&shiftbytesy_args.arg_dst) shiftbytesy_args.argptr[1] = unsafe.Pointer(&shiftbytesy_args.arg_src) shiftbytesy_args.argptr[2] = unsafe.Pointer(&shiftbytesy_args.arg_Nx) shiftbytesy_args.argptr[3] = unsafe.Pointer(&shiftbytesy_args.arg_Ny) shiftbytesy_args.argptr[4] = unsafe.Pointer(&shiftbytesy_args.arg_Nz) shiftbytesy_args.argptr[5] = unsafe.Pointer(&shiftbytesy_args.arg_shy) shiftbytesy_args.argptr[6] = unsafe.Pointer(&shiftbytesy_args.arg_clamp) } // Wrapper for shiftbytesy CUDA kernel, asynchronous. func k_shiftbytesy_async(dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, Nz int, shy int, clamp byte, cfg *config) { if Synchronous { // debug Sync() timer.Start("shiftbytesy") } shiftbytesy_args.Lock() defer shiftbytesy_args.Unlock() if shiftbytesy_code == 0 { shiftbytesy_code = fatbinLoad(shiftbytesy_map, "shiftbytesy") } shiftbytesy_args.arg_dst = dst shiftbytesy_args.arg_src = src shiftbytesy_args.arg_Nx = Nx shiftbytesy_args.arg_Ny = Ny shiftbytesy_args.arg_Nz = Nz shiftbytesy_args.arg_shy = shy shiftbytesy_args.arg_clamp = clamp args := shiftbytesy_args.argptr[:] cu.LaunchKernel(shiftbytesy_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("shiftbytesy") } } // maps compute capability on PTX code for shiftbytesy kernel. 
var shiftbytesy_map = map[int]string{0: "", 30: shiftbytesy_ptx_30, 32: shiftbytesy_ptx_32, 35: shiftbytesy_ptx_35, 37: shiftbytesy_ptx_37, 50: shiftbytesy_ptx_50, 52: shiftbytesy_ptx_52, 53: shiftbytesy_ptx_53, 60: shiftbytesy_ptx_60, 61: shiftbytesy_ptx_61, 62: shiftbytesy_ptx_62, 70: shiftbytesy_ptx_70, 72: shiftbytesy_ptx_72, 75: shiftbytesy_ptx_75} // shiftbytesy PTX code for various compute capabilities. const ( shiftbytesy_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; ld.param.u8 %rs4, [shiftbytesy_param_6]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, %r3, %r7; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r19, %r5, %r4; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 %rd6, %rd1; add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 
%rd7, %r22; add.s64 %rd8, %rd6, %rd7; st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` shiftbytesy_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; ld.param.u8 %rs4, [shiftbytesy_param_6]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, %r3, %r7; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r19, %r5, %r4; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.nc.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 %rd6, %rd1; add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 %rd7, %r22; add.s64 %rd8, %rd6, %rd7; st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` shiftbytesy_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param 
.u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; ld.param.u8 %rs4, [shiftbytesy_param_6]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, %r3, %r7; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r19, %r5, %r4; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.nc.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 %rd6, %rd1; add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 %rd7, %r22; add.s64 %rd8, %rd6, %rd7; st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` shiftbytesy_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; 
ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; ld.param.u8 %rs4, [shiftbytesy_param_6]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, %r3, %r7; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r19, %r5, %r4; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.nc.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 %rd6, %rd1; add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 %rd7, %r22; add.s64 %rd8, %rd6, %rd7; st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` shiftbytesy_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; ld.param.u8 %rs4, [shiftbytesy_param_6]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 
%r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, %r3, %r7; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r19, %r5, %r4; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.nc.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 %rd6, %rd1; add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 %rd7, %r22; add.s64 %rd8, %rd6, %rd7; st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` shiftbytesy_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; ld.param.u8 %rs4, [shiftbytesy_param_6]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; 
mul.lo.s32 %r5, %r3, %r7; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r19, %r5, %r4; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.nc.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 %rd6, %rd1; add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 %rd7, %r22; add.s64 %rd8, %rd6, %rd7; st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` shiftbytesy_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; ld.param.u8 %rs4, [shiftbytesy_param_6]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, %r3, %r7; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r19, %r5, %r4; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.nc.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 %rd6, %rd1; add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 %rd7, %r22; add.s64 
%rd8, %rd6, %rd7; st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` shiftbytesy_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; ld.param.u8 %rs4, [shiftbytesy_param_6]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, %r3, %r7; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r19, %r5, %r4; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.nc.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 %rd6, %rd1; add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 %rd7, %r22; add.s64 %rd8, %rd6, %rd7; st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` shiftbytesy_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 
shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; ld.param.u8 %rs4, [shiftbytesy_param_6]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, %r3, %r7; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r19, %r5, %r4; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.nc.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 %rd6, %rd1; add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 %rd7, %r22; add.s64 %rd8, %rd6, %rd7; st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` shiftbytesy_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 
%r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; ld.param.u8 %rs4, [shiftbytesy_param_6]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, %r3, %r7; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r19, %r5, %r4; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.nc.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 %rd6, %rd1; add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 %rd7, %r22; add.s64 %rd8, %rd6, %rd7; st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` shiftbytesy_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; ld.param.u8 %rs4, [shiftbytesy_param_6]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; 
mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, %r3, %r7; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r19, %r5, %r4; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.nc.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 %rd6, %rd1; add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 %rd7, %r22; add.s64 %rd8, %rd6, %rd7; st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` shiftbytesy_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; ld.param.u8 %rs4, [shiftbytesy_param_6]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, 
%r3, %r7; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r19, %r5, %r4; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.nc.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 %rd6, %rd1; add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 %rd7, %r22; add.s64 %rd8, %rd6, %rd7; st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` shiftbytesy_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl shiftbytesy .visible .entry shiftbytesy( .param .u64 shiftbytesy_param_0, .param .u64 shiftbytesy_param_1, .param .u32 shiftbytesy_param_2, .param .u32 shiftbytesy_param_3, .param .u32 shiftbytesy_param_4, .param .u32 shiftbytesy_param_5, .param .u8 shiftbytesy_param_6 ) { .reg .pred %p<9>; .reg .b16 %rs<5>; .reg .b32 %r<23>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftbytesy_param_0]; ld.param.u64 %rd2, [shiftbytesy_param_1]; ld.param.u32 %r6, [shiftbytesy_param_2]; ld.param.u32 %r7, [shiftbytesy_param_3]; ld.param.u32 %r9, [shiftbytesy_param_4]; ld.param.u32 %r8, [shiftbytesy_param_5]; ld.param.u8 %rs4, [shiftbytesy_param_6]; mov.u32 %r10, %ntid.x; mov.u32 %r11, %ctaid.x; mov.u32 %r12, %tid.x; mad.lo.s32 %r1, %r10, %r11, %r12; mov.u32 %r13, %ntid.y; mov.u32 %r14, %ctaid.y; mov.u32 %r15, %tid.y; mad.lo.s32 %r2, %r13, %r14, %r15; mov.u32 %r16, %ntid.z; mov.u32 %r17, %ctaid.z; mov.u32 %r18, %tid.z; mad.lo.s32 %r3, %r16, %r17, %r18; setp.ge.s32 %p1, %r1, %r6; setp.ge.s32 %p2, %r2, %r7; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r9; or.pred %p5, %p3, %p4; @%p5 bra BB0_4; sub.s32 %r4, %r2, %r8; setp.lt.s32 %p6, %r4, 0; setp.ge.s32 %p7, %r4, %r7; or.pred %p8, %p6, %p7; mul.lo.s32 %r5, %r3, %r7; @%p8 bra BB0_3; cvta.to.global.u64 %rd3, %rd2; add.s32 %r19, %r5, %r4; mad.lo.s32 %r20, %r19, %r6, %r1; cvt.s64.s32 %rd4, %r20; add.s64 %rd5, %rd3, %rd4; ld.global.nc.u8 %rs4, [%rd5]; BB0_3: cvta.to.global.u64 %rd6, %rd1; add.s32 %r21, %r5, %r2; mad.lo.s32 %r22, %r21, %r6, %r1; cvt.s64.s32 %rd7, %r22; add.s64 %rd8, %rd6, %rd7; 
st.global.u8 [%rd8], %rs4; BB0_4: ret; } ` ) mumax3-3.10/cuda/shiftx.cu000066400000000000000000000014231371432437400154310ustar00rootroot00000000000000#include "stencil.h" // shift dst by shx cells (positive or negative) along X-axis. // new edge value is clampL at left edge or clampR at right edge. extern "C" __global__ void shiftx(float* __restrict__ dst, float* __restrict__ src, int Nx, int Ny, int Nz, int shx, float clampL, float clampR) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int iz = blockIdx.z * blockDim.z + threadIdx.z; if(ix < Nx && iy < Ny && iz < Nz) { int ix2 = ix-shx; float newval; if (ix2 < 0) { newval = clampL; } else if (ix2 >= Nx) { newval = clampR; } else { newval = src[idx(ix2, iy, iz)]; } dst[idx(ix, iy, iz)] = newval; } } mumax3-3.10/cuda/shiftx_wrapper.go000066400000000000000000000613531371432437400171770ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for shiftx kernel var shiftx_code cu.Function // Stores the arguments for shiftx kernel invocation type shiftx_args_t struct { arg_dst unsafe.Pointer arg_src unsafe.Pointer arg_Nx int arg_Ny int arg_Nz int arg_shx int arg_clampL float32 arg_clampR float32 argptr [8]unsafe.Pointer sync.Mutex } // Stores the arguments for shiftx kernel invocation var shiftx_args shiftx_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
shiftx_args.argptr[0] = unsafe.Pointer(&shiftx_args.arg_dst) shiftx_args.argptr[1] = unsafe.Pointer(&shiftx_args.arg_src) shiftx_args.argptr[2] = unsafe.Pointer(&shiftx_args.arg_Nx) shiftx_args.argptr[3] = unsafe.Pointer(&shiftx_args.arg_Ny) shiftx_args.argptr[4] = unsafe.Pointer(&shiftx_args.arg_Nz) shiftx_args.argptr[5] = unsafe.Pointer(&shiftx_args.arg_shx) shiftx_args.argptr[6] = unsafe.Pointer(&shiftx_args.arg_clampL) shiftx_args.argptr[7] = unsafe.Pointer(&shiftx_args.arg_clampR) } // Wrapper for shiftx CUDA kernel, asynchronous. func k_shiftx_async(dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, Nz int, shx int, clampL float32, clampR float32, cfg *config) { if Synchronous { // debug Sync() timer.Start("shiftx") } shiftx_args.Lock() defer shiftx_args.Unlock() if shiftx_code == 0 { shiftx_code = fatbinLoad(shiftx_map, "shiftx") } shiftx_args.arg_dst = dst shiftx_args.arg_src = src shiftx_args.arg_Nx = Nx shiftx_args.arg_Ny = Ny shiftx_args.arg_Nz = Nz shiftx_args.arg_shx = shx shiftx_args.arg_clampL = clampL shiftx_args.arg_clampR = clampR args := shiftx_args.argptr[:] cu.LaunchKernel(shiftx_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("shiftx") } } // maps compute capability on PTX code for shiftx kernel. var shiftx_map = map[int]string{0: "", 30: shiftx_ptx_30, 32: shiftx_ptx_32, 35: shiftx_ptx_35, 37: shiftx_ptx_37, 50: shiftx_ptx_50, 52: shiftx_ptx_52, 53: shiftx_ptx_53, 60: shiftx_ptx_60, 61: shiftx_ptx_61, 62: shiftx_ptx_62, 70: shiftx_ptx_70, 72: shiftx_ptx_72, 75: shiftx_ptx_75} // shiftx PTX code for various compute capabilities. 
const ( shiftx_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param .f32 shiftx_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shiftx_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param .f32 shiftx_param_7 ) { .reg .pred %p<8>; 
.reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shiftx_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param .f32 shiftx_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 
%r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shiftx_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param .f32 shiftx_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; 
or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shiftx_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param .f32 shiftx_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, 
%rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shiftx_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param .f32 shiftx_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shiftx_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl shiftx .visible .entry shiftx( .param 
.u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param .f32 shiftx_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shiftx_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param .f32 shiftx_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, 
[shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shiftx_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param .f32 shiftx_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, 
%ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shiftx_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param .f32 shiftx_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r1, 
%r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shiftx_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param .f32 shiftx_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, 
%r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shiftx_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, .param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param .f32 shiftx_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shiftx_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl shiftx .visible .entry shiftx( .param .u64 shiftx_param_0, .param .u64 shiftx_param_1, .param .u32 shiftx_param_2, .param .u32 shiftx_param_3, 
.param .u32 shiftx_param_4, .param .u32 shiftx_param_5, .param .f32 shiftx_param_6, .param .f32 shiftx_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftx_param_0]; ld.param.u64 %rd2, [shiftx_param_1]; ld.param.u32 %r5, [shiftx_param_2]; ld.param.u32 %r6, [shiftx_param_3]; ld.param.u32 %r8, [shiftx_param_4]; ld.param.u32 %r7, [shiftx_param_5]; ld.param.f32 %f5, [shiftx_param_6]; ld.param.f32 %f4, [shiftx_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r1, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r5; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r4; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` ) mumax3-3.10/cuda/shifty.cu000066400000000000000000000014231371432437400154320ustar00rootroot00000000000000#include "stencil.h" // shift dst by shy cells (positive or negative) along Y-axis. // new edge value is clampL at left edge or clampR at right edge. 
extern "C" __global__ void shifty(float* __restrict__ dst, float* __restrict__ src, int Nx, int Ny, int Nz, int shy, float clampL, float clampR) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int iz = blockIdx.z * blockDim.z + threadIdx.z; if(ix < Nx && iy < Ny && iz < Nz) { int iy2 = iy-shy; float newval; if (iy2 < 0) { newval = clampL; } else if (iy2 >= Ny) { newval = clampR; } else { newval = src[idx(ix, iy2, iz)]; } dst[idx(ix, iy, iz)] = newval; } } mumax3-3.10/cuda/shifty_wrapper.go000066400000000000000000000613531371432437400172000ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for shifty kernel var shifty_code cu.Function // Stores the arguments for shifty kernel invocation type shifty_args_t struct { arg_dst unsafe.Pointer arg_src unsafe.Pointer arg_Nx int arg_Ny int arg_Nz int arg_shy int arg_clampL float32 arg_clampR float32 argptr [8]unsafe.Pointer sync.Mutex } // Stores the arguments for shifty kernel invocation var shifty_args shifty_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. shifty_args.argptr[0] = unsafe.Pointer(&shifty_args.arg_dst) shifty_args.argptr[1] = unsafe.Pointer(&shifty_args.arg_src) shifty_args.argptr[2] = unsafe.Pointer(&shifty_args.arg_Nx) shifty_args.argptr[3] = unsafe.Pointer(&shifty_args.arg_Ny) shifty_args.argptr[4] = unsafe.Pointer(&shifty_args.arg_Nz) shifty_args.argptr[5] = unsafe.Pointer(&shifty_args.arg_shy) shifty_args.argptr[6] = unsafe.Pointer(&shifty_args.arg_clampL) shifty_args.argptr[7] = unsafe.Pointer(&shifty_args.arg_clampR) } // Wrapper for shifty CUDA kernel, asynchronous. 
func k_shifty_async(dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, Nz int, shy int, clampL float32, clampR float32, cfg *config) { if Synchronous { // debug Sync() timer.Start("shifty") } shifty_args.Lock() defer shifty_args.Unlock() if shifty_code == 0 { shifty_code = fatbinLoad(shifty_map, "shifty") } shifty_args.arg_dst = dst shifty_args.arg_src = src shifty_args.arg_Nx = Nx shifty_args.arg_Ny = Ny shifty_args.arg_Nz = Nz shifty_args.arg_shy = shy shifty_args.arg_clampL = clampL shifty_args.arg_clampR = clampR args := shifty_args.argptr[:] cu.LaunchKernel(shifty_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("shifty") } } // maps compute capability on PTX code for shifty kernel. var shifty_map = map[int]string{0: "", 30: shifty_ptx_30, 32: shifty_ptx_32, 35: shifty_ptx_35, 37: shifty_ptx_37, 50: shifty_ptx_50, 52: shifty_ptx_52, 53: shifty_ptx_53, 60: shifty_ptx_60, 61: shifty_ptx_61, 62: shifty_ptx_62, 70: shifty_ptx_70, 72: shifty_ptx_72, 75: shifty_ptx_75} // shifty PTX code for various compute capabilities. 
const ( shifty_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param .f32 shifty_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shifty_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param .f32 shifty_param_7 ) { .reg .pred %p<8>; 
.reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shifty_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param .f32 shifty_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 
%r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shifty_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param .f32 shifty_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; 
or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shifty_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param .f32 shifty_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, 
%rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shifty_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param .f32 shifty_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shifty_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl shifty .visible .entry shifty( .param 
.u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param .f32 shifty_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shifty_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param .f32 shifty_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, 
[shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shifty_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param .f32 shifty_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, 
%ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shifty_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param .f32 shifty_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r2, 
%r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shifty_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param .f32 shifty_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, 
%r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shifty_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, .param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param .f32 shifty_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shifty_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl shifty .visible .entry shifty( .param .u64 shifty_param_0, .param .u64 shifty_param_1, .param .u32 shifty_param_2, .param .u32 shifty_param_3, 
.param .u32 shifty_param_4, .param .u32 shifty_param_5, .param .f32 shifty_param_6, .param .f32 shifty_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shifty_param_0]; ld.param.u64 %rd2, [shifty_param_1]; ld.param.u32 %r5, [shifty_param_2]; ld.param.u32 %r6, [shifty_param_3]; ld.param.u32 %r8, [shifty_param_4]; ld.param.u32 %r7, [shifty_param_5]; ld.param.f32 %f5, [shifty_param_6]; ld.param.f32 %f4, [shifty_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r8; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r2, %r7; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r6; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r3, %r6, %r4; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` ) mumax3-3.10/cuda/shiftz.cu000066400000000000000000000014231371432437400154330ustar00rootroot00000000000000#include "stencil.h" // shift dst by shz cells (positive or negative) along Z-axis. // new edge value is clampL at left edge or clampR at right edge. 
// shiftz: every thread fills one cell (ix, iy, iz) of dst with the value that
// sits shz cells earlier along Z in src, i.e. dst(ix, iy, iz) = src(ix, iy, iz-shz).
// If iz-shz is below 0 the cell receives clampL; if it is Nz or larger the cell
// receives clampR. Threads that fall outside the Nx x Ny x Nz box do nothing.
// idx() is not defined in this view; presumably it turns (ix, iy, iz) into a
// linear offset using Nx, Ny, Nz -- confirm against the header that declares it.
extern "C" __global__ void
shiftz(float* __restrict__ dst, float* __restrict__ src, int Nx, int Ny, int Nz, int shz, float clampL, float clampR)
{
    // Cell coordinates handled by this thread.
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    int iz = blockIdx.z * blockDim.z + threadIdx.z;

    // The launch grid may be larger than the data; ignore the excess threads.
    if(ix < Nx && iy < Ny && iz < Nz) {
        int iz2 = iz-shz; // Z index of the source cell
        float newval;
        if (iz2 < 0) {
            newval = clampL; // source lies before the first Z cell
        } else if (iz2 >= Nz) {
            newval = clampR; // source lies past the last Z cell
        } else {
            newval = src[idx(ix, iy, iz2)];
        }
        dst[idx(ix, iy, iz)] = newval;
    }
}
mumax3-3.10/cuda/shiftz_wrapper.go000066400000000000000000000613531371432437400172010ustar00rootroot00000000000000package cuda

/*
 THIS FILE IS AUTO-GENERATED BY CUDA2GO.
 EDITING IS FUTILE.
*/

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// CUDA handle for the shiftz kernel.
// It stays zero until k_shiftz_async loads the kernel on its first call.
var shiftz_code cu.Function

// shiftz_args_t stores the arguments for one shiftz kernel invocation:
// one field per kernel parameter, in the kernel's parameter order, followed by
// the pointer table that is handed to the launch call.
// NOTE(review): the integer fields are Go int while the generated PTX declares
// the matching parameters as .u32; presumably the launch only reads the low
// 4 bytes through argptr (little-endian host) -- confirm.
type shiftz_args_t struct {
	arg_dst    unsafe.Pointer
	arg_src    unsafe.Pointer
	arg_Nx     int
	arg_Ny     int
	arg_Nz     int
	arg_shz    int
	arg_clampL float32
	arg_clampR float32
	argptr     [8]unsafe.Pointer // argptr[i] points at the i-th arg_ field, see init
	sync.Mutex                   // locked by k_shiftz_async while the fields are filled in and the kernel is launched
}

// The single, shared argument record for shiftz kernel invocations.
var shiftz_args shiftz_args_t

func init() {
	// CUDA driver kernel call wants pointers to arguments, set them up once.
	shiftz_args.argptr[0] = unsafe.Pointer(&shiftz_args.arg_dst)
	shiftz_args.argptr[1] = unsafe.Pointer(&shiftz_args.arg_src)
	shiftz_args.argptr[2] = unsafe.Pointer(&shiftz_args.arg_Nx)
	shiftz_args.argptr[3] = unsafe.Pointer(&shiftz_args.arg_Ny)
	shiftz_args.argptr[4] = unsafe.Pointer(&shiftz_args.arg_Nz)
	shiftz_args.argptr[5] = unsafe.Pointer(&shiftz_args.arg_shz)
	shiftz_args.argptr[6] = unsafe.Pointer(&shiftz_args.arg_clampL)
	shiftz_args.argptr[7] = unsafe.Pointer(&shiftz_args.arg_clampR)
}

// Wrapper for shiftz CUDA kernel, asynchronous.
func k_shiftz_async(dst unsafe.Pointer, src unsafe.Pointer, Nx int, Ny int, Nz int, shz int, clampL float32, clampR float32, cfg *config) { if Synchronous { // debug Sync() timer.Start("shiftz") } shiftz_args.Lock() defer shiftz_args.Unlock() if shiftz_code == 0 { shiftz_code = fatbinLoad(shiftz_map, "shiftz") } shiftz_args.arg_dst = dst shiftz_args.arg_src = src shiftz_args.arg_Nx = Nx shiftz_args.arg_Ny = Ny shiftz_args.arg_Nz = Nz shiftz_args.arg_shz = shz shiftz_args.arg_clampL = clampL shiftz_args.arg_clampR = clampR args := shiftz_args.argptr[:] cu.LaunchKernel(shiftz_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("shiftz") } } // maps compute capability on PTX code for shiftz kernel. var shiftz_map = map[int]string{0: "", 30: shiftz_ptx_30, 32: shiftz_ptx_32, 35: shiftz_ptx_35, 37: shiftz_ptx_37, 50: shiftz_ptx_50, 52: shiftz_ptx_52, 53: shiftz_ptx_53, 60: shiftz_ptx_60, 61: shiftz_ptx_61, 62: shiftz_ptx_62, 70: shiftz_ptx_70, 72: shiftz_ptx_72, 75: shiftz_ptx_75} // shiftz PTX code for various compute capabilities. 
const ( shiftz_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param .f32 shiftz_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shiftz_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param .f32 shiftz_param_7 ) { .reg .pred %p<8>; 
.reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shiftz_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param .f32 shiftz_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 
%r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shiftz_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param .f32 shiftz_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; 
or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shiftz_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param .f32 shiftz_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, 
%rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shiftz_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param .f32 shiftz_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shiftz_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl shiftz .visible .entry shiftz( .param 
.u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param .f32 shiftz_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shiftz_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param .f32 shiftz_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, 
[shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shiftz_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param .f32 shiftz_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, 
%ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shiftz_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param .f32 shiftz_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r3, 
%r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shiftz_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param .f32 shiftz_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, 
%r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shiftz_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, .param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param .f32 shiftz_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` shiftz_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl shiftz .visible .entry shiftz( .param .u64 shiftz_param_0, .param .u64 shiftz_param_1, .param .u32 shiftz_param_2, .param .u32 shiftz_param_3, 
.param .u32 shiftz_param_4, .param .u32 shiftz_param_5, .param .f32 shiftz_param_6, .param .f32 shiftz_param_7 ) { .reg .pred %p<8>; .reg .f32 %f<6>; .reg .b32 %r<22>; .reg .b64 %rd<9>; ld.param.u64 %rd1, [shiftz_param_0]; ld.param.u64 %rd2, [shiftz_param_1]; ld.param.u32 %r5, [shiftz_param_2]; ld.param.u32 %r6, [shiftz_param_3]; ld.param.u32 %r7, [shiftz_param_4]; ld.param.u32 %r8, [shiftz_param_5]; ld.param.f32 %f5, [shiftz_param_6]; ld.param.f32 %f4, [shiftz_param_7]; mov.u32 %r9, %ntid.x; mov.u32 %r10, %ctaid.x; mov.u32 %r11, %tid.x; mad.lo.s32 %r1, %r9, %r10, %r11; mov.u32 %r12, %ntid.y; mov.u32 %r13, %ctaid.y; mov.u32 %r14, %tid.y; mad.lo.s32 %r2, %r12, %r13, %r14; mov.u32 %r15, %ntid.z; mov.u32 %r16, %ctaid.z; mov.u32 %r17, %tid.z; mad.lo.s32 %r3, %r15, %r16, %r17; setp.ge.s32 %p1, %r1, %r5; setp.ge.s32 %p2, %r2, %r6; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r7; or.pred %p5, %p3, %p4; @%p5 bra BB0_5; sub.s32 %r4, %r3, %r8; setp.lt.s32 %p6, %r4, 0; @%p6 bra BB0_4; setp.ge.s32 %p7, %r4, %r7; mov.f32 %f5, %f4; @%p7 bra BB0_4; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r18, %r4, %r6, %r2; mad.lo.s32 %r19, %r18, %r5, %r1; mul.wide.s32 %rd4, %r19, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f5, [%rd5]; BB0_4: cvta.to.global.u64 %rd6, %rd1; mad.lo.s32 %r20, %r3, %r6, %r2; mad.lo.s32 %r21, %r20, %r5, %r1; mul.wide.s32 %rd7, %r21, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f5; BB0_5: ret; } ` ) mumax3-3.10/cuda/slice.go000066400000000000000000000056721371432437400152330ustar00rootroot00000000000000package cuda import ( "math" "unsafe" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/data" "github.com/mumax/3/timer" "github.com/mumax/3/util" ) // Make a GPU Slice with nComp components each of size length. func NewSlice(nComp int, size [3]int) *data.Slice { return newSlice(nComp, size, MemAlloc, data.GPUMemory) } // Make a GPU Slice with nComp components each of size length. 
//func NewUnifiedSlice(nComp int, m *data.Mesh) *data.Slice { // return newSlice(nComp, m, cu.MemAllocHost, data.UnifiedMemory) //} func newSlice(nComp int, size [3]int, alloc func(int64) unsafe.Pointer, memType int8) *data.Slice { data.EnableGPU(memFree, cu.MemFreeHost, MemCpy, MemCpyDtoH, MemCpyHtoD) length := prod(size) bytes := int64(length) * cu.SIZEOF_FLOAT32 ptrs := make([]unsafe.Pointer, nComp) for c := range ptrs { ptrs[c] = unsafe.Pointer(alloc(bytes)) cu.MemsetD32(cu.DevicePtr(uintptr(ptrs[c])), 0, int64(length)) } return data.SliceFromPtrs(size, memType, ptrs) } // wrappers for data.EnableGPU arguments func memFree(ptr unsafe.Pointer) { cu.MemFree(cu.DevicePtr(uintptr(ptr))) } func MemCpyDtoH(dst, src unsafe.Pointer, bytes int64) { Sync() // sync previous kernels timer.Start("memcpyDtoH") cu.MemcpyDtoH(dst, cu.DevicePtr(uintptr(src)), bytes) Sync() // sync copy timer.Stop("memcpyDtoH") } func MemCpyHtoD(dst, src unsafe.Pointer, bytes int64) { Sync() // sync previous kernels timer.Start("memcpyHtoD") cu.MemcpyHtoD(cu.DevicePtr(uintptr(dst)), src, bytes) Sync() // sync copy timer.Stop("memcpyHtoD") } func MemCpy(dst, src unsafe.Pointer, bytes int64) { Sync() timer.Start("memcpy") cu.MemcpyAsync(cu.DevicePtr(uintptr(dst)), cu.DevicePtr(uintptr(src)), bytes, stream0) Sync() timer.Stop("memcpy") } // Memset sets the Slice's components to the specified values. // To be carefully used on unified slice (need sync) func Memset(s *data.Slice, val ...float32) { if Synchronous { // debug Sync() timer.Start("memset") } util.Argument(len(val) == s.NComp()) for c, v := range val { cu.MemsetD32Async(cu.DevicePtr(uintptr(s.DevPtr(c))), math.Float32bits(v), int64(s.Len()), stream0) } if Synchronous { //debug Sync() timer.Stop("memset") } } // Set all elements of all components to zero. func Zero(s *data.Slice) { Memset(s, make([]float32, s.NComp())...) 
} func SetCell(s *data.Slice, comp int, ix, iy, iz int, value float32) { SetElem(s, comp, s.Index(ix, iy, iz), value) } func SetElem(s *data.Slice, comp int, index int, value float32) { f := value dst := unsafe.Pointer(uintptr(s.DevPtr(comp)) + uintptr(index)*cu.SIZEOF_FLOAT32) MemCpyHtoD(dst, unsafe.Pointer(&f), cu.SIZEOF_FLOAT32) } func GetElem(s *data.Slice, comp int, index int) float32 { var f float32 src := unsafe.Pointer(uintptr(s.DevPtr(comp)) + uintptr(index)*cu.SIZEOF_FLOAT32) MemCpyDtoH(unsafe.Pointer(&f), src, cu.SIZEOF_FLOAT32) return f } func GetCell(s *data.Slice, comp, ix, iy, iz int) float32 { return GetElem(s, comp, s.Index(ix, iy, iz)) } mumax3-3.10/cuda/slice_test.go000066400000000000000000000031751371432437400162660ustar00rootroot00000000000000package cuda import ( "testing" "github.com/mumax/3/data" ) func TestSlice(t *testing.T) { N0, N1, N2 := 2, 4, 8 m := [3]int{N0, N1, N2} N := N0 * N1 * N2 a := NewSlice(3, m) defer a.Free() Memset(a, 1, 2, 3) if a.GPUAccess() == false { t.Fail() } if a.Len() != N { t.Fail() } if a.NComp() != 3 { t.Fail() } b := a.Comp(1) if b.GPUAccess() == false { t.Error("b.GPUAccess", b.GPUAccess()) } if b.Len() != N { t.Error("b.Len", b.Len()) } if b.NComp() != 1 { t.Error("b.NComp", b.NComp()) } if b.Size() != a.Size() { t.Fail() } } func TestCpy(t *testing.T) { N0, N1, N2 := 2, 4, 32 N := N0 * N1 * N2 mesh := [3]int{N0, N1, N2} h1 := make([]float32, N) for i := range h1 { h1[i] = float32(i) } hs := sliceFromList([][]float32{h1}, mesh) d := NewSlice(1, mesh) data.Copy(d, hs) d2 := NewSlice(1, mesh) data.Copy(d2, d) h2 := data.NewSlice(1, mesh) data.Copy(h2, d2) res := h2.Host()[0] for i := range res { if res[i] != h1[i] { t.Fail() } } } func TestSliceFree(t *testing.T) { N0, N1, N2 := 128, 1024, 1024 m := [3]int{N0, N1, N2} N := 17 // not freeing would attempt to allocate 17GB. 
for i := 0; i < N; i++ { a := NewSlice(2, m) a.Free() } a := NewSlice(2, m) a.Free() a.Free() // test double-free } func TestSliceHost(t *testing.T) { N0, N1, N2 := 1, 10, 10 m := [3]int{N0, N1, N2} a := NewSlice(3, m) defer a.Free() b := a.HostCopy().Host() if b[0][0] != 0 || b[1][42] != 0 || b[2][99] != 0 { t.Error("slice not inited to zero") } Memset(a, 1, 2, 3) b = a.HostCopy().Host() if b[0][0] != 1 || b[1][42] != 2 || b[2][99] != 3 { t.Error("slice memset") } } mumax3-3.10/cuda/slonczewski.go000066400000000000000000000015421371432437400164770ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/data" ) // Add Slonczewski ST torque to torque (Tesla). // see slonczewski.cu func AddSlonczewskiTorque2(torque, m *data.Slice, Msat, J, fixedP, alpha, pol, λ, ε_prime MSlice, thickness MSlice, flp float64, mesh *data.Mesh) { N := torque.Len() cfg := make1DConf(N) meshThickness := mesh.WorldSize()[Z] k_addslonczewskitorque2_async( torque.DevPtr(X), torque.DevPtr(Y), torque.DevPtr(Z), m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z), Msat.DevPtr(0), Msat.Mul(0), J.DevPtr(Z), J.Mul(Z), fixedP.DevPtr(X), fixedP.Mul(X), fixedP.DevPtr(Y), fixedP.Mul(Y), fixedP.DevPtr(Z), fixedP.Mul(Z), alpha.DevPtr(0), alpha.Mul(0), pol.DevPtr(0), pol.Mul(0), λ.DevPtr(0), λ.Mul(0), ε_prime.DevPtr(0), ε_prime.Mul(0), thickness.DevPtr(0), thickness.Mul(0), float32(meshThickness), float32(flp), N, cfg) } mumax3-3.10/cuda/slonczewski2.cu000066400000000000000000000054341371432437400165670ustar00rootroot00000000000000// Original implementation by Mykola Dvornik for mumax2 // Modified for mumax3 by Arne Vansteenkiste, 2013, 2016 #include #include "float3.h" #include "constants.h" #include "amul.h" extern "C" __global__ void addslonczewskitorque2(float* __restrict__ tx, float* __restrict__ ty, float* __restrict__ tz, float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz, float* __restrict__ Ms_, float Ms_mul, float* __restrict__ jz_, float jz_mul, float* __restrict__ px_, 
float px_mul, float* __restrict__ py_, float py_mul, float* __restrict__ pz_, float pz_mul, float* __restrict__ alpha_, float alpha_mul, float* __restrict__ pol_, float pol_mul, float* __restrict__ lambda_, float lambda_mul, float* __restrict__ epsPrime_, float epsPrime_mul, float* __restrict__ thickness_, float thickness_mul, float meshThickness, float freeLayerPosition, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { float3 m = make_float3(mx[i], my[i], mz[i]); float J = amul(jz_, jz_mul, i); float3 p = normalized(vmul(px_, py_, pz_, px_mul, py_mul, pz_mul, i)); float Ms = amul(Ms_, Ms_mul, i); float alpha = amul(alpha_, alpha_mul, i); float pol = amul(pol_, pol_mul, i); float lambda = amul(lambda_, lambda_mul, i); float epsilonPrime = amul(epsPrime_, epsPrime_mul, i); float thickness = amul(thickness_, thickness_mul, i); if (thickness == 0.0) { // if thickness is not set, use the thickness of the mesh instead thickness = meshThickness; } thickness *= freeLayerPosition; // switch sign if fixedlayer is at the bottom if (J == 0.0f || Ms == 0.0f) { return; } float beta = (HBAR / QE) * (J / (thickness*Ms) ); float lambda2 = lambda * lambda; float epsilon = pol * lambda2 / ((lambda2 + 1.0f) + (lambda2 - 1.0f) * dot(p, m)); float A = beta * epsilon; float B = beta * epsilonPrime; float gilb = 1.0f / (1.0f + alpha * alpha); float mxpxmFac = gilb * (A + alpha * B); float pxmFac = gilb * (B - alpha * A); float3 pxm = cross(p, m); float3 mxpxm = cross(m, pxm); tx[i] += mxpxmFac * mxpxm.x + pxmFac * pxm.x; ty[i] += mxpxmFac * mxpxm.y + pxmFac * pxm.y; tz[i] += mxpxmFac * mxpxm.z + pxmFac * pxm.z; } } mumax3-3.10/cuda/slonczewski2_wrapper.go000066400000000000000000003337211371432437400203300ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/

import (
	"github.com/mumax/3/cuda/cu"
	"github.com/mumax/3/timer"
	"sync"
	"unsafe"
)

// addslonczewskitorque2_code is the CUDA function handle of the
// addslonczewskitorque2 kernel. It is zero until the first launch loads it.
var addslonczewskitorque2_code cu.Function

// addslonczewskitorque2_args_t keeps one field per kernel parameter, the
// argptr table with the address of each of those fields, and a mutex that
// the launch wrapper holds while it fills in the fields and launches.
type addslonczewskitorque2_args_t struct {
	arg_tx                unsafe.Pointer
	arg_ty                unsafe.Pointer
	arg_tz                unsafe.Pointer
	arg_mx                unsafe.Pointer
	arg_my                unsafe.Pointer
	arg_mz                unsafe.Pointer
	arg_Ms_               unsafe.Pointer
	arg_Ms_mul            float32
	arg_jz_               unsafe.Pointer
	arg_jz_mul            float32
	arg_px_               unsafe.Pointer
	arg_px_mul            float32
	arg_py_               unsafe.Pointer
	arg_py_mul            float32
	arg_pz_               unsafe.Pointer
	arg_pz_mul            float32
	arg_alpha_            unsafe.Pointer
	arg_alpha_mul         float32
	arg_pol_              unsafe.Pointer
	arg_pol_mul           float32
	arg_lambda_           unsafe.Pointer
	arg_lambda_mul        float32
	arg_epsPrime_         unsafe.Pointer
	arg_epsPrime_mul      float32
	arg_thickness_        unsafe.Pointer
	arg_thickness_mul     float32
	arg_meshThickness     float32
	arg_freeLayerPosition float32
	arg_N                 int
	argptr                [29]unsafe.Pointer
	sync.Mutex
}

// addslonczewskitorque2_args is the single, shared argument record used by
// every launch of the addslonczewskitorque2 kernel.
var addslonczewskitorque2_args addslonczewskitorque2_args_t

// init fills argptr once with the addresses of the argument fields, in the
// same order as the kernel's parameter list; the launch call takes pointers
// to the arguments rather than the values themselves.
func init() {
	a := &addslonczewskitorque2_args
	a.argptr = [29]unsafe.Pointer{
		unsafe.Pointer(&a.arg_tx),
		unsafe.Pointer(&a.arg_ty),
		unsafe.Pointer(&a.arg_tz),
		unsafe.Pointer(&a.arg_mx),
		unsafe.Pointer(&a.arg_my),
		unsafe.Pointer(&a.arg_mz),
		unsafe.Pointer(&a.arg_Ms_),
		unsafe.Pointer(&a.arg_Ms_mul),
		unsafe.Pointer(&a.arg_jz_),
		unsafe.Pointer(&a.arg_jz_mul),
		unsafe.Pointer(&a.arg_px_),
		unsafe.Pointer(&a.arg_px_mul),
		unsafe.Pointer(&a.arg_py_),
		unsafe.Pointer(&a.arg_py_mul),
		unsafe.Pointer(&a.arg_pz_),
		unsafe.Pointer(&a.arg_pz_mul),
		unsafe.Pointer(&a.arg_alpha_),
		unsafe.Pointer(&a.arg_alpha_mul),
		unsafe.Pointer(&a.arg_pol_),
		unsafe.Pointer(&a.arg_pol_mul),
		unsafe.Pointer(&a.arg_lambda_),
		unsafe.Pointer(&a.arg_lambda_mul),
		unsafe.Pointer(&a.arg_epsPrime_),
		unsafe.Pointer(&a.arg_epsPrime_mul),
		unsafe.Pointer(&a.arg_thickness_),
		unsafe.Pointer(&a.arg_thickness_mul),
		unsafe.Pointer(&a.arg_meshThickness),
		unsafe.Pointer(&a.arg_freeLayerPosition),
		unsafe.Pointer(&a.arg_N),
	}
}

// k_addslonczewskitorque2_async launches the addslonczewskitorque2 CUDA
// kernel on stream0 with the grid and block sizes taken from cfg.
//
// The shared argument record is locked for the whole call. The kernel code
// is loaded through fatbinLoad the first time the wrapper runs. When the
// package-level Synchronous flag is set (debugging), the call is bracketed
// by Sync() and timed under the name "addslonczewskitorque2".
func k_addslonczewskitorque2_async(
	tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer,
	mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer,
	Ms_ unsafe.Pointer, Ms_mul float32,
	jz_ unsafe.Pointer, jz_mul float32,
	px_ unsafe.Pointer, px_mul float32,
	py_ unsafe.Pointer, py_mul float32,
	pz_ unsafe.Pointer, pz_mul float32,
	alpha_ unsafe.Pointer, alpha_mul float32,
	pol_ unsafe.Pointer, pol_mul float32,
	lambda_ unsafe.Pointer, lambda_mul float32,
	epsPrime_ unsafe.Pointer, epsPrime_mul float32,
	thickness_ unsafe.Pointer, thickness_mul float32,
	meshThickness float32, freeLayerPosition float32,
	N int, cfg *config) {

	if Synchronous { // debug
		Sync()
		timer.Start("addslonczewskitorque2")
	}

	a := &addslonczewskitorque2_args
	a.Lock()
	defer a.Unlock()

	// Load the kernel lazily: the handle is still zero before the first launch.
	if addslonczewskitorque2_code == 0 {
		addslonczewskitorque2_code = fatbinLoad(addslonczewskitorque2_map, "addslonczewskitorque2")
	}

	// Store this call's values in the fields that argptr points at.
	a.arg_tx, a.arg_ty, a.arg_tz = tx, ty, tz
	a.arg_mx, a.arg_my, a.arg_mz = mx, my, mz
	a.arg_Ms_, a.arg_Ms_mul = Ms_, Ms_mul
	a.arg_jz_, a.arg_jz_mul = jz_, jz_mul
	a.arg_px_, a.arg_px_mul = px_, px_mul
	a.arg_py_, a.arg_py_mul = py_, py_mul
	a.arg_pz_, a.arg_pz_mul = pz_, pz_mul
	a.arg_alpha_, a.arg_alpha_mul = alpha_, alpha_mul
	a.arg_pol_, a.arg_pol_mul = pol_, pol_mul
	a.arg_lambda_, a.arg_lambda_mul = lambda_, lambda_mul
	a.arg_epsPrime_, a.arg_epsPrime_mul = epsPrime_, epsPrime_mul
	a.arg_thickness_, a.arg_thickness_mul = thickness_, thickness_mul
	a.arg_meshThickness = meshThickness
	a.arg_freeLayerPosition = freeLayerPosition
	a.arg_N = N

	cu.LaunchKernel(addslonczewskitorque2_code,
		cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z,
		cfg.Block.X, cfg.Block.Y, cfg.Block.Z,
		0, stream0, a.argptr[:])

	if Synchronous { // debug
		Sync()
		timer.Stop("addslonczewskitorque2")
	}
}

// addslonczewskitorque2_map associates a compute capability with the PTX
// source of the addslonczewskitorque2 kernel built for it; key 0 maps to the
// empty string.
var addslonczewskitorque2_map = map[int]string{
	0:  "",
	30: addslonczewskitorque2_ptx_30,
	32: addslonczewskitorque2_ptx_32,
	35: addslonczewskitorque2_ptx_35,
	37: addslonczewskitorque2_ptx_37,
	50: addslonczewskitorque2_ptx_50,
	52: addslonczewskitorque2_ptx_52,
	53: addslonczewskitorque2_ptx_53,
	60: addslonczewskitorque2_ptx_60,
	61: addslonczewskitorque2_ptx_61,
	62: addslonczewskitorque2_ptx_62,
	70: addslonczewskitorque2_ptx_70,
	72: addslonczewskitorque2_ptx_72,
	75: addslonczewskitorque2_ptx_75,
}

// addslonczewskitorque2 PTX code for various compute capabilities.
const ( addslonczewskitorque2_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, .param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<86>; .reg .f64 %fd<3>; .reg .b64 %rd<61>; ld.param.u64 %rd1, [addslonczewskitorque2_param_0]; ld.param.u64 %rd2, [addslonczewskitorque2_param_1]; ld.param.u64 %rd3, [addslonczewskitorque2_param_2]; ld.param.u64 %rd4, [addslonczewskitorque2_param_3]; ld.param.u64 %rd5, [addslonczewskitorque2_param_4]; ld.param.u64 %rd6, [addslonczewskitorque2_param_5]; ld.param.u64 %rd7, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd8, [addslonczewskitorque2_param_8]; ld.param.f32 
%f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd9, [addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd10, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd11, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd12, [addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd13, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd14, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd15, [addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd16, [addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, [addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_25; cvta.to.global.u64 %rd17, %rd4; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; ld.global.f32 %f1, [%rd19]; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.f32 %f2, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.f32 %f3, [%rd23]; setp.eq.s64 %p2, %rd8, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd8; add.s64 %rd26, %rd24, %rd18; ld.global.f32 %f42, [%rd26]; mul.f32 %f109, %f42, %f109; BB0_3: setp.eq.s64 %p3, %rd9, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd27, %rd9; add.s64 %rd29, %rd27, %rd18; ld.global.f32 %f43, [%rd29]; mul.f32 %f110, %f43, %f110; BB0_5: setp.eq.s64 %p4, %rd10, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd30, %rd10; add.s64 %rd32, 
%rd30, %rd18; ld.global.f32 %f44, [%rd32]; mul.f32 %f111, %f44, %f111; BB0_7: setp.eq.s64 %p5, %rd11, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd33, %rd11; add.s64 %rd35, %rd33, %rd18; ld.global.f32 %f45, [%rd35]; mul.f32 %f112, %f45, %f112; BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 %f12, %f49; mov.f32 %f113, 0f00000000; setp.eq.f32 %p6, %f12, 0f00000000; @%p6 bra BB0_11; rcp.rn.f32 %f113, %f12; BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd7, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd36, %rd7; add.s64 %rd38, %rd36, %rd18; ld.global.f32 %f50, [%rd38]; mul.f32 %f114, %f50, %f114; BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd39, %rd12; add.s64 %rd41, %rd39, %rd18; ld.global.f32 %f51, [%rd41]; mul.f32 %f115, %f51, %f115; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd42, %rd13; add.s64 %rd44, %rd42, %rd18; ld.global.f32 %f52, [%rd44]; mul.f32 %f116, %f52, %f116; BB0_17: setp.eq.s64 %p10, %rd14, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd45, %rd14; add.s64 %rd47, %rd45, %rd18; ld.global.f32 %f53, [%rd47]; mul.f32 %f117, %f53, %f117; BB0_19: setp.eq.s64 %p11, %rd15, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd48, %rd15; add.s64 %rd50, %rd48, %rd18; ld.global.f32 %f54, [%rd50]; mul.f32 %f118, %f54, %f118; BB0_21: setp.eq.s64 %p12, %rd16, 0; @%p12 bra BB0_23; cvta.to.global.u64 %rd51, %rd16; add.s64 %rd53, %rd51, %rd18; ld.global.f32 %f55, [%rd53]; mul.f32 %f119, %f55, %f119; BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred %p15, %p14, %p13; @%p15 bra BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 
%f62, %f116, %f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd56, %rd54, %rd18; ld.global.f32 %f99, [%rd56]; add.f32 %f100, %f99, %f98; st.global.f32 [%rd56], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd57, %rd2; add.s64 %rd58, %rd57, %rd18; ld.global.f32 %f103, [%rd58]; add.f32 %f104, %f103, %f102; st.global.f32 [%rd58], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd59, %rd3; add.s64 %rd60, %rd59, %rd18; ld.global.f32 %f107, [%rd60]; add.f32 %f108, %f107, %f106; st.global.f32 [%rd60], %f108; BB0_25: ret; } ` addslonczewskitorque2_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 
addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, .param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<86>; .reg .f64 %fd<3>; .reg .b64 %rd<61>; ld.param.u64 %rd1, [addslonczewskitorque2_param_0]; ld.param.u64 %rd2, [addslonczewskitorque2_param_1]; ld.param.u64 %rd3, [addslonczewskitorque2_param_2]; ld.param.u64 %rd4, [addslonczewskitorque2_param_3]; ld.param.u64 %rd5, [addslonczewskitorque2_param_4]; ld.param.u64 %rd6, [addslonczewskitorque2_param_5]; ld.param.u64 %rd7, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd8, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd9, [addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd10, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd11, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd12, 
[addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd13, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd14, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd15, [addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd16, [addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, [addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_25; cvta.to.global.u64 %rd17, %rd4; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f1, [%rd19]; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f2, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f3, [%rd23]; setp.eq.s64 %p2, %rd8, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd8; add.s64 %rd26, %rd24, %rd18; ld.global.nc.f32 %f42, [%rd26]; mul.f32 %f109, %f42, %f109; BB0_3: setp.eq.s64 %p3, %rd9, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd27, %rd9; add.s64 %rd29, %rd27, %rd18; ld.global.nc.f32 %f43, [%rd29]; mul.f32 %f110, %f43, %f110; BB0_5: setp.eq.s64 %p4, %rd10, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd30, %rd10; add.s64 %rd32, %rd30, %rd18; ld.global.nc.f32 %f44, [%rd32]; mul.f32 %f111, %f44, %f111; BB0_7: setp.eq.s64 %p5, %rd11, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd33, %rd11; add.s64 %rd35, %rd33, %rd18; ld.global.nc.f32 %f45, [%rd35]; mul.f32 %f112, %f45, %f112; BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 
%f12, %f49; mov.f32 %f113, 0f00000000; setp.eq.f32 %p6, %f12, 0f00000000; @%p6 bra BB0_11; rcp.rn.f32 %f113, %f12; BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd7, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd36, %rd7; add.s64 %rd38, %rd36, %rd18; ld.global.nc.f32 %f50, [%rd38]; mul.f32 %f114, %f50, %f114; BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd39, %rd12; add.s64 %rd41, %rd39, %rd18; ld.global.nc.f32 %f51, [%rd41]; mul.f32 %f115, %f51, %f115; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd42, %rd13; add.s64 %rd44, %rd42, %rd18; ld.global.nc.f32 %f52, [%rd44]; mul.f32 %f116, %f52, %f116; BB0_17: setp.eq.s64 %p10, %rd14, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd45, %rd14; add.s64 %rd47, %rd45, %rd18; ld.global.nc.f32 %f53, [%rd47]; mul.f32 %f117, %f53, %f117; BB0_19: setp.eq.s64 %p11, %rd15, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd48, %rd15; add.s64 %rd50, %rd48, %rd18; ld.global.nc.f32 %f54, [%rd50]; mul.f32 %f118, %f54, %f118; BB0_21: setp.eq.s64 %p12, %rd16, 0; @%p12 bra BB0_23; cvta.to.global.u64 %rd51, %rd16; add.s64 %rd53, %rd51, %rd18; ld.global.nc.f32 %f55, [%rd53]; mul.f32 %f119, %f55, %f119; BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred %p15, %p14, %p13; @%p15 bra BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, %f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; 
rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd56, %rd54, %rd18; ld.global.f32 %f99, [%rd56]; add.f32 %f100, %f99, %f98; st.global.f32 [%rd56], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd57, %rd2; add.s64 %rd58, %rd57, %rd18; ld.global.f32 %f103, [%rd58]; add.f32 %f104, %f103, %f102; st.global.f32 [%rd58], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd59, %rd3; add.s64 %rd60, %rd59, %rd18; ld.global.f32 %f107, [%rd60]; add.f32 %f108, %f107, %f106; st.global.f32 [%rd60], %f108; BB0_25: ret; } ` addslonczewskitorque2_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 
addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, .param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<86>; .reg .f64 %fd<3>; .reg .b64 %rd<61>; ld.param.u64 %rd1, [addslonczewskitorque2_param_0]; ld.param.u64 %rd2, [addslonczewskitorque2_param_1]; ld.param.u64 %rd3, [addslonczewskitorque2_param_2]; ld.param.u64 %rd4, [addslonczewskitorque2_param_3]; ld.param.u64 %rd5, [addslonczewskitorque2_param_4]; ld.param.u64 %rd6, [addslonczewskitorque2_param_5]; ld.param.u64 %rd7, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd8, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd9, [addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd10, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd11, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd12, [addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd13, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd14, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd15, 
[addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd16, [addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, [addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_25; cvta.to.global.u64 %rd17, %rd4; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f1, [%rd19]; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f2, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f3, [%rd23]; setp.eq.s64 %p2, %rd8, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd8; add.s64 %rd26, %rd24, %rd18; ld.global.nc.f32 %f42, [%rd26]; mul.f32 %f109, %f42, %f109; BB0_3: setp.eq.s64 %p3, %rd9, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd27, %rd9; add.s64 %rd29, %rd27, %rd18; ld.global.nc.f32 %f43, [%rd29]; mul.f32 %f110, %f43, %f110; BB0_5: setp.eq.s64 %p4, %rd10, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd30, %rd10; add.s64 %rd32, %rd30, %rd18; ld.global.nc.f32 %f44, [%rd32]; mul.f32 %f111, %f44, %f111; BB0_7: setp.eq.s64 %p5, %rd11, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd33, %rd11; add.s64 %rd35, %rd33, %rd18; ld.global.nc.f32 %f45, [%rd35]; mul.f32 %f112, %f45, %f112; BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 %f12, %f49; mov.f32 %f113, 0f00000000; setp.eq.f32 %p6, %f12, 0f00000000; @%p6 bra BB0_11; rcp.rn.f32 %f113, %f12; BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd7, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd36, %rd7; add.s64 %rd38, %rd36, %rd18; 
ld.global.nc.f32 %f50, [%rd38]; mul.f32 %f114, %f50, %f114; BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd39, %rd12; add.s64 %rd41, %rd39, %rd18; ld.global.nc.f32 %f51, [%rd41]; mul.f32 %f115, %f51, %f115; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd42, %rd13; add.s64 %rd44, %rd42, %rd18; ld.global.nc.f32 %f52, [%rd44]; mul.f32 %f116, %f52, %f116; BB0_17: setp.eq.s64 %p10, %rd14, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd45, %rd14; add.s64 %rd47, %rd45, %rd18; ld.global.nc.f32 %f53, [%rd47]; mul.f32 %f117, %f53, %f117; BB0_19: setp.eq.s64 %p11, %rd15, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd48, %rd15; add.s64 %rd50, %rd48, %rd18; ld.global.nc.f32 %f54, [%rd50]; mul.f32 %f118, %f54, %f118; BB0_21: setp.eq.s64 %p12, %rd16, 0; @%p12 bra BB0_23; cvta.to.global.u64 %rd51, %rd16; add.s64 %rd53, %rd51, %rd18; ld.global.nc.f32 %f55, [%rd53]; mul.f32 %f119, %f55, %f119; BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred %p15, %p14, %p13; @%p15 bra BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, %f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; 
mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd56, %rd54, %rd18; ld.global.f32 %f99, [%rd56]; add.f32 %f100, %f99, %f98; st.global.f32 [%rd56], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd57, %rd2; add.s64 %rd58, %rd57, %rd18; ld.global.f32 %f103, [%rd58]; add.f32 %f104, %f103, %f102; st.global.f32 [%rd58], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd59, %rd3; add.s64 %rd60, %rd59, %rd18; ld.global.f32 %f107, [%rd60]; add.f32 %f108, %f107, %f106; st.global.f32 [%rd60], %f108; BB0_25: ret; } ` addslonczewskitorque2_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 
addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, .param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<86>; .reg .f64 %fd<3>; .reg .b64 %rd<61>; ld.param.u64 %rd1, [addslonczewskitorque2_param_0]; ld.param.u64 %rd2, [addslonczewskitorque2_param_1]; ld.param.u64 %rd3, [addslonczewskitorque2_param_2]; ld.param.u64 %rd4, [addslonczewskitorque2_param_3]; ld.param.u64 %rd5, [addslonczewskitorque2_param_4]; ld.param.u64 %rd6, [addslonczewskitorque2_param_5]; ld.param.u64 %rd7, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd8, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd9, [addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd10, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd11, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd12, [addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd13, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd14, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd15, [addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd16, [addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, 
[addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_25; cvta.to.global.u64 %rd17, %rd4; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f1, [%rd19]; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f2, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f3, [%rd23]; setp.eq.s64 %p2, %rd8, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd8; add.s64 %rd26, %rd24, %rd18; ld.global.nc.f32 %f42, [%rd26]; mul.f32 %f109, %f42, %f109; BB0_3: setp.eq.s64 %p3, %rd9, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd27, %rd9; add.s64 %rd29, %rd27, %rd18; ld.global.nc.f32 %f43, [%rd29]; mul.f32 %f110, %f43, %f110; BB0_5: setp.eq.s64 %p4, %rd10, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd30, %rd10; add.s64 %rd32, %rd30, %rd18; ld.global.nc.f32 %f44, [%rd32]; mul.f32 %f111, %f44, %f111; BB0_7: setp.eq.s64 %p5, %rd11, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd33, %rd11; add.s64 %rd35, %rd33, %rd18; ld.global.nc.f32 %f45, [%rd35]; mul.f32 %f112, %f45, %f112; BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 %f12, %f49; mov.f32 %f113, 0f00000000; setp.eq.f32 %p6, %f12, 0f00000000; @%p6 bra BB0_11; rcp.rn.f32 %f113, %f12; BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd7, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd36, %rd7; add.s64 %rd38, %rd36, %rd18; ld.global.nc.f32 %f50, [%rd38]; mul.f32 %f114, %f50, %f114; BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd39, %rd12; add.s64 %rd41, %rd39, %rd18; ld.global.nc.f32 %f51, [%rd41]; mul.f32 %f115, %f51, %f115; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd42, %rd13; add.s64 %rd44, 
%rd42, %rd18; ld.global.nc.f32 %f52, [%rd44]; mul.f32 %f116, %f52, %f116; BB0_17: setp.eq.s64 %p10, %rd14, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd45, %rd14; add.s64 %rd47, %rd45, %rd18; ld.global.nc.f32 %f53, [%rd47]; mul.f32 %f117, %f53, %f117; BB0_19: setp.eq.s64 %p11, %rd15, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd48, %rd15; add.s64 %rd50, %rd48, %rd18; ld.global.nc.f32 %f54, [%rd50]; mul.f32 %f118, %f54, %f118; BB0_21: setp.eq.s64 %p12, %rd16, 0; @%p12 bra BB0_23; cvta.to.global.u64 %rd51, %rd16; add.s64 %rd53, %rd51, %rd18; ld.global.nc.f32 %f55, [%rd53]; mul.f32 %f119, %f55, %f119; BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred %p15, %p14, %p13; @%p15 bra BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, %f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; 
fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd56, %rd54, %rd18; ld.global.f32 %f99, [%rd56]; add.f32 %f100, %f99, %f98; st.global.f32 [%rd56], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd57, %rd2; add.s64 %rd58, %rd57, %rd18; ld.global.f32 %f103, [%rd58]; add.f32 %f104, %f103, %f102; st.global.f32 [%rd58], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd59, %rd3; add.s64 %rd60, %rd59, %rd18; ld.global.f32 %f107, [%rd60]; add.f32 %f108, %f107, %f106; st.global.f32 [%rd60], %f108; BB0_25: ret; } ` addslonczewskitorque2_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, 
.param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<86>; .reg .f64 %fd<3>; .reg .b64 %rd<61>; ld.param.u64 %rd1, [addslonczewskitorque2_param_0]; ld.param.u64 %rd2, [addslonczewskitorque2_param_1]; ld.param.u64 %rd3, [addslonczewskitorque2_param_2]; ld.param.u64 %rd4, [addslonczewskitorque2_param_3]; ld.param.u64 %rd5, [addslonczewskitorque2_param_4]; ld.param.u64 %rd6, [addslonczewskitorque2_param_5]; ld.param.u64 %rd7, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd8, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd9, [addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd10, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd11, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd12, [addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd13, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd14, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd15, [addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd16, [addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, [addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_25; cvta.to.global.u64 %rd17, %rd4; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; 
ld.global.nc.f32 %f1, [%rd19]; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f2, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f3, [%rd23]; setp.eq.s64 %p2, %rd8, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd8; add.s64 %rd26, %rd24, %rd18; ld.global.nc.f32 %f42, [%rd26]; mul.f32 %f109, %f42, %f109; BB0_3: setp.eq.s64 %p3, %rd9, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd27, %rd9; add.s64 %rd29, %rd27, %rd18; ld.global.nc.f32 %f43, [%rd29]; mul.f32 %f110, %f43, %f110; BB0_5: setp.eq.s64 %p4, %rd10, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd30, %rd10; add.s64 %rd32, %rd30, %rd18; ld.global.nc.f32 %f44, [%rd32]; mul.f32 %f111, %f44, %f111; BB0_7: setp.eq.s64 %p5, %rd11, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd33, %rd11; add.s64 %rd35, %rd33, %rd18; ld.global.nc.f32 %f45, [%rd35]; mul.f32 %f112, %f45, %f112; BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 %f12, %f49; mov.f32 %f113, 0f00000000; setp.eq.f32 %p6, %f12, 0f00000000; @%p6 bra BB0_11; rcp.rn.f32 %f113, %f12; BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd7, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd36, %rd7; add.s64 %rd38, %rd36, %rd18; ld.global.nc.f32 %f50, [%rd38]; mul.f32 %f114, %f50, %f114; BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd39, %rd12; add.s64 %rd41, %rd39, %rd18; ld.global.nc.f32 %f51, [%rd41]; mul.f32 %f115, %f51, %f115; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd42, %rd13; add.s64 %rd44, %rd42, %rd18; ld.global.nc.f32 %f52, [%rd44]; mul.f32 %f116, %f52, %f116; BB0_17: setp.eq.s64 %p10, %rd14, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd45, %rd14; add.s64 %rd47, %rd45, %rd18; ld.global.nc.f32 %f53, [%rd47]; mul.f32 %f117, %f53, %f117; BB0_19: setp.eq.s64 %p11, %rd15, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd48, %rd15; 
add.s64 %rd50, %rd48, %rd18; ld.global.nc.f32 %f54, [%rd50]; mul.f32 %f118, %f54, %f118; BB0_21: setp.eq.s64 %p12, %rd16, 0; @%p12 bra BB0_23; cvta.to.global.u64 %rd51, %rd16; add.s64 %rd53, %rd51, %rd18; ld.global.nc.f32 %f55, [%rd53]; mul.f32 %f119, %f55, %f119; BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred %p15, %p14, %p13; @%p15 bra BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, %f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd56, %rd54, %rd18; ld.global.f32 %f99, [%rd56]; add.f32 %f100, %f99, %f98; st.global.f32 [%rd56], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd57, %rd2; add.s64 %rd58, %rd57, %rd18; ld.global.f32 %f103, [%rd58]; add.f32 
%f104, %f103, %f102; st.global.f32 [%rd58], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd59, %rd3; add.s64 %rd60, %rd59, %rd18; ld.global.f32 %f107, [%rd60]; add.f32 %f108, %f107, %f106; st.global.f32 [%rd60], %f108; BB0_25: ret; } ` addslonczewskitorque2_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, .param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<86>; .reg .f64 %fd<3>; .reg .b64 %rd<61>; ld.param.u64 %rd1, [addslonczewskitorque2_param_0]; ld.param.u64 %rd2, [addslonczewskitorque2_param_1]; ld.param.u64 %rd3, [addslonczewskitorque2_param_2]; ld.param.u64 %rd4, [addslonczewskitorque2_param_3]; 
ld.param.u64 %rd5, [addslonczewskitorque2_param_4]; ld.param.u64 %rd6, [addslonczewskitorque2_param_5]; ld.param.u64 %rd7, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd8, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd9, [addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd10, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd11, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd12, [addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd13, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd14, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd15, [addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd16, [addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, [addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_25; cvta.to.global.u64 %rd17, %rd4; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f1, [%rd19]; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f2, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f3, [%rd23]; setp.eq.s64 %p2, %rd8, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd8; add.s64 %rd26, %rd24, %rd18; ld.global.nc.f32 %f42, [%rd26]; 
mul.f32 %f109, %f42, %f109; BB0_3: setp.eq.s64 %p3, %rd9, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd27, %rd9; add.s64 %rd29, %rd27, %rd18; ld.global.nc.f32 %f43, [%rd29]; mul.f32 %f110, %f43, %f110; BB0_5: setp.eq.s64 %p4, %rd10, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd30, %rd10; add.s64 %rd32, %rd30, %rd18; ld.global.nc.f32 %f44, [%rd32]; mul.f32 %f111, %f44, %f111; BB0_7: setp.eq.s64 %p5, %rd11, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd33, %rd11; add.s64 %rd35, %rd33, %rd18; ld.global.nc.f32 %f45, [%rd35]; mul.f32 %f112, %f45, %f112; BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 %f12, %f49; mov.f32 %f113, 0f00000000; setp.eq.f32 %p6, %f12, 0f00000000; @%p6 bra BB0_11; rcp.rn.f32 %f113, %f12; BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd7, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd36, %rd7; add.s64 %rd38, %rd36, %rd18; ld.global.nc.f32 %f50, [%rd38]; mul.f32 %f114, %f50, %f114; BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd39, %rd12; add.s64 %rd41, %rd39, %rd18; ld.global.nc.f32 %f51, [%rd41]; mul.f32 %f115, %f51, %f115; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd42, %rd13; add.s64 %rd44, %rd42, %rd18; ld.global.nc.f32 %f52, [%rd44]; mul.f32 %f116, %f52, %f116; BB0_17: setp.eq.s64 %p10, %rd14, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd45, %rd14; add.s64 %rd47, %rd45, %rd18; ld.global.nc.f32 %f53, [%rd47]; mul.f32 %f117, %f53, %f117; BB0_19: setp.eq.s64 %p11, %rd15, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd48, %rd15; add.s64 %rd50, %rd48, %rd18; ld.global.nc.f32 %f54, [%rd50]; mul.f32 %f118, %f54, %f118; BB0_21: setp.eq.s64 %p12, %rd16, 0; @%p12 bra BB0_23; cvta.to.global.u64 %rd51, %rd16; add.s64 %rd53, %rd51, %rd18; ld.global.nc.f32 %f55, [%rd53]; mul.f32 %f119, %f55, %f119; BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred 
%p15, %p14, %p13; @%p15 bra BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, %f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd56, %rd54, %rd18; ld.global.f32 %f99, [%rd56]; add.f32 %f100, %f99, %f98; st.global.f32 [%rd56], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd57, %rd2; add.s64 %rd58, %rd57, %rd18; ld.global.f32 %f103, [%rd58]; add.f32 %f104, %f103, %f102; st.global.f32 [%rd58], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd59, %rd3; add.s64 %rd60, %rd59, %rd18; ld.global.f32 %f107, [%rd60]; add.f32 %f108, %f107, %f106; st.global.f32 [%rd60], %f108; BB0_25: ret; } ` addslonczewskitorque2_ptx_53 = ` .version 6.5 .target sm_53 .address_size 
64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, .param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<86>; .reg .f64 %fd<3>; .reg .b64 %rd<61>; ld.param.u64 %rd1, [addslonczewskitorque2_param_0]; ld.param.u64 %rd2, [addslonczewskitorque2_param_1]; ld.param.u64 %rd3, [addslonczewskitorque2_param_2]; ld.param.u64 %rd4, [addslonczewskitorque2_param_3]; ld.param.u64 %rd5, [addslonczewskitorque2_param_4]; ld.param.u64 %rd6, [addslonczewskitorque2_param_5]; ld.param.u64 %rd7, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd8, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd9, 
[addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd10, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd11, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd12, [addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd13, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd14, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd15, [addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd16, [addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, [addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_25; cvta.to.global.u64 %rd17, %rd4; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f1, [%rd19]; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f2, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f3, [%rd23]; setp.eq.s64 %p2, %rd8, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd8; add.s64 %rd26, %rd24, %rd18; ld.global.nc.f32 %f42, [%rd26]; mul.f32 %f109, %f42, %f109; BB0_3: setp.eq.s64 %p3, %rd9, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd27, %rd9; add.s64 %rd29, %rd27, %rd18; ld.global.nc.f32 %f43, [%rd29]; mul.f32 %f110, %f43, %f110; BB0_5: setp.eq.s64 %p4, %rd10, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd30, %rd10; add.s64 %rd32, %rd30, %rd18; ld.global.nc.f32 %f44, 
[%rd32]; mul.f32 %f111, %f44, %f111; BB0_7: setp.eq.s64 %p5, %rd11, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd33, %rd11; add.s64 %rd35, %rd33, %rd18; ld.global.nc.f32 %f45, [%rd35]; mul.f32 %f112, %f45, %f112; BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 %f12, %f49; mov.f32 %f113, 0f00000000; setp.eq.f32 %p6, %f12, 0f00000000; @%p6 bra BB0_11; rcp.rn.f32 %f113, %f12; BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd7, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd36, %rd7; add.s64 %rd38, %rd36, %rd18; ld.global.nc.f32 %f50, [%rd38]; mul.f32 %f114, %f50, %f114; BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd39, %rd12; add.s64 %rd41, %rd39, %rd18; ld.global.nc.f32 %f51, [%rd41]; mul.f32 %f115, %f51, %f115; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd42, %rd13; add.s64 %rd44, %rd42, %rd18; ld.global.nc.f32 %f52, [%rd44]; mul.f32 %f116, %f52, %f116; BB0_17: setp.eq.s64 %p10, %rd14, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd45, %rd14; add.s64 %rd47, %rd45, %rd18; ld.global.nc.f32 %f53, [%rd47]; mul.f32 %f117, %f53, %f117; BB0_19: setp.eq.s64 %p11, %rd15, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd48, %rd15; add.s64 %rd50, %rd48, %rd18; ld.global.nc.f32 %f54, [%rd50]; mul.f32 %f118, %f54, %f118; BB0_21: setp.eq.s64 %p12, %rd16, 0; @%p12 bra BB0_23; cvta.to.global.u64 %rd51, %rd16; add.s64 %rd53, %rd51, %rd18; ld.global.nc.f32 %f55, [%rd53]; mul.f32 %f119, %f55, %f119; BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred %p15, %p14, %p13; @%p15 bra BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, 
%f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd56, %rd54, %rd18; ld.global.f32 %f99, [%rd56]; add.f32 %f100, %f99, %f98; st.global.f32 [%rd56], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd57, %rd2; add.s64 %rd58, %rd57, %rd18; ld.global.f32 %f103, [%rd58]; add.f32 %f104, %f103, %f102; st.global.f32 [%rd58], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd59, %rd3; add.s64 %rd60, %rd59, %rd18; ld.global.f32 %f107, [%rd60]; add.f32 %f108, %f107, %f106; st.global.f32 [%rd60], %f108; BB0_25: ret; } ` addslonczewskitorque2_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 
addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, .param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<86>; .reg .f64 %fd<3>; .reg .b64 %rd<61>; ld.param.u64 %rd1, [addslonczewskitorque2_param_0]; ld.param.u64 %rd2, [addslonczewskitorque2_param_1]; ld.param.u64 %rd3, [addslonczewskitorque2_param_2]; ld.param.u64 %rd4, [addslonczewskitorque2_param_3]; ld.param.u64 %rd5, [addslonczewskitorque2_param_4]; ld.param.u64 %rd6, [addslonczewskitorque2_param_5]; ld.param.u64 %rd7, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd8, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd9, [addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd10, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd11, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd12, 
[addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd13, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd14, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd15, [addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd16, [addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, [addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_25; cvta.to.global.u64 %rd17, %rd4; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f1, [%rd19]; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f2, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f3, [%rd23]; setp.eq.s64 %p2, %rd8, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd8; add.s64 %rd26, %rd24, %rd18; ld.global.nc.f32 %f42, [%rd26]; mul.f32 %f109, %f42, %f109; BB0_3: setp.eq.s64 %p3, %rd9, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd27, %rd9; add.s64 %rd29, %rd27, %rd18; ld.global.nc.f32 %f43, [%rd29]; mul.f32 %f110, %f43, %f110; BB0_5: setp.eq.s64 %p4, %rd10, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd30, %rd10; add.s64 %rd32, %rd30, %rd18; ld.global.nc.f32 %f44, [%rd32]; mul.f32 %f111, %f44, %f111; BB0_7: setp.eq.s64 %p5, %rd11, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd33, %rd11; add.s64 %rd35, %rd33, %rd18; ld.global.nc.f32 %f45, [%rd35]; mul.f32 %f112, %f45, %f112; BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 
%f12, %f49; mov.f32 %f113, 0f00000000; setp.eq.f32 %p6, %f12, 0f00000000; @%p6 bra BB0_11; rcp.rn.f32 %f113, %f12; BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd7, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd36, %rd7; add.s64 %rd38, %rd36, %rd18; ld.global.nc.f32 %f50, [%rd38]; mul.f32 %f114, %f50, %f114; BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd39, %rd12; add.s64 %rd41, %rd39, %rd18; ld.global.nc.f32 %f51, [%rd41]; mul.f32 %f115, %f51, %f115; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd42, %rd13; add.s64 %rd44, %rd42, %rd18; ld.global.nc.f32 %f52, [%rd44]; mul.f32 %f116, %f52, %f116; BB0_17: setp.eq.s64 %p10, %rd14, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd45, %rd14; add.s64 %rd47, %rd45, %rd18; ld.global.nc.f32 %f53, [%rd47]; mul.f32 %f117, %f53, %f117; BB0_19: setp.eq.s64 %p11, %rd15, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd48, %rd15; add.s64 %rd50, %rd48, %rd18; ld.global.nc.f32 %f54, [%rd50]; mul.f32 %f118, %f54, %f118; BB0_21: setp.eq.s64 %p12, %rd16, 0; @%p12 bra BB0_23; cvta.to.global.u64 %rd51, %rd16; add.s64 %rd53, %rd51, %rd18; ld.global.nc.f32 %f55, [%rd53]; mul.f32 %f119, %f55, %f119; BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred %p15, %p14, %p13; @%p15 bra BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, %f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; 
rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd56, %rd54, %rd18; ld.global.f32 %f99, [%rd56]; add.f32 %f100, %f99, %f98; st.global.f32 [%rd56], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd57, %rd2; add.s64 %rd58, %rd57, %rd18; ld.global.f32 %f103, [%rd58]; add.f32 %f104, %f103, %f102; st.global.f32 [%rd58], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd59, %rd3; add.s64 %rd60, %rd59, %rd18; ld.global.f32 %f107, [%rd60]; add.f32 %f108, %f107, %f106; st.global.f32 [%rd60], %f108; BB0_25: ret; } ` addslonczewskitorque2_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 
addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, .param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<86>; .reg .f64 %fd<3>; .reg .b64 %rd<61>; ld.param.u64 %rd1, [addslonczewskitorque2_param_0]; ld.param.u64 %rd2, [addslonczewskitorque2_param_1]; ld.param.u64 %rd3, [addslonczewskitorque2_param_2]; ld.param.u64 %rd4, [addslonczewskitorque2_param_3]; ld.param.u64 %rd5, [addslonczewskitorque2_param_4]; ld.param.u64 %rd6, [addslonczewskitorque2_param_5]; ld.param.u64 %rd7, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd8, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd9, [addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd10, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd11, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd12, [addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd13, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd14, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd15, 
[addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd16, [addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, [addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_25; cvta.to.global.u64 %rd17, %rd4; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f1, [%rd19]; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f2, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f3, [%rd23]; setp.eq.s64 %p2, %rd8, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd8; add.s64 %rd26, %rd24, %rd18; ld.global.nc.f32 %f42, [%rd26]; mul.f32 %f109, %f42, %f109; BB0_3: setp.eq.s64 %p3, %rd9, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd27, %rd9; add.s64 %rd29, %rd27, %rd18; ld.global.nc.f32 %f43, [%rd29]; mul.f32 %f110, %f43, %f110; BB0_5: setp.eq.s64 %p4, %rd10, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd30, %rd10; add.s64 %rd32, %rd30, %rd18; ld.global.nc.f32 %f44, [%rd32]; mul.f32 %f111, %f44, %f111; BB0_7: setp.eq.s64 %p5, %rd11, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd33, %rd11; add.s64 %rd35, %rd33, %rd18; ld.global.nc.f32 %f45, [%rd35]; mul.f32 %f112, %f45, %f112; BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 %f12, %f49; mov.f32 %f113, 0f00000000; setp.eq.f32 %p6, %f12, 0f00000000; @%p6 bra BB0_11; rcp.rn.f32 %f113, %f12; BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd7, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd36, %rd7; add.s64 %rd38, %rd36, %rd18; 
ld.global.nc.f32 %f50, [%rd38]; mul.f32 %f114, %f50, %f114; BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd39, %rd12; add.s64 %rd41, %rd39, %rd18; ld.global.nc.f32 %f51, [%rd41]; mul.f32 %f115, %f51, %f115; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd42, %rd13; add.s64 %rd44, %rd42, %rd18; ld.global.nc.f32 %f52, [%rd44]; mul.f32 %f116, %f52, %f116; BB0_17: setp.eq.s64 %p10, %rd14, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd45, %rd14; add.s64 %rd47, %rd45, %rd18; ld.global.nc.f32 %f53, [%rd47]; mul.f32 %f117, %f53, %f117; BB0_19: setp.eq.s64 %p11, %rd15, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd48, %rd15; add.s64 %rd50, %rd48, %rd18; ld.global.nc.f32 %f54, [%rd50]; mul.f32 %f118, %f54, %f118; BB0_21: setp.eq.s64 %p12, %rd16, 0; @%p12 bra BB0_23; cvta.to.global.u64 %rd51, %rd16; add.s64 %rd53, %rd51, %rd18; ld.global.nc.f32 %f55, [%rd53]; mul.f32 %f119, %f55, %f119; BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred %p15, %p14, %p13; @%p15 bra BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, %f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; 
mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd56, %rd54, %rd18; ld.global.f32 %f99, [%rd56]; add.f32 %f100, %f99, %f98; st.global.f32 [%rd56], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd57, %rd2; add.s64 %rd58, %rd57, %rd18; ld.global.f32 %f103, [%rd58]; add.f32 %f104, %f103, %f102; st.global.f32 [%rd58], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd59, %rd3; add.s64 %rd60, %rd59, %rd18; ld.global.f32 %f107, [%rd60]; add.f32 %f108, %f107, %f106; st.global.f32 [%rd60], %f108; BB0_25: ret; } ` addslonczewskitorque2_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 
addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, .param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<86>; .reg .f64 %fd<3>; .reg .b64 %rd<61>; ld.param.u64 %rd1, [addslonczewskitorque2_param_0]; ld.param.u64 %rd2, [addslonczewskitorque2_param_1]; ld.param.u64 %rd3, [addslonczewskitorque2_param_2]; ld.param.u64 %rd4, [addslonczewskitorque2_param_3]; ld.param.u64 %rd5, [addslonczewskitorque2_param_4]; ld.param.u64 %rd6, [addslonczewskitorque2_param_5]; ld.param.u64 %rd7, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd8, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd9, [addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd10, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd11, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd12, [addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd13, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd14, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd15, [addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd16, [addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, 
[addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_25; cvta.to.global.u64 %rd17, %rd4; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f1, [%rd19]; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f2, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f3, [%rd23]; setp.eq.s64 %p2, %rd8, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd8; add.s64 %rd26, %rd24, %rd18; ld.global.nc.f32 %f42, [%rd26]; mul.f32 %f109, %f42, %f109; BB0_3: setp.eq.s64 %p3, %rd9, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd27, %rd9; add.s64 %rd29, %rd27, %rd18; ld.global.nc.f32 %f43, [%rd29]; mul.f32 %f110, %f43, %f110; BB0_5: setp.eq.s64 %p4, %rd10, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd30, %rd10; add.s64 %rd32, %rd30, %rd18; ld.global.nc.f32 %f44, [%rd32]; mul.f32 %f111, %f44, %f111; BB0_7: setp.eq.s64 %p5, %rd11, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd33, %rd11; add.s64 %rd35, %rd33, %rd18; ld.global.nc.f32 %f45, [%rd35]; mul.f32 %f112, %f45, %f112; BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 %f12, %f49; mov.f32 %f113, 0f00000000; setp.eq.f32 %p6, %f12, 0f00000000; @%p6 bra BB0_11; rcp.rn.f32 %f113, %f12; BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd7, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd36, %rd7; add.s64 %rd38, %rd36, %rd18; ld.global.nc.f32 %f50, [%rd38]; mul.f32 %f114, %f50, %f114; BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd39, %rd12; add.s64 %rd41, %rd39, %rd18; ld.global.nc.f32 %f51, [%rd41]; mul.f32 %f115, %f51, %f115; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd42, %rd13; add.s64 %rd44, 
%rd42, %rd18; ld.global.nc.f32 %f52, [%rd44]; mul.f32 %f116, %f52, %f116; BB0_17: setp.eq.s64 %p10, %rd14, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd45, %rd14; add.s64 %rd47, %rd45, %rd18; ld.global.nc.f32 %f53, [%rd47]; mul.f32 %f117, %f53, %f117; BB0_19: setp.eq.s64 %p11, %rd15, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd48, %rd15; add.s64 %rd50, %rd48, %rd18; ld.global.nc.f32 %f54, [%rd50]; mul.f32 %f118, %f54, %f118; BB0_21: setp.eq.s64 %p12, %rd16, 0; @%p12 bra BB0_23; cvta.to.global.u64 %rd51, %rd16; add.s64 %rd53, %rd51, %rd18; ld.global.nc.f32 %f55, [%rd53]; mul.f32 %f119, %f55, %f119; BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred %p15, %p14, %p13; @%p15 bra BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, %f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; 
fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd56, %rd54, %rd18; ld.global.f32 %f99, [%rd56]; add.f32 %f100, %f99, %f98; st.global.f32 [%rd56], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd57, %rd2; add.s64 %rd58, %rd57, %rd18; ld.global.f32 %f103, [%rd58]; add.f32 %f104, %f103, %f102; st.global.f32 [%rd58], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd59, %rd3; add.s64 %rd60, %rd59, %rd18; ld.global.f32 %f107, [%rd60]; add.f32 %f108, %f107, %f106; st.global.f32 [%rd60], %f108; BB0_25: ret; } ` addslonczewskitorque2_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, 
.param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<86>; .reg .f64 %fd<3>; .reg .b64 %rd<61>; ld.param.u64 %rd1, [addslonczewskitorque2_param_0]; ld.param.u64 %rd2, [addslonczewskitorque2_param_1]; ld.param.u64 %rd3, [addslonczewskitorque2_param_2]; ld.param.u64 %rd4, [addslonczewskitorque2_param_3]; ld.param.u64 %rd5, [addslonczewskitorque2_param_4]; ld.param.u64 %rd6, [addslonczewskitorque2_param_5]; ld.param.u64 %rd7, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd8, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd9, [addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd10, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd11, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd12, [addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd13, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd14, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd15, [addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd16, [addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, [addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_25; cvta.to.global.u64 %rd17, %rd4; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; 
ld.global.nc.f32 %f1, [%rd19]; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f2, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f3, [%rd23]; setp.eq.s64 %p2, %rd8, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd8; add.s64 %rd26, %rd24, %rd18; ld.global.nc.f32 %f42, [%rd26]; mul.f32 %f109, %f42, %f109; BB0_3: setp.eq.s64 %p3, %rd9, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd27, %rd9; add.s64 %rd29, %rd27, %rd18; ld.global.nc.f32 %f43, [%rd29]; mul.f32 %f110, %f43, %f110; BB0_5: setp.eq.s64 %p4, %rd10, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd30, %rd10; add.s64 %rd32, %rd30, %rd18; ld.global.nc.f32 %f44, [%rd32]; mul.f32 %f111, %f44, %f111; BB0_7: setp.eq.s64 %p5, %rd11, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd33, %rd11; add.s64 %rd35, %rd33, %rd18; ld.global.nc.f32 %f45, [%rd35]; mul.f32 %f112, %f45, %f112; BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 %f12, %f49; mov.f32 %f113, 0f00000000; setp.eq.f32 %p6, %f12, 0f00000000; @%p6 bra BB0_11; rcp.rn.f32 %f113, %f12; BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd7, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd36, %rd7; add.s64 %rd38, %rd36, %rd18; ld.global.nc.f32 %f50, [%rd38]; mul.f32 %f114, %f50, %f114; BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd39, %rd12; add.s64 %rd41, %rd39, %rd18; ld.global.nc.f32 %f51, [%rd41]; mul.f32 %f115, %f51, %f115; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd42, %rd13; add.s64 %rd44, %rd42, %rd18; ld.global.nc.f32 %f52, [%rd44]; mul.f32 %f116, %f52, %f116; BB0_17: setp.eq.s64 %p10, %rd14, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd45, %rd14; add.s64 %rd47, %rd45, %rd18; ld.global.nc.f32 %f53, [%rd47]; mul.f32 %f117, %f53, %f117; BB0_19: setp.eq.s64 %p11, %rd15, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd48, %rd15; 
add.s64 %rd50, %rd48, %rd18; ld.global.nc.f32 %f54, [%rd50]; mul.f32 %f118, %f54, %f118; BB0_21: setp.eq.s64 %p12, %rd16, 0; @%p12 bra BB0_23; cvta.to.global.u64 %rd51, %rd16; add.s64 %rd53, %rd51, %rd18; ld.global.nc.f32 %f55, [%rd53]; mul.f32 %f119, %f55, %f119; BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred %p15, %p14, %p13; @%p15 bra BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, %f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd56, %rd54, %rd18; ld.global.f32 %f99, [%rd56]; add.f32 %f100, %f99, %f98; st.global.f32 [%rd56], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd57, %rd2; add.s64 %rd58, %rd57, %rd18; ld.global.f32 %f103, [%rd58]; add.f32 
%f104, %f103, %f102; st.global.f32 [%rd58], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd59, %rd3; add.s64 %rd60, %rd59, %rd18; ld.global.f32 %f107, [%rd60]; add.f32 %f108, %f107, %f106; st.global.f32 [%rd60], %f108; BB0_25: ret; } ` addslonczewskitorque2_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, .param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<86>; .reg .f64 %fd<3>; .reg .b64 %rd<61>; ld.param.u64 %rd1, [addslonczewskitorque2_param_0]; ld.param.u64 %rd2, [addslonczewskitorque2_param_1]; ld.param.u64 %rd3, [addslonczewskitorque2_param_2]; ld.param.u64 %rd4, [addslonczewskitorque2_param_3]; 
ld.param.u64 %rd5, [addslonczewskitorque2_param_4]; ld.param.u64 %rd6, [addslonczewskitorque2_param_5]; ld.param.u64 %rd7, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd8, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd9, [addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd10, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd11, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd12, [addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd13, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd14, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd15, [addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd16, [addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, [addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_25; cvta.to.global.u64 %rd17, %rd4; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f1, [%rd19]; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f2, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f3, [%rd23]; setp.eq.s64 %p2, %rd8, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd8; add.s64 %rd26, %rd24, %rd18; ld.global.nc.f32 %f42, [%rd26]; 
mul.f32 %f109, %f42, %f109; BB0_3: setp.eq.s64 %p3, %rd9, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd27, %rd9; add.s64 %rd29, %rd27, %rd18; ld.global.nc.f32 %f43, [%rd29]; mul.f32 %f110, %f43, %f110; BB0_5: setp.eq.s64 %p4, %rd10, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd30, %rd10; add.s64 %rd32, %rd30, %rd18; ld.global.nc.f32 %f44, [%rd32]; mul.f32 %f111, %f44, %f111; BB0_7: setp.eq.s64 %p5, %rd11, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd33, %rd11; add.s64 %rd35, %rd33, %rd18; ld.global.nc.f32 %f45, [%rd35]; mul.f32 %f112, %f45, %f112; BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 %f12, %f49; mov.f32 %f113, 0f00000000; setp.eq.f32 %p6, %f12, 0f00000000; @%p6 bra BB0_11; rcp.rn.f32 %f113, %f12; BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd7, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd36, %rd7; add.s64 %rd38, %rd36, %rd18; ld.global.nc.f32 %f50, [%rd38]; mul.f32 %f114, %f50, %f114; BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd39, %rd12; add.s64 %rd41, %rd39, %rd18; ld.global.nc.f32 %f51, [%rd41]; mul.f32 %f115, %f51, %f115; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd42, %rd13; add.s64 %rd44, %rd42, %rd18; ld.global.nc.f32 %f52, [%rd44]; mul.f32 %f116, %f52, %f116; BB0_17: setp.eq.s64 %p10, %rd14, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd45, %rd14; add.s64 %rd47, %rd45, %rd18; ld.global.nc.f32 %f53, [%rd47]; mul.f32 %f117, %f53, %f117; BB0_19: setp.eq.s64 %p11, %rd15, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd48, %rd15; add.s64 %rd50, %rd48, %rd18; ld.global.nc.f32 %f54, [%rd50]; mul.f32 %f118, %f54, %f118; BB0_21: setp.eq.s64 %p12, %rd16, 0; @%p12 bra BB0_23; cvta.to.global.u64 %rd51, %rd16; add.s64 %rd53, %rd51, %rd18; ld.global.nc.f32 %f55, [%rd53]; mul.f32 %f119, %f55, %f119; BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred 
%p15, %p14, %p13; @%p15 bra BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, %f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd56, %rd54, %rd18; ld.global.f32 %f99, [%rd56]; add.f32 %f100, %f99, %f98; st.global.f32 [%rd56], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd57, %rd2; add.s64 %rd58, %rd57, %rd18; ld.global.f32 %f103, [%rd58]; add.f32 %f104, %f103, %f102; st.global.f32 [%rd58], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd59, %rd3; add.s64 %rd60, %rd59, %rd18; ld.global.f32 %f107, [%rd60]; add.f32 %f108, %f107, %f106; st.global.f32 [%rd60], %f108; BB0_25: ret; } ` addslonczewskitorque2_ptx_75 = ` .version 6.5 .target sm_75 .address_size 
64 // .globl addslonczewskitorque2 .visible .entry addslonczewskitorque2( .param .u64 addslonczewskitorque2_param_0, .param .u64 addslonczewskitorque2_param_1, .param .u64 addslonczewskitorque2_param_2, .param .u64 addslonczewskitorque2_param_3, .param .u64 addslonczewskitorque2_param_4, .param .u64 addslonczewskitorque2_param_5, .param .u64 addslonczewskitorque2_param_6, .param .f32 addslonczewskitorque2_param_7, .param .u64 addslonczewskitorque2_param_8, .param .f32 addslonczewskitorque2_param_9, .param .u64 addslonczewskitorque2_param_10, .param .f32 addslonczewskitorque2_param_11, .param .u64 addslonczewskitorque2_param_12, .param .f32 addslonczewskitorque2_param_13, .param .u64 addslonczewskitorque2_param_14, .param .f32 addslonczewskitorque2_param_15, .param .u64 addslonczewskitorque2_param_16, .param .f32 addslonczewskitorque2_param_17, .param .u64 addslonczewskitorque2_param_18, .param .f32 addslonczewskitorque2_param_19, .param .u64 addslonczewskitorque2_param_20, .param .f32 addslonczewskitorque2_param_21, .param .u64 addslonczewskitorque2_param_22, .param .f32 addslonczewskitorque2_param_23, .param .u64 addslonczewskitorque2_param_24, .param .f32 addslonczewskitorque2_param_25, .param .f32 addslonczewskitorque2_param_26, .param .f32 addslonczewskitorque2_param_27, .param .u32 addslonczewskitorque2_param_28 ) { .reg .pred %p<17>; .reg .f32 %f<120>; .reg .b32 %r<86>; .reg .f64 %fd<3>; .reg .b64 %rd<61>; ld.param.u64 %rd1, [addslonczewskitorque2_param_0]; ld.param.u64 %rd2, [addslonczewskitorque2_param_1]; ld.param.u64 %rd3, [addslonczewskitorque2_param_2]; ld.param.u64 %rd4, [addslonczewskitorque2_param_3]; ld.param.u64 %rd5, [addslonczewskitorque2_param_4]; ld.param.u64 %rd6, [addslonczewskitorque2_param_5]; ld.param.u64 %rd7, [addslonczewskitorque2_param_6]; ld.param.f32 %f114, [addslonczewskitorque2_param_7]; ld.param.u64 %rd8, [addslonczewskitorque2_param_8]; ld.param.f32 %f109, [addslonczewskitorque2_param_9]; ld.param.u64 %rd9, 
[addslonczewskitorque2_param_10]; ld.param.f32 %f110, [addslonczewskitorque2_param_11]; ld.param.u64 %rd10, [addslonczewskitorque2_param_12]; ld.param.f32 %f111, [addslonczewskitorque2_param_13]; ld.param.u64 %rd11, [addslonczewskitorque2_param_14]; ld.param.f32 %f112, [addslonczewskitorque2_param_15]; ld.param.u64 %rd12, [addslonczewskitorque2_param_16]; ld.param.f32 %f115, [addslonczewskitorque2_param_17]; ld.param.u64 %rd13, [addslonczewskitorque2_param_18]; ld.param.f32 %f116, [addslonczewskitorque2_param_19]; ld.param.u64 %rd14, [addslonczewskitorque2_param_20]; ld.param.f32 %f117, [addslonczewskitorque2_param_21]; ld.param.u64 %rd15, [addslonczewskitorque2_param_22]; ld.param.f32 %f118, [addslonczewskitorque2_param_23]; ld.param.u64 %rd16, [addslonczewskitorque2_param_24]; ld.param.f32 %f119, [addslonczewskitorque2_param_25]; ld.param.f32 %f40, [addslonczewskitorque2_param_26]; ld.param.f32 %f41, [addslonczewskitorque2_param_27]; ld.param.u32 %r2, [addslonczewskitorque2_param_28]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_25; cvta.to.global.u64 %rd17, %rd4; mul.wide.s32 %rd18, %r1, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f1, [%rd19]; cvta.to.global.u64 %rd20, %rd5; add.s64 %rd21, %rd20, %rd18; ld.global.nc.f32 %f2, [%rd21]; cvta.to.global.u64 %rd22, %rd6; add.s64 %rd23, %rd22, %rd18; ld.global.nc.f32 %f3, [%rd23]; setp.eq.s64 %p2, %rd8, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd24, %rd8; add.s64 %rd26, %rd24, %rd18; ld.global.nc.f32 %f42, [%rd26]; mul.f32 %f109, %f42, %f109; BB0_3: setp.eq.s64 %p3, %rd9, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd27, %rd9; add.s64 %rd29, %rd27, %rd18; ld.global.nc.f32 %f43, [%rd29]; mul.f32 %f110, %f43, %f110; BB0_5: setp.eq.s64 %p4, %rd10, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd30, %rd10; add.s64 %rd32, %rd30, %rd18; ld.global.nc.f32 %f44, 
[%rd32]; mul.f32 %f111, %f44, %f111; BB0_7: setp.eq.s64 %p5, %rd11, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd33, %rd11; add.s64 %rd35, %rd33, %rd18; ld.global.nc.f32 %f45, [%rd35]; mul.f32 %f112, %f45, %f112; BB0_9: mul.f32 %f47, %f111, %f111; fma.rn.f32 %f48, %f110, %f110, %f47; fma.rn.f32 %f49, %f112, %f112, %f48; sqrt.rn.f32 %f12, %f49; mov.f32 %f113, 0f00000000; setp.eq.f32 %p6, %f12, 0f00000000; @%p6 bra BB0_11; rcp.rn.f32 %f113, %f12; BB0_11: mul.f32 %f15, %f110, %f113; mul.f32 %f16, %f111, %f113; mul.f32 %f17, %f112, %f113; setp.eq.s64 %p7, %rd7, 0; @%p7 bra BB0_13; cvta.to.global.u64 %rd36, %rd7; add.s64 %rd38, %rd36, %rd18; ld.global.nc.f32 %f50, [%rd38]; mul.f32 %f114, %f50, %f114; BB0_13: setp.eq.s64 %p8, %rd12, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd39, %rd12; add.s64 %rd41, %rd39, %rd18; ld.global.nc.f32 %f51, [%rd41]; mul.f32 %f115, %f51, %f115; BB0_15: setp.eq.s64 %p9, %rd13, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd42, %rd13; add.s64 %rd44, %rd42, %rd18; ld.global.nc.f32 %f52, [%rd44]; mul.f32 %f116, %f52, %f116; BB0_17: setp.eq.s64 %p10, %rd14, 0; @%p10 bra BB0_19; cvta.to.global.u64 %rd45, %rd14; add.s64 %rd47, %rd45, %rd18; ld.global.nc.f32 %f53, [%rd47]; mul.f32 %f117, %f53, %f117; BB0_19: setp.eq.s64 %p11, %rd15, 0; @%p11 bra BB0_21; cvta.to.global.u64 %rd48, %rd15; add.s64 %rd50, %rd48, %rd18; ld.global.nc.f32 %f54, [%rd50]; mul.f32 %f118, %f54, %f118; BB0_21: setp.eq.s64 %p12, %rd16, 0; @%p12 bra BB0_23; cvta.to.global.u64 %rd51, %rd16; add.s64 %rd53, %rd51, %rd18; ld.global.nc.f32 %f55, [%rd53]; mul.f32 %f119, %f55, %f119; BB0_23: setp.eq.f32 %p13, %f114, 0f00000000; setp.eq.f32 %p14, %f109, 0f00000000; or.pred %p15, %p14, %p13; @%p15 bra BB0_25; setp.eq.f32 %p16, %f119, 0f00000000; selp.f32 %f56, %f40, %f119, %p16; mul.f32 %f57, %f56, %f41; mul.f32 %f58, %f114, %f57; div.rn.f32 %f59, %f109, %f58; cvt.f64.f32 %fd1, %f59; mul.f64 %fd2, %fd1, 0d3CC7B6EF14E9250C; cvt.rn.f32.f64 %f60, %fd2; mul.f32 %f61, %f117, %f117; mul.f32 %f62, %f116, 
%f61; add.f32 %f63, %f61, 0f3F800000; add.f32 %f64, %f61, 0fBF800000; mul.f32 %f65, %f2, %f16; fma.rn.f32 %f66, %f1, %f15, %f65; fma.rn.f32 %f67, %f3, %f17, %f66; fma.rn.f32 %f68, %f67, %f64, %f63; div.rn.f32 %f69, %f62, %f68; mul.f32 %f70, %f69, %f60; mul.f32 %f71, %f118, %f60; fma.rn.f32 %f72, %f115, %f115, 0f3F800000; rcp.rn.f32 %f73, %f72; fma.rn.f32 %f74, %f115, %f71, %f70; mul.f32 %f75, %f73, %f74; mul.f32 %f76, %f115, %f70; sub.f32 %f77, %f71, %f76; mul.f32 %f78, %f73, %f77; mul.f32 %f79, %f2, %f17; mul.f32 %f80, %f3, %f16; sub.f32 %f81, %f80, %f79; mul.f32 %f82, %f3, %f15; mul.f32 %f83, %f1, %f17; sub.f32 %f84, %f83, %f82; mul.f32 %f85, %f1, %f16; mul.f32 %f86, %f2, %f15; sub.f32 %f87, %f86, %f85; mul.f32 %f88, %f2, %f87; mul.f32 %f89, %f3, %f84; sub.f32 %f90, %f88, %f89; mul.f32 %f91, %f3, %f81; mul.f32 %f92, %f1, %f87; sub.f32 %f93, %f91, %f92; mul.f32 %f94, %f1, %f84; mul.f32 %f95, %f2, %f81; sub.f32 %f96, %f94, %f95; mul.f32 %f97, %f81, %f78; fma.rn.f32 %f98, %f90, %f75, %f97; cvta.to.global.u64 %rd54, %rd1; add.s64 %rd56, %rd54, %rd18; ld.global.f32 %f99, [%rd56]; add.f32 %f100, %f99, %f98; st.global.f32 [%rd56], %f100; mul.f32 %f101, %f84, %f78; fma.rn.f32 %f102, %f93, %f75, %f101; cvta.to.global.u64 %rd57, %rd2; add.s64 %rd58, %rd57, %rd18; ld.global.f32 %f103, [%rd58]; add.f32 %f104, %f103, %f102; st.global.f32 [%rd58], %f104; mul.f32 %f105, %f87, %f78; fma.rn.f32 %f106, %f96, %f75, %f105; cvta.to.global.u64 %rd59, %rd3; add.s64 %rd60, %rd59, %rd18; ld.global.f32 %f107, [%rd60]; add.f32 %f108, %f107, %f106; st.global.f32 [%rd60], %f108; BB0_25: ret; } ` ) mumax3-3.10/cuda/stencil.h000066400000000000000000000016131371432437400154060ustar00rootroot00000000000000#ifndef _STENCIL_H_ #define _STENCIL_H_ // 3D array indexing #define index(ix,iy,iz,Nx,Ny,Nz) ( ( (iz)*(Ny) + (iy) ) * (Nx) + (ix) ) #define idx(ix,iy,iz) ( index((ix),(iy),(iz),(Nx),(Ny),(Nz)) ) // modulo used for PBC wrap around #define MOD(n, M) ( (( (n) % (M) ) + (M) ) % (M) ) // have PBC 
in x, y or z? #define PBCx (PBC & 1) #define PBCy (PBC & 2) #define PBCz (PBC & 4) // clamp or wrap index at boundary, depending on PBC // hclamp*: clamps on upper side (index+1) // lclamp*: clamps on lower side (index-1) // *clampx: clamps along x // ... #define hclampx(ix) (PBCx? MOD(ix, Nx) : min((ix), Nx-1)) #define lclampx(ix) (PBCx? MOD(ix, Nx) : max((ix), 0)) #define hclampy(iy) (PBCy? MOD(iy, Ny) : min((iy), Ny-1)) #define lclampy(iy) (PBCy? MOD(iy, Ny) : max((iy), 0)) #define hclampz(iz) (PBCz? MOD(iz, Nz) : min((iz), Nz-1)) #define lclampz(iz) (PBCz? MOD(iz, Nz) : max((iz), 0)) #endif mumax3-3.10/cuda/sum.h000066400000000000000000000001521371432437400145460ustar00rootroot00000000000000#ifndef _SUM_H_ #define _SUM_H_ inline __device__ float sum(float a, float b){ return a + b; } #endif mumax3-3.10/cuda/temperature.go000066400000000000000000000010151371432437400164540ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/data" "github.com/mumax/3/util" ) // Set Bth to thermal noise (Brown). // see temperature.cu func SetTemperature(Bth, noise *data.Slice, k2mu0_Mu0VgammaDt float64, Msat, Temp, Alpha MSlice) { util.Argument(Bth.NComp() == 1 && noise.NComp() == 1) N := Bth.Len() cfg := make1DConf(N) k_settemperature2_async(Bth.DevPtr(0), noise.DevPtr(0), float32(k2mu0_Mu0VgammaDt), Msat.DevPtr(0), Msat.Mul(0), Temp.DevPtr(0), Temp.Mul(0), Alpha.DevPtr(0), Alpha.Mul(0), N, cfg) } mumax3-3.10/cuda/temperature2.cu000066400000000000000000000013621371432437400165450ustar00rootroot00000000000000#include #include "amul.h" // TODO: this could act on x,y,z, so that we need to call it only once. 
// Thermal field kernel. For every cell i < N it stores
//
//     B[i] = noise[i] * sqrtf(kB2_VgammaDt * alpha * temp * invMs)
//
// Ms, temp and alpha each arrive as an (array, multiplier) pair that is
// resolved per cell by the helpers from amul.h: amul() for temp and alpha,
// inv_Msat() for the inverse saturation magnetization.
// One thread handles one cell; threads whose index falls at or beyond N
// return without touching memory.
extern "C" __global__ void
settemperature2(float* __restrict__ B, float* __restrict__ noise, float kB2_VgammaDt,
                float* __restrict__ Ms_, float Ms_mul,
                float* __restrict__ temp_, float temp_mul,
                float* __restrict__ alpha_, float alpha_mul,
                int N) {

    // Linear cell index from the 2D grid of 1D thread blocks.
    const unsigned int blockId = blockIdx.y*gridDim.x + blockIdx.x;
    const int i = blockId * blockDim.x + threadIdx.x;

    if (i >= N) {
        return;
    }

    const float invMs = inv_Msat(Ms_, Ms_mul, i);
    const float temp  = amul(temp_, temp_mul, i);
    const float alpha = amul(alpha_, alpha_mul, i);

    // Keep this multiplication order: it determines the float rounding.
    const float radicand = kB2_VgammaDt * alpha * temp * invMs;

    B[i] = noise[i] * sqrtf(radicand);
}
settemperature2_args.argptr[0] = unsafe.Pointer(&settemperature2_args.arg_B) settemperature2_args.argptr[1] = unsafe.Pointer(&settemperature2_args.arg_noise) settemperature2_args.argptr[2] = unsafe.Pointer(&settemperature2_args.arg_kB2_VgammaDt) settemperature2_args.argptr[3] = unsafe.Pointer(&settemperature2_args.arg_Ms_) settemperature2_args.argptr[4] = unsafe.Pointer(&settemperature2_args.arg_Ms_mul) settemperature2_args.argptr[5] = unsafe.Pointer(&settemperature2_args.arg_temp_) settemperature2_args.argptr[6] = unsafe.Pointer(&settemperature2_args.arg_temp_mul) settemperature2_args.argptr[7] = unsafe.Pointer(&settemperature2_args.arg_alpha_) settemperature2_args.argptr[8] = unsafe.Pointer(&settemperature2_args.arg_alpha_mul) settemperature2_args.argptr[9] = unsafe.Pointer(&settemperature2_args.arg_N) } // Wrapper for settemperature2 CUDA kernel, asynchronous. func k_settemperature2_async(B unsafe.Pointer, noise unsafe.Pointer, kB2_VgammaDt float32, Ms_ unsafe.Pointer, Ms_mul float32, temp_ unsafe.Pointer, temp_mul float32, alpha_ unsafe.Pointer, alpha_mul float32, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("settemperature2") } settemperature2_args.Lock() defer settemperature2_args.Unlock() if settemperature2_code == 0 { settemperature2_code = fatbinLoad(settemperature2_map, "settemperature2") } settemperature2_args.arg_B = B settemperature2_args.arg_noise = noise settemperature2_args.arg_kB2_VgammaDt = kB2_VgammaDt settemperature2_args.arg_Ms_ = Ms_ settemperature2_args.arg_Ms_mul = Ms_mul settemperature2_args.arg_temp_ = temp_ settemperature2_args.arg_temp_mul = temp_mul settemperature2_args.arg_alpha_ = alpha_ settemperature2_args.arg_alpha_mul = alpha_mul settemperature2_args.arg_N = N args := settemperature2_args.argptr[:] cu.LaunchKernel(settemperature2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("settemperature2") } } // maps compute 
capability on PTX code for settemperature2 kernel. var settemperature2_map = map[int]string{0: "", 30: settemperature2_ptx_30, 32: settemperature2_ptx_32, 35: settemperature2_ptx_35, 37: settemperature2_ptx_37, 50: settemperature2_ptx_50, 52: settemperature2_ptx_52, 53: settemperature2_ptx_53, 60: settemperature2_ptx_60, 61: settemperature2_ptx_61, 62: settemperature2_ptx_62, 70: settemperature2_ptx_70, 72: settemperature2_ptx_72, 75: settemperature2_ptx_75} // settemperature2 PTX code for various compute capabilities. const ( settemperature2_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.f32 %f13, [%rd8]; mul.f32 %f23, %f13, %f23; BB0_3: setp.eq.f32 %p3, %f23, 
0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f24, %f23; BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd9, %rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; BB0_9: cvta.to.global.u64 %rd15, %rd1; cvta.to.global.u64 %rd16, %rd2; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.f32 %f21, [%rd18]; mul.f32 %f22, %f21, %f20; add.s64 %rd19, %rd15, %rd17; st.global.f32 [%rd19], %f22; BB0_10: ret; } ` settemperature2_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; 
mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f13, [%rd8]; mul.f32 %f23, %f13, %f23; BB0_3: setp.eq.f32 %p3, %f23, 0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f24, %f23; BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd9, %rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; BB0_9: cvta.to.global.u64 %rd15, %rd1; cvta.to.global.u64 %rd16, %rd2; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.nc.f32 %f21, [%rd18]; mul.f32 %f22, %f21, %f20; add.s64 %rd19, %rd15, %rd17; st.global.f32 [%rd19], %f22; BB0_10: ret; } ` settemperature2_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, 
[settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f13, [%rd8]; mul.f32 %f23, %f13, %f23; BB0_3: setp.eq.f32 %p3, %f23, 0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f24, %f23; BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd9, %rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; BB0_9: cvta.to.global.u64 %rd15, %rd1; cvta.to.global.u64 %rd16, %rd2; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.nc.f32 %f21, [%rd18]; mul.f32 %f22, %f21, %f20; add.s64 %rd19, %rd15, %rd17; st.global.f32 [%rd19], %f22; BB0_10: ret; } ` settemperature2_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; 
ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f13, [%rd8]; mul.f32 %f23, %f13, %f23; BB0_3: setp.eq.f32 %p3, %f23, 0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f24, %f23; BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd9, %rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; BB0_9: cvta.to.global.u64 %rd15, %rd1; cvta.to.global.u64 %rd16, %rd2; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.nc.f32 %f21, [%rd18]; mul.f32 %f22, %f21, %f20; add.s64 %rd19, %rd15, %rd17; st.global.f32 [%rd19], %f22; BB0_10: ret; } ` settemperature2_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, 
.param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f13, [%rd8]; mul.f32 %f23, %f13, %f23; BB0_3: setp.eq.f32 %p3, %f23, 0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f24, %f23; BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd9, %rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; BB0_9: cvta.to.global.u64 %rd15, %rd1; cvta.to.global.u64 %rd16, %rd2; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.nc.f32 %f21, [%rd18]; mul.f32 %f22, %f21, %f20; add.s64 %rd19, %rd15, %rd17; st.global.f32 [%rd19], %f22; 
BB0_10: ret; } ` settemperature2_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f13, [%rd8]; mul.f32 %f23, %f13, %f23; BB0_3: setp.eq.f32 %p3, %f23, 0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f24, %f23; BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd9, %rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; BB0_9: cvta.to.global.u64 %rd15, %rd1; 
cvta.to.global.u64 %rd16, %rd2; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.nc.f32 %f21, [%rd18]; mul.f32 %f22, %f21, %f20; add.s64 %rd19, %rd15, %rd17; st.global.f32 [%rd19], %f22; BB0_10: ret; } ` settemperature2_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f13, [%rd8]; mul.f32 %f23, %f13, %f23; BB0_3: setp.eq.f32 %p3, %f23, 0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f24, %f23; BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd9, %rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, 
%rd10; ld.global.nc.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; BB0_9: cvta.to.global.u64 %rd15, %rd1; cvta.to.global.u64 %rd16, %rd2; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.nc.f32 %f21, [%rd18]; mul.f32 %f22, %f21, %f20; add.s64 %rd19, %rd15, %rd17; st.global.f32 [%rd19], %f22; BB0_10: ret; } ` settemperature2_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, 
%rd7; ld.global.nc.f32 %f13, [%rd8]; mul.f32 %f23, %f13, %f23; BB0_3: setp.eq.f32 %p3, %f23, 0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f24, %f23; BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd9, %rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; BB0_9: cvta.to.global.u64 %rd15, %rd1; cvta.to.global.u64 %rd16, %rd2; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.nc.f32 %f21, [%rd18]; mul.f32 %f22, %f21, %f20; add.s64 %rd19, %rd15, %rd17; st.global.f32 [%rd19], %f22; BB0_10: ret; } ` settemperature2_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, 
%ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f13, [%rd8]; mul.f32 %f23, %f13, %f23; BB0_3: setp.eq.f32 %p3, %f23, 0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f24, %f23; BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd9, %rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; BB0_9: cvta.to.global.u64 %rd15, %rd1; cvta.to.global.u64 %rd16, %rd2; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.nc.f32 %f21, [%rd18]; mul.f32 %f22, %f21, %f20; add.s64 %rd19, %rd15, %rd17; st.global.f32 [%rd19], %f22; BB0_10: ret; } ` settemperature2_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 
%f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f13, [%rd8]; mul.f32 %f23, %f13, %f23; BB0_3: setp.eq.f32 %p3, %f23, 0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f24, %f23; BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd9, %rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; BB0_9: cvta.to.global.u64 %rd15, %rd1; cvta.to.global.u64 %rd16, %rd2; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.nc.f32 %f21, [%rd18]; mul.f32 %f22, %f21, %f20; add.s64 %rd19, %rd15, %rd17; st.global.f32 [%rd19], %f22; BB0_10: ret; } ` settemperature2_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 
settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f13, [%rd8]; mul.f32 %f23, %f13, %f23; BB0_3: setp.eq.f32 %p3, %f23, 0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f24, %f23; BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd9, %rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; BB0_9: cvta.to.global.u64 %rd15, %rd1; cvta.to.global.u64 %rd16, %rd2; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.nc.f32 %f21, [%rd18]; mul.f32 %f22, %f21, %f20; add.s64 %rd19, %rd15, %rd17; st.global.f32 [%rd19], %f22; BB0_10: ret; } ` settemperature2_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 
settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f13, [%rd8]; mul.f32 %f23, %f13, %f23; BB0_3: setp.eq.f32 %p3, %f23, 0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f24, %f23; BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd9, %rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; ld.global.nc.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; BB0_9: cvta.to.global.u64 %rd15, %rd1; cvta.to.global.u64 %rd16, %rd2; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.nc.f32 
%f21, [%rd18]; mul.f32 %f22, %f21, %f20; add.s64 %rd19, %rd15, %rd17; st.global.f32 [%rd19], %f22; BB0_10: ret; } ` settemperature2_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl settemperature2 .visible .entry settemperature2( .param .u64 settemperature2_param_0, .param .u64 settemperature2_param_1, .param .f32 settemperature2_param_2, .param .u64 settemperature2_param_3, .param .f32 settemperature2_param_4, .param .u64 settemperature2_param_5, .param .f32 settemperature2_param_6, .param .u64 settemperature2_param_7, .param .f32 settemperature2_param_8, .param .u32 settemperature2_param_9 ) { .reg .pred %p<6>; .reg .f32 %f<27>; .reg .b32 %r<9>; .reg .b64 %rd<20>; ld.param.u64 %rd1, [settemperature2_param_0]; ld.param.u64 %rd2, [settemperature2_param_1]; ld.param.f32 %f9, [settemperature2_param_2]; ld.param.u64 %rd3, [settemperature2_param_3]; ld.param.f32 %f23, [settemperature2_param_4]; ld.param.u64 %rd4, [settemperature2_param_5]; ld.param.f32 %f25, [settemperature2_param_6]; ld.param.u64 %rd5, [settemperature2_param_7]; ld.param.f32 %f26, [settemperature2_param_8]; ld.param.u32 %r2, [settemperature2_param_9]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_10; setp.eq.s64 %p2, %rd3, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd6, %rd3; mul.wide.s32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; ld.global.nc.f32 %f13, [%rd8]; mul.f32 %f23, %f13, %f23; BB0_3: setp.eq.f32 %p3, %f23, 0f00000000; mov.f32 %f24, 0f00000000; @%p3 bra BB0_5; rcp.rn.f32 %f24, %f23; BB0_5: setp.eq.s64 %p4, %rd4, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd9, %rd4; mul.wide.s32 %rd10, %r1, 4; add.s64 %rd11, %rd9, %rd10; ld.global.nc.f32 %f15, [%rd11]; mul.f32 %f25, %f15, %f25; BB0_7: setp.eq.s64 %p5, %rd5, 0; @%p5 bra BB0_9; cvta.to.global.u64 %rd12, %rd5; mul.wide.s32 %rd13, %r1, 4; add.s64 %rd14, %rd12, %rd13; 
ld.global.nc.f32 %f16, [%rd14]; mul.f32 %f26, %f16, %f26; BB0_9: cvta.to.global.u64 %rd15, %rd1; cvta.to.global.u64 %rd16, %rd2; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; mul.f32 %f17, %f26, %f9; mul.f32 %f18, %f25, %f17; mul.f32 %f19, %f24, %f18; sqrt.rn.f32 %f20, %f19; ld.global.nc.f32 %f21, [%rd18]; mul.f32 %f22, %f21, %f20; add.s64 %rd19, %rd15, %rd17; st.global.f32 [%rd19], %f22; BB0_10: ret; } ` ) mumax3-3.10/cuda/theta.cu000066400000000000000000000007221371432437400152320ustar00rootroot00000000000000#include "stencil.h" extern "C" __global__ void setTheta(float* __restrict__ theta, float* __restrict__ mz, int Nx, int Ny, int Nz) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int iz = blockIdx.z * blockDim.z + threadIdx.z; if (ix >= Nx || iy >= Ny || iz >= Nz) { return; } int I = idx(ix, iy, iz); // central cell index theta[I] = acosf(mz[I]); }mumax3-3.10/cuda/theta_wrapper.go000066400000000000000000000724651371432437400170050ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for setTheta kernel var setTheta_code cu.Function // Stores the arguments for setTheta kernel invocation type setTheta_args_t struct { arg_theta unsafe.Pointer arg_mz unsafe.Pointer arg_Nx int arg_Ny int arg_Nz int argptr [5]unsafe.Pointer sync.Mutex } // Stores the arguments for setTheta kernel invocation var setTheta_args setTheta_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
// argptr[i] holds the address of the i-th argument field, in the order the kernel declares its parameters.
	setTheta_args.argptr[0] = unsafe.Pointer(&setTheta_args.arg_theta)
	setTheta_args.argptr[1] = unsafe.Pointer(&setTheta_args.arg_mz)
	setTheta_args.argptr[2] = unsafe.Pointer(&setTheta_args.arg_Nx)
	setTheta_args.argptr[3] = unsafe.Pointer(&setTheta_args.arg_Ny)
	setTheta_args.argptr[4] = unsafe.Pointer(&setTheta_args.arg_Nz)
}

// Wrapper for setTheta CUDA kernel, asynchronous.
//
// theta and mz are passed to the kernel as its output and input buffers,
// Nx, Ny, Nz as the grid size, and cfg supplies the launch grid and block
// dimensions. The kernel is queued on stream0.
//
// The shared argument struct is locked for the whole call (including the
// trailing debug Sync), because the launch reads the arguments through the
// pointers stored in argptr.
func k_setTheta_async(theta unsafe.Pointer, mz unsafe.Pointer, Nx int, Ny int, Nz int, cfg *config) {
	if Synchronous { // debug
		Sync()
		timer.Start("setTheta")
	}

	a := &setTheta_args
	a.Lock()
	defer a.Unlock()

	// Load the kernel on first use; a zero handle means "not loaded yet".
	// NOTE(review): fatbinLoad presumably picks the PTX entry matching the
	// device's compute capability -- confirm against its definition.
	if setTheta_code == 0 {
		setTheta_code = fatbinLoad(setTheta_map, "setTheta")
	}

	// Stage this call's arguments in the fields that argptr points at.
	a.arg_theta, a.arg_mz = theta, mz
	a.arg_Nx, a.arg_Ny, a.arg_Nz = Nx, Ny, Nz

	cu.LaunchKernel(
		setTheta_code,
		cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z,
		cfg.Block.X, cfg.Block.Y, cfg.Block.Z,
		0, stream0, a.argptr[:],
	)

	if Synchronous { // debug
		Sync()
		timer.Stop("setTheta")
	}
}

// maps compute capability on PTX code for setTheta kernel.
// Key 0 carries an empty string; every other key is a compute capability
// (major*10 + minor) paired with the PTX compiled for that target.
var setTheta_map = map[int]string{
	0:  "",
	30: setTheta_ptx_30,
	32: setTheta_ptx_32,
	35: setTheta_ptx_35,
	37: setTheta_ptx_37,
	50: setTheta_ptx_50,
	52: setTheta_ptx_52,
	53: setTheta_ptx_53,
	60: setTheta_ptx_60,
	61: setTheta_ptx_61,
	62: setTheta_ptx_62,
	70: setTheta_ptx_70,
	72: setTheta_ptx_72,
	75: setTheta_ptx_75,
}

// setTheta PTX code for various compute capabilities.
const ( setTheta_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl setTheta .visible .entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, .param .u32 setTheta_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<27>; .reg .b32 %r<18>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, [setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, %r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.f32 %f1, [%rd5]; abs.f32 %f2, %f1; mov.f32 %f3, 0f3F800000; sub.f32 %f4, %f3, %f2; mul.f32 %f5, %f4, 0f3F000000; sqrt.rn.f32 %f6, %f5; setp.gt.f32 %p6, %f2, 0f3F11EB85; selp.f32 %f7, %f6, %f2, %p6; mul.f32 %f8, %f7, %f7; mov.f32 %f9, 0f3C94D2E9; mov.f32 %f10, 0f3D53F941; fma.rn.f32 %f11, %f10, %f8, %f9; mov.f32 %f12, 0f3D3F841F; fma.rn.f32 %f13, %f11, %f8, %f12; mov.f32 %f14, 0f3D994929; fma.rn.f32 %f15, %f13, %f8, %f14; mov.f32 %f16, 0f3E2AAB94; fma.rn.f32 %f17, %f15, %f8, %f16; mul.f32 %f18, %f8, %f17; fma.rn.f32 %f19, %f18, %f7, %f7; add.f32 %f20, %f19, %f19; mov.f32 %f21, 0f3FC90FDB; sub.f32 %f22, %f21, %f19; selp.f32 %f23, %f20, %f22, %p6; setp.lt.f32 %p7, %f1, 0f00000000; mov.f32 %f24, 0f40490FDB; sub.f32 %f25, %f24, %f23; selp.f32 %f26, %f25, %f23, %p7; cvta.to.global.u64 %rd6, %rd1; add.s64 %rd7, %rd6, %rd4; st.global.f32 [%rd7], %f26; 
BB0_2: ret; } ` setTheta_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl setTheta .visible .entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, .param .u32 setTheta_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<27>; .reg .b32 %r<18>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, [setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, %r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; abs.f32 %f2, %f1; mov.f32 %f3, 0f3F800000; sub.f32 %f4, %f3, %f2; mul.f32 %f5, %f4, 0f3F000000; sqrt.rn.f32 %f6, %f5; setp.gt.f32 %p6, %f2, 0f3F11EB85; selp.f32 %f7, %f6, %f2, %p6; mul.f32 %f8, %f7, %f7; mov.f32 %f9, 0f3C94D2E9; mov.f32 %f10, 0f3D53F941; fma.rn.f32 %f11, %f10, %f8, %f9; mov.f32 %f12, 0f3D3F841F; fma.rn.f32 %f13, %f11, %f8, %f12; mov.f32 %f14, 0f3D994929; fma.rn.f32 %f15, %f13, %f8, %f14; mov.f32 %f16, 0f3E2AAB94; fma.rn.f32 %f17, %f15, %f8, %f16; mul.f32 %f18, %f8, %f17; fma.rn.f32 %f19, %f18, %f7, %f7; add.f32 %f20, %f19, %f19; mov.f32 %f21, 0f3FC90FDB; sub.f32 %f22, %f21, %f19; selp.f32 %f23, %f20, %f22, %p6; setp.lt.f32 %p7, %f1, 0f00000000; mov.f32 %f24, 0f40490FDB; sub.f32 %f25, %f24, %f23; selp.f32 %f26, %f25, %f23, %p7; cvta.to.global.u64 %rd6, %rd1; add.s64 %rd7, %rd6, %rd4; st.global.f32 
[%rd7], %f26; BB0_2: ret; } ` setTheta_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl setTheta .visible .entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, .param .u32 setTheta_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<27>; .reg .b32 %r<18>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, [setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, %r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; abs.f32 %f2, %f1; mov.f32 %f3, 0f3F800000; sub.f32 %f4, %f3, %f2; mul.f32 %f5, %f4, 0f3F000000; sqrt.rn.f32 %f6, %f5; setp.gt.f32 %p6, %f2, 0f3F11EB85; selp.f32 %f7, %f6, %f2, %p6; mul.f32 %f8, %f7, %f7; mov.f32 %f9, 0f3C94D2E9; mov.f32 %f10, 0f3D53F941; fma.rn.f32 %f11, %f10, %f8, %f9; mov.f32 %f12, 0f3D3F841F; fma.rn.f32 %f13, %f11, %f8, %f12; mov.f32 %f14, 0f3D994929; fma.rn.f32 %f15, %f13, %f8, %f14; mov.f32 %f16, 0f3E2AAB94; fma.rn.f32 %f17, %f15, %f8, %f16; mul.f32 %f18, %f8, %f17; fma.rn.f32 %f19, %f18, %f7, %f7; add.f32 %f20, %f19, %f19; mov.f32 %f21, 0f3FC90FDB; sub.f32 %f22, %f21, %f19; selp.f32 %f23, %f20, %f22, %p6; setp.lt.f32 %p7, %f1, 0f00000000; mov.f32 %f24, 0f40490FDB; sub.f32 %f25, %f24, %f23; selp.f32 %f26, %f25, %f23, %p7; cvta.to.global.u64 %rd6, %rd1; add.s64 %rd7, %rd6, %rd4; 
st.global.f32 [%rd7], %f26; BB0_2: ret; } ` setTheta_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl setTheta .visible .entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, .param .u32 setTheta_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<27>; .reg .b32 %r<18>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, [setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, %r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; abs.f32 %f2, %f1; mov.f32 %f3, 0f3F800000; sub.f32 %f4, %f3, %f2; mul.f32 %f5, %f4, 0f3F000000; sqrt.rn.f32 %f6, %f5; setp.gt.f32 %p6, %f2, 0f3F11EB85; selp.f32 %f7, %f6, %f2, %p6; mul.f32 %f8, %f7, %f7; mov.f32 %f9, 0f3C94D2E9; mov.f32 %f10, 0f3D53F941; fma.rn.f32 %f11, %f10, %f8, %f9; mov.f32 %f12, 0f3D3F841F; fma.rn.f32 %f13, %f11, %f8, %f12; mov.f32 %f14, 0f3D994929; fma.rn.f32 %f15, %f13, %f8, %f14; mov.f32 %f16, 0f3E2AAB94; fma.rn.f32 %f17, %f15, %f8, %f16; mul.f32 %f18, %f8, %f17; fma.rn.f32 %f19, %f18, %f7, %f7; add.f32 %f20, %f19, %f19; mov.f32 %f21, 0f3FC90FDB; sub.f32 %f22, %f21, %f19; selp.f32 %f23, %f20, %f22, %p6; setp.lt.f32 %p7, %f1, 0f00000000; mov.f32 %f24, 0f40490FDB; sub.f32 %f25, %f24, %f23; selp.f32 %f26, %f25, %f23, %p7; cvta.to.global.u64 %rd6, %rd1; add.s64 %rd7, 
%rd6, %rd4; st.global.f32 [%rd7], %f26; BB0_2: ret; } ` setTheta_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl setTheta .visible .entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, .param .u32 setTheta_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<27>; .reg .b32 %r<18>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, [setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, %r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; abs.f32 %f2, %f1; mov.f32 %f3, 0f3F800000; sub.f32 %f4, %f3, %f2; mul.f32 %f5, %f4, 0f3F000000; sqrt.rn.f32 %f6, %f5; setp.gt.f32 %p6, %f2, 0f3F11EB85; selp.f32 %f7, %f6, %f2, %p6; mul.f32 %f8, %f7, %f7; mov.f32 %f9, 0f3C94D2E9; mov.f32 %f10, 0f3D53F941; fma.rn.f32 %f11, %f10, %f8, %f9; mov.f32 %f12, 0f3D3F841F; fma.rn.f32 %f13, %f11, %f8, %f12; mov.f32 %f14, 0f3D994929; fma.rn.f32 %f15, %f13, %f8, %f14; mov.f32 %f16, 0f3E2AAB94; fma.rn.f32 %f17, %f15, %f8, %f16; mul.f32 %f18, %f8, %f17; fma.rn.f32 %f19, %f18, %f7, %f7; add.f32 %f20, %f19, %f19; mov.f32 %f21, 0f3FC90FDB; sub.f32 %f22, %f21, %f19; selp.f32 %f23, %f20, %f22, %p6; setp.lt.f32 %p7, %f1, 0f00000000; mov.f32 %f24, 0f40490FDB; sub.f32 %f25, %f24, %f23; selp.f32 %f26, %f25, %f23, %p7; cvta.to.global.u64 %rd6, %rd1; 
add.s64 %rd7, %rd6, %rd4; st.global.f32 [%rd7], %f26; BB0_2: ret; } ` setTheta_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl setTheta .visible .entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, .param .u32 setTheta_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<27>; .reg .b32 %r<18>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, [setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, %r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; abs.f32 %f2, %f1; mov.f32 %f3, 0f3F800000; sub.f32 %f4, %f3, %f2; mul.f32 %f5, %f4, 0f3F000000; sqrt.rn.f32 %f6, %f5; setp.gt.f32 %p6, %f2, 0f3F11EB85; selp.f32 %f7, %f6, %f2, %p6; mul.f32 %f8, %f7, %f7; mov.f32 %f9, 0f3C94D2E9; mov.f32 %f10, 0f3D53F941; fma.rn.f32 %f11, %f10, %f8, %f9; mov.f32 %f12, 0f3D3F841F; fma.rn.f32 %f13, %f11, %f8, %f12; mov.f32 %f14, 0f3D994929; fma.rn.f32 %f15, %f13, %f8, %f14; mov.f32 %f16, 0f3E2AAB94; fma.rn.f32 %f17, %f15, %f8, %f16; mul.f32 %f18, %f8, %f17; fma.rn.f32 %f19, %f18, %f7, %f7; add.f32 %f20, %f19, %f19; mov.f32 %f21, 0f3FC90FDB; sub.f32 %f22, %f21, %f19; selp.f32 %f23, %f20, %f22, %p6; setp.lt.f32 %p7, %f1, 0f00000000; mov.f32 %f24, 0f40490FDB; sub.f32 %f25, %f24, %f23; selp.f32 %f26, %f25, %f23, %p7; cvta.to.global.u64 
%rd6, %rd1; add.s64 %rd7, %rd6, %rd4; st.global.f32 [%rd7], %f26; BB0_2: ret; } ` setTheta_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl setTheta .visible .entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, .param .u32 setTheta_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<27>; .reg .b32 %r<18>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, [setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, %r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; abs.f32 %f2, %f1; mov.f32 %f3, 0f3F800000; sub.f32 %f4, %f3, %f2; mul.f32 %f5, %f4, 0f3F000000; sqrt.rn.f32 %f6, %f5; setp.gt.f32 %p6, %f2, 0f3F11EB85; selp.f32 %f7, %f6, %f2, %p6; mul.f32 %f8, %f7, %f7; mov.f32 %f9, 0f3C94D2E9; mov.f32 %f10, 0f3D53F941; fma.rn.f32 %f11, %f10, %f8, %f9; mov.f32 %f12, 0f3D3F841F; fma.rn.f32 %f13, %f11, %f8, %f12; mov.f32 %f14, 0f3D994929; fma.rn.f32 %f15, %f13, %f8, %f14; mov.f32 %f16, 0f3E2AAB94; fma.rn.f32 %f17, %f15, %f8, %f16; mul.f32 %f18, %f8, %f17; fma.rn.f32 %f19, %f18, %f7, %f7; add.f32 %f20, %f19, %f19; mov.f32 %f21, 0f3FC90FDB; sub.f32 %f22, %f21, %f19; selp.f32 %f23, %f20, %f22, %p6; setp.lt.f32 %p7, %f1, 0f00000000; mov.f32 %f24, 0f40490FDB; sub.f32 %f25, %f24, %f23; selp.f32 %f26, %f25, %f23, %p7; 
cvta.to.global.u64 %rd6, %rd1; add.s64 %rd7, %rd6, %rd4; st.global.f32 [%rd7], %f26; BB0_2: ret; } ` setTheta_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl setTheta .visible .entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, .param .u32 setTheta_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<27>; .reg .b32 %r<18>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, [setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, %r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; abs.f32 %f2, %f1; mov.f32 %f3, 0f3F800000; sub.f32 %f4, %f3, %f2; mul.f32 %f5, %f4, 0f3F000000; sqrt.rn.f32 %f6, %f5; setp.gt.f32 %p6, %f2, 0f3F11EB85; selp.f32 %f7, %f6, %f2, %p6; mul.f32 %f8, %f7, %f7; mov.f32 %f9, 0f3C94D2E9; mov.f32 %f10, 0f3D53F941; fma.rn.f32 %f11, %f10, %f8, %f9; mov.f32 %f12, 0f3D3F841F; fma.rn.f32 %f13, %f11, %f8, %f12; mov.f32 %f14, 0f3D994929; fma.rn.f32 %f15, %f13, %f8, %f14; mov.f32 %f16, 0f3E2AAB94; fma.rn.f32 %f17, %f15, %f8, %f16; mul.f32 %f18, %f8, %f17; fma.rn.f32 %f19, %f18, %f7, %f7; add.f32 %f20, %f19, %f19; mov.f32 %f21, 0f3FC90FDB; sub.f32 %f22, %f21, %f19; selp.f32 %f23, %f20, %f22, %p6; setp.lt.f32 %p7, %f1, 0f00000000; mov.f32 %f24, 0f40490FDB; sub.f32 %f25, %f24, %f23; selp.f32 %f26, %f25, 
%f23, %p7; cvta.to.global.u64 %rd6, %rd1; add.s64 %rd7, %rd6, %rd4; st.global.f32 [%rd7], %f26; BB0_2: ret; } ` setTheta_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl setTheta .visible .entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, .param .u32 setTheta_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<27>; .reg .b32 %r<18>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, [setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, %r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; abs.f32 %f2, %f1; mov.f32 %f3, 0f3F800000; sub.f32 %f4, %f3, %f2; mul.f32 %f5, %f4, 0f3F000000; sqrt.rn.f32 %f6, %f5; setp.gt.f32 %p6, %f2, 0f3F11EB85; selp.f32 %f7, %f6, %f2, %p6; mul.f32 %f8, %f7, %f7; mov.f32 %f9, 0f3C94D2E9; mov.f32 %f10, 0f3D53F941; fma.rn.f32 %f11, %f10, %f8, %f9; mov.f32 %f12, 0f3D3F841F; fma.rn.f32 %f13, %f11, %f8, %f12; mov.f32 %f14, 0f3D994929; fma.rn.f32 %f15, %f13, %f8, %f14; mov.f32 %f16, 0f3E2AAB94; fma.rn.f32 %f17, %f15, %f8, %f16; mul.f32 %f18, %f8, %f17; fma.rn.f32 %f19, %f18, %f7, %f7; add.f32 %f20, %f19, %f19; mov.f32 %f21, 0f3FC90FDB; sub.f32 %f22, %f21, %f19; selp.f32 %f23, %f20, %f22, %p6; setp.lt.f32 %p7, %f1, 0f00000000; mov.f32 %f24, 0f40490FDB; sub.f32 %f25, %f24, %f23; selp.f32 
%f26, %f25, %f23, %p7; cvta.to.global.u64 %rd6, %rd1; add.s64 %rd7, %rd6, %rd4; st.global.f32 [%rd7], %f26; BB0_2: ret; } ` setTheta_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl setTheta .visible .entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, .param .u32 setTheta_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<27>; .reg .b32 %r<18>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, [setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, %r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; abs.f32 %f2, %f1; mov.f32 %f3, 0f3F800000; sub.f32 %f4, %f3, %f2; mul.f32 %f5, %f4, 0f3F000000; sqrt.rn.f32 %f6, %f5; setp.gt.f32 %p6, %f2, 0f3F11EB85; selp.f32 %f7, %f6, %f2, %p6; mul.f32 %f8, %f7, %f7; mov.f32 %f9, 0f3C94D2E9; mov.f32 %f10, 0f3D53F941; fma.rn.f32 %f11, %f10, %f8, %f9; mov.f32 %f12, 0f3D3F841F; fma.rn.f32 %f13, %f11, %f8, %f12; mov.f32 %f14, 0f3D994929; fma.rn.f32 %f15, %f13, %f8, %f14; mov.f32 %f16, 0f3E2AAB94; fma.rn.f32 %f17, %f15, %f8, %f16; mul.f32 %f18, %f8, %f17; fma.rn.f32 %f19, %f18, %f7, %f7; add.f32 %f20, %f19, %f19; mov.f32 %f21, 0f3FC90FDB; sub.f32 %f22, %f21, %f19; selp.f32 %f23, %f20, %f22, %p6; setp.lt.f32 %p7, %f1, 0f00000000; mov.f32 %f24, 0f40490FDB; sub.f32 %f25, %f24, %f23; 
selp.f32 %f26, %f25, %f23, %p7; cvta.to.global.u64 %rd6, %rd1; add.s64 %rd7, %rd6, %rd4; st.global.f32 [%rd7], %f26; BB0_2: ret; } ` setTheta_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl setTheta .visible .entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, .param .u32 setTheta_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<27>; .reg .b32 %r<18>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, [setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, %r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; abs.f32 %f2, %f1; mov.f32 %f3, 0f3F800000; sub.f32 %f4, %f3, %f2; mul.f32 %f5, %f4, 0f3F000000; sqrt.rn.f32 %f6, %f5; setp.gt.f32 %p6, %f2, 0f3F11EB85; selp.f32 %f7, %f6, %f2, %p6; mul.f32 %f8, %f7, %f7; mov.f32 %f9, 0f3C94D2E9; mov.f32 %f10, 0f3D53F941; fma.rn.f32 %f11, %f10, %f8, %f9; mov.f32 %f12, 0f3D3F841F; fma.rn.f32 %f13, %f11, %f8, %f12; mov.f32 %f14, 0f3D994929; fma.rn.f32 %f15, %f13, %f8, %f14; mov.f32 %f16, 0f3E2AAB94; fma.rn.f32 %f17, %f15, %f8, %f16; mul.f32 %f18, %f8, %f17; fma.rn.f32 %f19, %f18, %f7, %f7; add.f32 %f20, %f19, %f19; mov.f32 %f21, 0f3FC90FDB; sub.f32 %f22, %f21, %f19; selp.f32 %f23, %f20, %f22, %p6; setp.lt.f32 %p7, %f1, 0f00000000; mov.f32 %f24, 0f40490FDB; sub.f32 %f25, 
%f24, %f23; selp.f32 %f26, %f25, %f23, %p7; cvta.to.global.u64 %rd6, %rd1; add.s64 %rd7, %rd6, %rd4; st.global.f32 [%rd7], %f26; BB0_2: ret; } ` setTheta_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl setTheta .visible .entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, .param .u32 setTheta_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<27>; .reg .b32 %r<18>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, [setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, %r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; abs.f32 %f2, %f1; mov.f32 %f3, 0f3F800000; sub.f32 %f4, %f3, %f2; mul.f32 %f5, %f4, 0f3F000000; sqrt.rn.f32 %f6, %f5; setp.gt.f32 %p6, %f2, 0f3F11EB85; selp.f32 %f7, %f6, %f2, %p6; mul.f32 %f8, %f7, %f7; mov.f32 %f9, 0f3C94D2E9; mov.f32 %f10, 0f3D53F941; fma.rn.f32 %f11, %f10, %f8, %f9; mov.f32 %f12, 0f3D3F841F; fma.rn.f32 %f13, %f11, %f8, %f12; mov.f32 %f14, 0f3D994929; fma.rn.f32 %f15, %f13, %f8, %f14; mov.f32 %f16, 0f3E2AAB94; fma.rn.f32 %f17, %f15, %f8, %f16; mul.f32 %f18, %f8, %f17; fma.rn.f32 %f19, %f18, %f7, %f7; add.f32 %f20, %f19, %f19; mov.f32 %f21, 0f3FC90FDB; sub.f32 %f22, %f21, %f19; selp.f32 %f23, %f20, %f22, %p6; setp.lt.f32 %p7, %f1, 0f00000000; mov.f32 %f24, 0f40490FDB; 
sub.f32 %f25, %f24, %f23; selp.f32 %f26, %f25, %f23, %p7; cvta.to.global.u64 %rd6, %rd1; add.s64 %rd7, %rd6, %rd4; st.global.f32 [%rd7], %f26; BB0_2: ret; } ` setTheta_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl setTheta .visible .entry setTheta( .param .u64 setTheta_param_0, .param .u64 setTheta_param_1, .param .u32 setTheta_param_2, .param .u32 setTheta_param_3, .param .u32 setTheta_param_4 ) { .reg .pred %p<8>; .reg .f32 %f<27>; .reg .b32 %r<18>; .reg .b64 %rd<8>; ld.param.u64 %rd1, [setTheta_param_0]; ld.param.u64 %rd2, [setTheta_param_1]; ld.param.u32 %r4, [setTheta_param_2]; ld.param.u32 %r5, [setTheta_param_3]; ld.param.u32 %r6, [setTheta_param_4]; mov.u32 %r7, %ctaid.x; mov.u32 %r8, %ntid.x; mov.u32 %r9, %tid.x; mad.lo.s32 %r1, %r8, %r7, %r9; mov.u32 %r10, %ntid.y; mov.u32 %r11, %ctaid.y; mov.u32 %r12, %tid.y; mad.lo.s32 %r2, %r10, %r11, %r12; mov.u32 %r13, %ntid.z; mov.u32 %r14, %ctaid.z; mov.u32 %r15, %tid.z; mad.lo.s32 %r3, %r13, %r14, %r15; setp.ge.s32 %p1, %r2, %r5; setp.ge.s32 %p2, %r1, %r4; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r6; or.pred %p5, %p3, %p4; @%p5 bra BB0_2; cvta.to.global.u64 %rd3, %rd2; mad.lo.s32 %r16, %r3, %r5, %r2; mad.lo.s32 %r17, %r16, %r4, %r1; mul.wide.s32 %rd4, %r17, 4; add.s64 %rd5, %rd3, %rd4; ld.global.nc.f32 %f1, [%rd5]; abs.f32 %f2, %f1; mov.f32 %f3, 0f3F800000; sub.f32 %f4, %f3, %f2; mul.f32 %f5, %f4, 0f3F000000; sqrt.rn.f32 %f6, %f5; setp.gt.f32 %p6, %f2, 0f3F11EB85; selp.f32 %f7, %f6, %f2, %p6; mul.f32 %f8, %f7, %f7; mov.f32 %f9, 0f3C94D2E9; mov.f32 %f10, 0f3D53F941; fma.rn.f32 %f11, %f10, %f8, %f9; mov.f32 %f12, 0f3D3F841F; fma.rn.f32 %f13, %f11, %f8, %f12; mov.f32 %f14, 0f3D994929; fma.rn.f32 %f15, %f13, %f8, %f14; mov.f32 %f16, 0f3E2AAB94; fma.rn.f32 %f17, %f15, %f8, %f16; mul.f32 %f18, %f8, %f17; fma.rn.f32 %f19, %f18, %f7, %f7; add.f32 %f20, %f19, %f19; mov.f32 %f21, 0f3FC90FDB; sub.f32 %f22, %f21, %f19; selp.f32 %f23, %f20, %f22, %p6; setp.lt.f32 %p7, %f1, 0f00000000; mov.f32 %f24, 
// Returns the magnetization vector stored at linear cell index i, or the
// zero vector when `inside` is false. A zero vector is what the stencil
// selection below interprets as "no neighbor at this position".
static __device__ inline float3
topocharge_load(const float* __restrict__ mx,
                const float* __restrict__ my,
                const float* __restrict__ mz,
                int i, bool inside)
{
    if (!inside) {
        return make_float3(0.0f, 0.0f, 0.0f);
    }
    return make_float3(mx[i], my[i], mz[i]);
}

// Finite-difference estimate of the derivative of m along one axis, in units
// of the cell size (the caller applies the 1/cellsize factors).
//
// m_m2, m_m1, m0, m_p1, m_p2 are the magnetizations at offsets -2..+2 along
// the axis. A neighbor that is exactly zero is treated as missing, and the
// highest-order stencil that only uses present neighbors is picked. In the
// pattern comments, '1' marks a present cell and '-' a missing one, listed
// from offset -2 to +2. The branch order matters: later branches rely on the
// earlier ones having been rejected.
static __device__ inline float3
topocharge_diff(float3 m_m2, float3 m_m1, float3 m0, float3 m_p1, float3 m_p2)
{
    if (is0(m_p1) && is0(m_m1)) {
        // --1-- : isolated cell, no derivative
        return make_float3(0.0f, 0.0f, 0.0f);
    }
    if ((is0(m_m2) || is0(m_p2)) && !is0(m_p1) && !is0(m_m1)) {
        // -111-, 1111-, -1111 : central difference, ε ~ h^2
        return 0.5f * (m_p1 - m_m1);
    }
    if (is0(m_p1) && is0(m_m2)) {
        // -11-- : backward difference, ε ~ h^1
        return m0 - m_m1;
    }
    if (is0(m_m1) && is0(m_p2)) {
        // --11- : forward difference, ε ~ h^1
        return -m0 + m_p1;
    }
    if (!is0(m_m2) && is0(m_p1)) {
        // 111-- : backward difference, ε ~ h^2
        return 0.5f * m_m2 - 2.0f * m_m1 + 1.5f * m0;
    }
    if (!is0(m_p2) && is0(m_m1)) {
        // --111 : forward difference, ε ~ h^2
        return -0.5f * m_p2 + 2.0f * m_p1 - 1.5f * m0;
    }
    // 11111 : central difference, ε ~ h^4
    return (2.0f/3.0f)*(m_p1 - m_m1) + (1.0f/12.0f)*(m_m2 - m_p2);
}

// Set s to the topological charge density
//     s = icxcy * m · (∂m/∂x ❌ ∂m/∂y)
// See topologicalcharge.go.
//
// s          : output, one value per cell
// mx, my, mz : components of the magnetization m
// icxcy      : prefactor applied to the result; the Go caller passes
//              1/(cellsize_x * cellsize_y)
// Nx, Ny, Nz : grid size in cells
// PBC        : periodic-boundary flags, consumed through the PBCx / PBCy and
//              clamp macros (presumably from stencil.h -- not visible here)
//
// Cells with m == 0 get s = 0. Neighbors that fall outside the grid along a
// non-periodic axis are not loaded and count as missing.
extern "C" __global__ void
settopologicalcharge(float* __restrict__ s,
                     float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz,
                     float icxcy, int Nx, int Ny, int Nz, uint8_t PBC)
{
    int ix = blockIdx.x * blockDim.x + threadIdx.x;
    int iy = blockIdx.y * blockDim.y + threadIdx.y;
    int iz = blockIdx.z * blockDim.z + threadIdx.z;

    if (ix >= Nx || iy >= Ny || iz >= Nz)
    {
        return;
    }

    int I = idx(ix, iy, iz);                      // central cell index
    float3 m0 = make_float3(mx[I], my[I], mz[I]); // +0

    if (is0(m0))
    {
        s[I] = 0.0f;
        return;
    }

    // x derivative (along length): neighbors at ix-2 .. ix+2.
    // The index is always clamped/wrapped into the grid; the boolean decides
    // whether the value is actually read or replaced by zero.
    float3 dmdx = topocharge_diff(
        topocharge_load(mx, my, mz, idx(lclampx(ix-2), iy, iz), ix-2 >= 0 || PBCx), // -2
        topocharge_load(mx, my, mz, idx(lclampx(ix-1), iy, iz), ix-1 >= 0 || PBCx), // -1
        m0,                                                                         // +0
        topocharge_load(mx, my, mz, idx(hclampx(ix+1), iy, iz), ix+1 < Nx || PBCx), // +1
        topocharge_load(mx, my, mz, idx(hclampx(ix+2), iy, iz), ix+2 < Nx || PBCx)  // +2
    );

    // y derivative (along height): neighbors at iy-2 .. iy+2.
    float3 dmdy = topocharge_diff(
        topocharge_load(mx, my, mz, idx(ix, lclampy(iy-2), iz), iy-2 >= 0 || PBCy), // -2
        topocharge_load(mx, my, mz, idx(ix, lclampy(iy-1), iz), iy-1 >= 0 || PBCy), // -1
        m0,                                                                         // +0
        topocharge_load(mx, my, mz, idx(ix, hclampy(iy+1), iz), iy+1 < Ny || PBCy), // +1
        topocharge_load(mx, my, mz, idx(ix, hclampy(iy+2), iz), iy+2 < Ny || PBCy)  // +2
    );

    float3 dmdx_x_dmdy = cross(dmdx, dmdy); // ∂m/∂x ❌ ∂m/∂y

    s[I] = icxcy * dot(m0, dmdx_x_dmdy);
}
settopologicalcharge_args.argptr[0] = unsafe.Pointer(&settopologicalcharge_args.arg_s) settopologicalcharge_args.argptr[1] = unsafe.Pointer(&settopologicalcharge_args.arg_mx) settopologicalcharge_args.argptr[2] = unsafe.Pointer(&settopologicalcharge_args.arg_my) settopologicalcharge_args.argptr[3] = unsafe.Pointer(&settopologicalcharge_args.arg_mz) settopologicalcharge_args.argptr[4] = unsafe.Pointer(&settopologicalcharge_args.arg_icxcy) settopologicalcharge_args.argptr[5] = unsafe.Pointer(&settopologicalcharge_args.arg_Nx) settopologicalcharge_args.argptr[6] = unsafe.Pointer(&settopologicalcharge_args.arg_Ny) settopologicalcharge_args.argptr[7] = unsafe.Pointer(&settopologicalcharge_args.arg_Nz) settopologicalcharge_args.argptr[8] = unsafe.Pointer(&settopologicalcharge_args.arg_PBC) } // Wrapper for settopologicalcharge CUDA kernel, asynchronous. func k_settopologicalcharge_async(s unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, icxcy float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) { if Synchronous { // debug Sync() timer.Start("settopologicalcharge") } settopologicalcharge_args.Lock() defer settopologicalcharge_args.Unlock() if settopologicalcharge_code == 0 { settopologicalcharge_code = fatbinLoad(settopologicalcharge_map, "settopologicalcharge") } settopologicalcharge_args.arg_s = s settopologicalcharge_args.arg_mx = mx settopologicalcharge_args.arg_my = my settopologicalcharge_args.arg_mz = mz settopologicalcharge_args.arg_icxcy = icxcy settopologicalcharge_args.arg_Nx = Nx settopologicalcharge_args.arg_Ny = Ny settopologicalcharge_args.arg_Nz = Nz settopologicalcharge_args.arg_PBC = PBC args := settopologicalcharge_args.argptr[:] cu.LaunchKernel(settopologicalcharge_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("settopologicalcharge") } } // maps compute capability on PTX code for settopologicalcharge kernel. 
var settopologicalcharge_map = map[int]string{0: "", 30: settopologicalcharge_ptx_30, 32: settopologicalcharge_ptx_32, 35: settopologicalcharge_ptx_35, 37: settopologicalcharge_ptx_37, 50: settopologicalcharge_ptx_50, 52: settopologicalcharge_ptx_52, 53: settopologicalcharge_ptx_53, 60: settopologicalcharge_ptx_60, 61: settopologicalcharge_ptx_61, 62: settopologicalcharge_ptx_62, 70: settopologicalcharge_ptx_70, 72: settopologicalcharge_ptx_72, 75: settopologicalcharge_ptx_75} // settopologicalcharge PTX code for various compute capabilities. const ( settopologicalcharge_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<77>; .reg .b16 %rs<11>; .reg .f32 %f<265>; .reg .b32 %r<158>; .reg .b64 %rd<61>; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f102, [settopologicalcharge_param_4]; ld.param.u32 %r38, [settopologicalcharge_param_5]; ld.param.u32 %r39, [settopologicalcharge_param_6]; ld.param.u32 %r40, [settopologicalcharge_param_7]; ld.param.u8 %rs3, [settopologicalcharge_param_8]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r41, %ntid.x; mov.u32 %r42, %ctaid.x; mov.u32 %r43, %tid.x; mad.lo.s32 %r1, %r41, %r42, %r43; mov.u32 %r44, %ntid.y; mov.u32 %r45, %ctaid.y; mov.u32 %r46, %tid.y; mad.lo.s32 %r2, %r44, %r45, %r46; mov.u32 %r47, %ntid.z; mov.u32 %r48, %ctaid.z; mov.u32 %r49, 
%tid.z; mad.lo.s32 %r3, %r47, %r48, %r49; setp.ge.s32 %p3, %r2, %r39; setp.ge.s32 %p4, %r1, %r38; or.pred %p5, %p3, %p4; setp.ge.s32 %p6, %r3, %r40; or.pred %p7, %p5, %p6; @%p7 bra BB0_72; cvta.to.global.u64 %rd9, %rd5; mad.lo.s32 %r50, %r3, %r39, %r2; mul.lo.s32 %r4, %r50, %r38; add.s32 %r51, %r4, %r1; mul.wide.s32 %rd10, %r51, 4; add.s64 %rd11, %rd3, %rd10; add.s64 %rd12, %rd2, %rd10; add.s64 %rd13, %rd1, %rd10; ld.global.f32 %f1, [%rd11]; ld.global.f32 %f2, [%rd12]; mul.f32 %f103, %f2, %f2; fma.rn.f32 %f104, %f1, %f1, %f103; ld.global.f32 %f3, [%rd13]; fma.rn.f32 %f105, %f3, %f3, %f104; setp.eq.f32 %p8, %f105, 0f00000000; add.s64 %rd4, %rd9, %rd10; @%p8 bra BB0_71; bra.uni BB0_2; BB0_71: mov.u32 %r149, 0; st.global.u32 [%rd4], %r149; bra.uni BB0_72; BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r5, %r1, -2; @%p9 bra BB0_4; rem.s32 %r52, %r5, %r38; add.s32 %r53, %r52, %r38; rem.s32 %r150, %r53, %r38; bra.uni BB0_5; BB0_4: mov.u32 %r54, 0; max.s32 %r150, %r5, %r54; BB0_5: setp.lt.s32 %p11, %r5, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p12 bra BB0_7; add.s32 %r55, %r150, %r4; mul.wide.s32 %rd14, %r55, 4; add.s64 %rd15, %rd3, %rd14; ld.global.f32 %f7, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.f32 %f8, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.f32 %f9, [%rd17]; BB0_7: add.s32 %r9, %r1, -1; @%p9 bra BB0_9; rem.s32 %r56, %r9, %r38; add.s32 %r57, %r56, %r38; rem.s32 %r151, %r57, %r38; bra.uni BB0_10; BB0_9: mov.u32 %r58, 0; max.s32 %r151, %r9, %r58; BB0_10: setp.lt.s32 %p14, %r9, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_12; add.s32 %r59, %r151, %r4; mul.wide.s32 %rd18, %r59, 4; add.s64 %rd19, %rd3, %rd18; ld.global.f32 %f13, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.f32 %f14, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.f32 %f15, [%rd21]; BB0_12: add.s32 %r13, %r1, 1; @%p9 bra BB0_14; rem.s32 %r60, %r13, %r38; add.s32 
%r61, %r60, %r38; rem.s32 %r152, %r61, %r38; bra.uni BB0_15; BB0_14: add.s32 %r62, %r38, -1; min.s32 %r152, %r13, %r62; BB0_15: setp.ge.s32 %p18, %r13, %r38; mov.f32 %f19, 0f00000000; and.pred %p20, %p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_17; add.s32 %r63, %r152, %r4; mul.wide.s32 %rd22, %r63, 4; add.s64 %rd23, %rd3, %rd22; ld.global.f32 %f19, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.f32 %f20, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.f32 %f21, [%rd25]; BB0_17: add.s32 %r17, %r1, 2; @%p9 bra BB0_19; rem.s32 %r64, %r17, %r38; add.s32 %r65, %r64, %r38; rem.s32 %r153, %r65, %r38; bra.uni BB0_20; BB0_19: add.s32 %r66, %r38, -1; min.s32 %r153, %r17, %r66; BB0_20: add.s32 %r21, %r153, %r4; setp.ge.s32 %p22, %r17, %r38; mov.f32 %f25, 0f00000000; and.pred %p24, %p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_22; mul.wide.s32 %rd26, %r21, 4; add.s64 %rd27, %rd3, %rd26; ld.global.f32 %f25, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.f32 %f26, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.f32 %f27, [%rd29]; BB0_22: mul.f32 %f118, %f20, %f20; fma.rn.f32 %f119, %f19, %f19, %f118; fma.rn.f32 %f28, %f21, %f21, %f119; setp.neu.f32 %p25, %f28, 0f00000000; @%p25 bra BB0_24; mul.f32 %f123, %f14, %f14; fma.rn.f32 %f124, %f13, %f13, %f123; fma.rn.f32 %f125, %f15, %f15, %f124; setp.eq.f32 %p26, %f125, 0f00000000; mov.f32 %f247, 0f00000000; mov.f32 %f248, %f247; mov.f32 %f249, %f247; @%p26 bra BB0_36; BB0_24: mul.f32 %f126, %f8, %f8; fma.rn.f32 %f127, %f7, %f7, %f126; fma.rn.f32 %f29, %f9, %f9, %f127; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f128, %f26, %f26; fma.rn.f32 %f129, %f25, %f25, %f128; fma.rn.f32 %f30, %f27, %f27, %f129; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_26; mul.f32 %f130, %f14, %f14; fma.rn.f32 %f131, %f13, %f13, %f130; fma.rn.f32 %f132, %f15, %f15, %f131; setp.neu.f32 %p32, %f132, 0f00000000; @%p32 bra BB0_35; 
bra.uni BB0_26; BB0_35: sub.f32 %f161, %f19, %f13; mul.f32 %f247, %f161, 0f3F000000; sub.f32 %f162, %f20, %f14; mul.f32 %f248, %f162, 0f3F000000; sub.f32 %f163, %f21, %f15; mul.f32 %f249, %f163, 0f3F000000; bra.uni BB0_36; BB0_26: or.pred %p34, %p25, %p27; @%p34 bra BB0_28; bra.uni BB0_27; BB0_28: mul.f32 %f133, %f14, %f14; fma.rn.f32 %f134, %f13, %f13, %f133; fma.rn.f32 %f34, %f15, %f15, %f134; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, %p35, %p28; @%p37 bra BB0_30; bra.uni BB0_29; BB0_30: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_32; bra.uni BB0_31; BB0_32: setp.eq.f32 %p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_34; bra.uni BB0_33; BB0_34: sub.f32 %f152, %f19, %f13; sub.f32 %f153, %f20, %f14; sub.f32 %f154, %f21, %f15; sub.f32 %f155, %f7, %f25; mul.f32 %f156, %f155, 0f3DAAAAAB; sub.f32 %f157, %f8, %f26; mul.f32 %f158, %f157, 0f3DAAAAAB; sub.f32 %f159, %f9, %f27; mul.f32 %f160, %f159, 0f3DAAAAAB; fma.rn.f32 %f247, %f152, 0f3F2AAAAB, %f156; fma.rn.f32 %f248, %f153, 0f3F2AAAAB, %f158; fma.rn.f32 %f249, %f154, 0f3F2AAAAB, %f160; bra.uni BB0_36; BB0_27: sub.f32 %f247, %f1, %f13; sub.f32 %f248, %f2, %f14; sub.f32 %f249, %f3, %f15; bra.uni BB0_36; BB0_29: sub.f32 %f247, %f19, %f1; sub.f32 %f248, %f20, %f2; sub.f32 %f249, %f21, %f3; bra.uni BB0_36; BB0_31: mul.f32 %f135, %f13, 0fC0000000; fma.rn.f32 %f136, %f7, 0f3F000000, %f135; add.f32 %f137, %f14, %f14; mul.f32 %f138, %f8, 0f3F000000; sub.f32 %f139, %f138, %f137; add.f32 %f140, %f15, %f15; mul.f32 %f141, %f9, 0f3F000000; sub.f32 %f142, %f141, %f140; fma.rn.f32 %f247, %f1, 0f3FC00000, %f136; fma.rn.f32 %f248, %f2, 0f3FC00000, %f139; fma.rn.f32 %f249, %f3, 0f3FC00000, %f142; bra.uni BB0_36; BB0_33: mul.f32 %f143, %f25, 0fBF000000; fma.rn.f32 %f144, %f19, 0f40000000, %f143; mul.f32 %f145, %f26, 0fBF000000; fma.rn.f32 %f146, %f20, 0f40000000, %f145; mul.f32 %f147, %f27, 0fBF000000; fma.rn.f32 %f148, %f21, 0f40000000, %f147; mul.f32 %f149, %f1, 0f3FC00000; 
sub.f32 %f247, %f144, %f149; mul.f32 %f150, %f2, 0f3FC00000; sub.f32 %f248, %f146, %f150; mul.f32 %f151, %f3, 0f3FC00000; sub.f32 %f249, %f148, %f151; BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r22, %r2, -2; @%p43 bra BB0_38; rem.s32 %r71, %r22, %r39; add.s32 %r72, %r71, %r39; rem.s32 %r154, %r72, %r39; bra.uni BB0_39; BB0_38: mov.u32 %r73, 0; max.s32 %r154, %r22, %r73; BB0_39: setp.lt.s32 %p45, %r22, 0; mov.f32 %f56, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_41; mad.lo.s32 %r82, %r3, %r39, %r154; mad.lo.s32 %r83, %r82, %r38, %r1; mul.wide.s32 %rd31, %r83, 4; add.s64 %rd32, %rd3, %rd31; ld.global.f32 %f56, [%rd32]; add.s64 %rd34, %rd2, %rd31; ld.global.f32 %f57, [%rd34]; add.s64 %rd36, %rd1, %rd31; ld.global.f32 %f58, [%rd36]; BB0_41: add.s32 %r26, %r2, -1; @%p43 bra BB0_43; rem.s32 %r88, %r26, %r39; add.s32 %r89, %r88, %r39; rem.s32 %r155, %r89, %r39; bra.uni BB0_44; BB0_43: mov.u32 %r90, 0; max.s32 %r155, %r26, %r90; BB0_44: setp.lt.s32 %p48, %r26, 0; mov.f32 %f62, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_46; mad.lo.s32 %r99, %r3, %r39, %r155; mad.lo.s32 %r100, %r99, %r38, %r1; mul.wide.s32 %rd38, %r100, 4; add.s64 %rd39, %rd3, %rd38; ld.global.f32 %f62, [%rd39]; add.s64 %rd41, %rd2, %rd38; ld.global.f32 %f63, [%rd41]; add.s64 %rd43, %rd1, %rd38; ld.global.f32 %f64, [%rd43]; BB0_46: add.s32 %r30, %r2, 1; @%p43 bra BB0_48; rem.s32 %r105, %r30, %r39; add.s32 %r106, %r105, %r39; rem.s32 %r156, %r106, %r39; bra.uni BB0_49; BB0_48: add.s32 %r107, %r39, -1; min.s32 %r156, %r30, %r107; BB0_49: setp.ge.s32 %p52, %r30, %r39; mov.f32 %f68, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; @%p54 bra BB0_51; mad.lo.s32 %r116, %r3, %r39, %r156; mad.lo.s32 %r117, %r116, %r38, %r1; mul.wide.s32 %rd45, %r117, 4; add.s64 %rd46, %rd3, %rd45; ld.global.f32 %f68, [%rd46]; add.s64 %rd48, %rd2, %rd45; ld.global.f32 %f69, [%rd48]; 
add.s64 %rd50, %rd1, %rd45; ld.global.f32 %f70, [%rd50]; BB0_51: add.s32 %r34, %r2, 2; @%p43 bra BB0_53; rem.s32 %r122, %r34, %r39; add.s32 %r123, %r122, %r39; rem.s32 %r157, %r123, %r39; bra.uni BB0_54; BB0_53: add.s32 %r124, %r39, -1; min.s32 %r157, %r34, %r124; BB0_54: setp.ge.s32 %p56, %r34, %r39; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p58 bra BB0_56; mad.lo.s32 %r133, %r3, %r39, %r157; mad.lo.s32 %r134, %r133, %r38, %r1; mul.wide.s32 %rd52, %r134, 4; add.s64 %rd53, %rd3, %rd52; ld.global.f32 %f74, [%rd53]; add.s64 %rd55, %rd2, %rd52; ld.global.f32 %f75, [%rd55]; add.s64 %rd57, %rd1, %rd52; ld.global.f32 %f76, [%rd57]; BB0_56: mul.f32 %f176, %f69, %f69; fma.rn.f32 %f177, %f68, %f68, %f176; fma.rn.f32 %f77, %f70, %f70, %f177; setp.neu.f32 %p59, %f77, 0f00000000; @%p59 bra BB0_58; mul.f32 %f181, %f63, %f63; fma.rn.f32 %f182, %f62, %f62, %f181; fma.rn.f32 %f183, %f64, %f64, %f182; setp.eq.f32 %p60, %f183, 0f00000000; mov.f32 %f262, 0f00000000; mov.f32 %f263, %f262; mov.f32 %f264, %f262; @%p60 bra BB0_70; BB0_58: mul.f32 %f184, %f57, %f57; fma.rn.f32 %f185, %f56, %f56, %f184; fma.rn.f32 %f78, %f58, %f58, %f185; setp.neu.f32 %p61, %f78, 0f00000000; mul.f32 %f186, %f75, %f75; fma.rn.f32 %f187, %f74, %f74, %f186; fma.rn.f32 %f79, %f76, %f76, %f187; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, %f77, 0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_60; mul.f32 %f188, %f63, %f63; fma.rn.f32 %f189, %f62, %f62, %f188; fma.rn.f32 %f190, %f64, %f64, %f189; setp.neu.f32 %p66, %f190, 0f00000000; @%p66 bra BB0_69; bra.uni BB0_60; BB0_69: sub.f32 %f219, %f68, %f62; mul.f32 %f262, %f219, 0f3F000000; sub.f32 %f220, %f69, %f63; mul.f32 %f263, %f220, 0f3F000000; sub.f32 %f221, %f70, %f64; mul.f32 %f264, %f221, 0f3F000000; bra.uni BB0_70; BB0_60: or.pred %p68, %p59, %p61; @%p68 bra BB0_62; bra.uni BB0_61; BB0_62: mul.f32 %f191, %f63, %f63; fma.rn.f32 %f192, %f62, %f62, %f191; fma.rn.f32 
%f83, %f64, %f64, %f192; setp.neu.f32 %p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_64; bra.uni BB0_63; BB0_64: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_66; bra.uni BB0_65; BB0_66: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_68; bra.uni BB0_67; BB0_68: sub.f32 %f210, %f68, %f62; sub.f32 %f211, %f69, %f63; sub.f32 %f212, %f70, %f64; sub.f32 %f213, %f56, %f74; mul.f32 %f214, %f213, 0f3DAAAAAB; sub.f32 %f215, %f57, %f75; mul.f32 %f216, %f215, 0f3DAAAAAB; sub.f32 %f217, %f58, %f76; mul.f32 %f218, %f217, 0f3DAAAAAB; fma.rn.f32 %f262, %f210, 0f3F2AAAAB, %f214; fma.rn.f32 %f263, %f211, 0f3F2AAAAB, %f216; fma.rn.f32 %f264, %f212, 0f3F2AAAAB, %f218; bra.uni BB0_70; BB0_61: sub.f32 %f262, %f1, %f62; sub.f32 %f263, %f2, %f63; sub.f32 %f264, %f3, %f64; bra.uni BB0_70; BB0_63: sub.f32 %f262, %f68, %f1; sub.f32 %f263, %f69, %f2; sub.f32 %f264, %f70, %f3; bra.uni BB0_70; BB0_65: mul.f32 %f193, %f62, 0fC0000000; fma.rn.f32 %f194, %f56, 0f3F000000, %f193; add.f32 %f195, %f63, %f63; mul.f32 %f196, %f57, 0f3F000000; sub.f32 %f197, %f196, %f195; add.f32 %f198, %f64, %f64; mul.f32 %f199, %f58, 0f3F000000; sub.f32 %f200, %f199, %f198; fma.rn.f32 %f262, %f1, 0f3FC00000, %f194; fma.rn.f32 %f263, %f2, 0f3FC00000, %f197; fma.rn.f32 %f264, %f3, 0f3FC00000, %f200; bra.uni BB0_70; BB0_67: mul.f32 %f201, %f74, 0fBF000000; fma.rn.f32 %f202, %f68, 0f40000000, %f201; mul.f32 %f203, %f75, 0fBF000000; fma.rn.f32 %f204, %f69, 0f40000000, %f203; mul.f32 %f205, %f76, 0fBF000000; fma.rn.f32 %f206, %f70, 0f40000000, %f205; mul.f32 %f207, %f1, 0f3FC00000; sub.f32 %f262, %f202, %f207; mul.f32 %f208, %f2, 0f3FC00000; sub.f32 %f263, %f204, %f208; mul.f32 %f209, %f3, 0f3FC00000; sub.f32 %f264, %f206, %f209; BB0_70: mul.f32 %f222, %f249, %f263; mul.f32 %f223, %f248, %f264; sub.f32 %f224, %f223, %f222; mul.f32 %f225, %f247, %f264; mul.f32 %f226, %f249, %f262; sub.f32 %f227, %f226, %f225; mul.f32 %f228, %f248, %f262; mul.f32 
%f229, %f247, %f263; sub.f32 %f230, %f229, %f228; mul.f32 %f231, %f2, %f227; fma.rn.f32 %f232, %f1, %f224, %f231; fma.rn.f32 %f233, %f3, %f230, %f232; mul.f32 %f234, %f233, %f102; mad.lo.s32 %r148, %r50, %r38, %r1; mul.wide.s32 %rd59, %r148, 4; add.s64 %rd60, %rd9, %rd59; st.global.f32 [%rd60], %f234; BB0_72: ret; } ` settopologicalcharge_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<77>; .reg .b16 %rs<11>; .reg .f32 %f<265>; .reg .b32 %r<97>; .reg .b64 %rd<46>; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f102, [settopologicalcharge_param_4]; ld.param.u32 %r40, [settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; ld.param.u32 %r42, [settopologicalcharge_param_7]; ld.param.u8 %rs3, [settopologicalcharge_param_8]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, %ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r43, %r44, %r45; mov.u32 %r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r46, %r47, %r48; mov.u32 %r49, %ntid.z; mov.u32 %r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r49, %r50, %r51; setp.ge.s32 %p3, %r2, %r41; setp.ge.s32 %p4, %r1, %r40; or.pred %p5, %p3, %p4; setp.ge.s32 %p6, %r3, %r42; or.pred %p7, %p5, %p6; @%p7 bra BB0_72; cvta.to.global.u64 %rd9, %rd5; mul.lo.s32 %r4, %r3, 
%r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 %r53, %r5, %r1; mul.wide.s32 %rd10, %r53, 4; add.s64 %rd11, %rd3, %rd10; add.s64 %rd12, %rd2, %rd10; add.s64 %rd13, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; ld.global.nc.f32 %f2, [%rd12]; mul.f32 %f103, %f2, %f2; fma.rn.f32 %f104, %f1, %f1, %f103; ld.global.nc.f32 %f3, [%rd13]; fma.rn.f32 %f105, %f3, %f3, %f104; setp.eq.f32 %p8, %f105, 0f00000000; add.s64 %rd4, %rd9, %rd10; @%p8 bra BB0_71; bra.uni BB0_2; BB0_71: mov.u32 %r88, 0; st.global.u32 [%rd4], %r88; bra.uni BB0_72; BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r6, %r1, -2; @%p9 bra BB0_4; rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r89, %r55, %r40; bra.uni BB0_5; BB0_4: mov.u32 %r56, 0; max.s32 %r89, %r6, %r56; BB0_5: setp.lt.s32 %p11, %r6, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p12 bra BB0_7; add.s32 %r57, %r89, %r5; mul.wide.s32 %rd14, %r57, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f7, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f8, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f9, [%rd17]; BB0_7: add.s32 %r10, %r1, -1; @%p9 bra BB0_9; rem.s32 %r58, %r10, %r40; add.s32 %r59, %r58, %r40; rem.s32 %r90, %r59, %r40; bra.uni BB0_10; BB0_9: mov.u32 %r60, 0; max.s32 %r90, %r10, %r60; BB0_10: setp.lt.s32 %p14, %r10, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_12; add.s32 %r61, %r90, %r5; mul.wide.s32 %rd18, %r61, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f14, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f15, [%rd21]; BB0_12: add.s32 %r14, %r1, 1; @%p9 bra BB0_14; rem.s32 %r62, %r14, %r40; add.s32 %r63, %r62, %r40; rem.s32 %r91, %r63, %r40; bra.uni BB0_15; BB0_14: add.s32 %r64, %r40, -1; min.s32 %r91, %r14, %r64; BB0_15: setp.ge.s32 %p18, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p20, 
%p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_17; add.s32 %r65, %r91, %r5; mul.wide.s32 %rd22, %r65, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f19, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f20, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; BB0_17: add.s32 %r18, %r1, 2; @%p9 bra BB0_19; rem.s32 %r66, %r18, %r40; add.s32 %r67, %r66, %r40; rem.s32 %r92, %r67, %r40; bra.uni BB0_20; BB0_19: add.s32 %r68, %r40, -1; min.s32 %r92, %r18, %r68; BB0_20: add.s32 %r22, %r92, %r5; setp.ge.s32 %p22, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p24, %p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f25, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f26, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; BB0_22: mul.f32 %f118, %f20, %f20; fma.rn.f32 %f119, %f19, %f19, %f118; fma.rn.f32 %f28, %f21, %f21, %f119; setp.neu.f32 %p25, %f28, 0f00000000; @%p25 bra BB0_24; mul.f32 %f123, %f14, %f14; fma.rn.f32 %f124, %f13, %f13, %f123; fma.rn.f32 %f125, %f15, %f15, %f124; setp.eq.f32 %p26, %f125, 0f00000000; mov.f32 %f247, 0f00000000; mov.f32 %f248, %f247; mov.f32 %f249, %f247; @%p26 bra BB0_36; BB0_24: mul.f32 %f126, %f8, %f8; fma.rn.f32 %f127, %f7, %f7, %f126; fma.rn.f32 %f29, %f9, %f9, %f127; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f128, %f26, %f26; fma.rn.f32 %f129, %f25, %f25, %f128; fma.rn.f32 %f30, %f27, %f27, %f129; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_26; mul.f32 %f130, %f14, %f14; fma.rn.f32 %f131, %f13, %f13, %f130; fma.rn.f32 %f132, %f15, %f15, %f131; setp.neu.f32 %p32, %f132, 0f00000000; @%p32 bra BB0_35; bra.uni BB0_26; BB0_35: sub.f32 %f161, %f19, %f13; mul.f32 %f247, %f161, 0f3F000000; sub.f32 %f162, %f20, %f14; mul.f32 %f248, %f162, 0f3F000000; sub.f32 %f163, %f21, %f15; mul.f32 
%f249, %f163, 0f3F000000; bra.uni BB0_36; BB0_26: or.pred %p34, %p25, %p27; @%p34 bra BB0_28; bra.uni BB0_27; BB0_28: mul.f32 %f133, %f14, %f14; fma.rn.f32 %f134, %f13, %f13, %f133; fma.rn.f32 %f34, %f15, %f15, %f134; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, %p35, %p28; @%p37 bra BB0_30; bra.uni BB0_29; BB0_30: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_32; bra.uni BB0_31; BB0_32: setp.eq.f32 %p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_34; bra.uni BB0_33; BB0_34: sub.f32 %f152, %f19, %f13; sub.f32 %f153, %f20, %f14; sub.f32 %f154, %f21, %f15; sub.f32 %f155, %f7, %f25; mul.f32 %f156, %f155, 0f3DAAAAAB; sub.f32 %f157, %f8, %f26; mul.f32 %f158, %f157, 0f3DAAAAAB; sub.f32 %f159, %f9, %f27; mul.f32 %f160, %f159, 0f3DAAAAAB; fma.rn.f32 %f247, %f152, 0f3F2AAAAB, %f156; fma.rn.f32 %f248, %f153, 0f3F2AAAAB, %f158; fma.rn.f32 %f249, %f154, 0f3F2AAAAB, %f160; bra.uni BB0_36; BB0_27: sub.f32 %f247, %f1, %f13; sub.f32 %f248, %f2, %f14; sub.f32 %f249, %f3, %f15; bra.uni BB0_36; BB0_29: sub.f32 %f247, %f19, %f1; sub.f32 %f248, %f20, %f2; sub.f32 %f249, %f21, %f3; bra.uni BB0_36; BB0_31: mul.f32 %f135, %f13, 0fC0000000; fma.rn.f32 %f136, %f7, 0f3F000000, %f135; add.f32 %f137, %f14, %f14; mul.f32 %f138, %f8, 0f3F000000; sub.f32 %f139, %f138, %f137; add.f32 %f140, %f15, %f15; mul.f32 %f141, %f9, 0f3F000000; sub.f32 %f142, %f141, %f140; fma.rn.f32 %f247, %f1, 0f3FC00000, %f136; fma.rn.f32 %f248, %f2, 0f3FC00000, %f139; fma.rn.f32 %f249, %f3, 0f3FC00000, %f142; bra.uni BB0_36; BB0_33: mul.f32 %f143, %f25, 0fBF000000; fma.rn.f32 %f144, %f19, 0f40000000, %f143; mul.f32 %f145, %f26, 0fBF000000; fma.rn.f32 %f146, %f20, 0f40000000, %f145; mul.f32 %f147, %f27, 0fBF000000; fma.rn.f32 %f148, %f21, 0f40000000, %f147; mul.f32 %f149, %f1, 0f3FC00000; sub.f32 %f247, %f144, %f149; mul.f32 %f150, %f2, 0f3FC00000; sub.f32 %f248, %f146, %f150; mul.f32 %f151, %f3, 0f3FC00000; sub.f32 %f249, %f148, %f151; BB0_36: and.b16 %rs2, %rs3, 2; 
setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra BB0_38; rem.s32 %r69, %r23, %r41; add.s32 %r70, %r69, %r41; rem.s32 %r93, %r70, %r41; bra.uni BB0_39; BB0_38: mov.u32 %r71, 0; max.s32 %r93, %r23, %r71; BB0_39: setp.lt.s32 %p45, %r23, 0; mov.f32 %f56, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_41; add.s32 %r72, %r93, %r4; mad.lo.s32 %r73, %r72, %r40, %r1; mul.wide.s32 %rd30, %r73, 4; add.s64 %rd31, %rd3, %rd30; ld.global.nc.f32 %f56, [%rd31]; add.s64 %rd32, %rd2, %rd30; ld.global.nc.f32 %f57, [%rd32]; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f58, [%rd33]; BB0_41: add.s32 %r27, %r2, -1; @%p43 bra BB0_43; rem.s32 %r74, %r27, %r41; add.s32 %r75, %r74, %r41; rem.s32 %r94, %r75, %r41; bra.uni BB0_44; BB0_43: mov.u32 %r76, 0; max.s32 %r94, %r27, %r76; BB0_44: setp.lt.s32 %p48, %r27, 0; mov.f32 %f62, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_46; add.s32 %r77, %r94, %r4; mad.lo.s32 %r78, %r77, %r40, %r1; mul.wide.s32 %rd34, %r78, 4; add.s64 %rd35, %rd3, %rd34; ld.global.nc.f32 %f62, [%rd35]; add.s64 %rd36, %rd2, %rd34; ld.global.nc.f32 %f63, [%rd36]; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f64, [%rd37]; BB0_46: add.s32 %r31, %r2, 1; @%p43 bra BB0_48; rem.s32 %r79, %r31, %r41; add.s32 %r80, %r79, %r41; rem.s32 %r95, %r80, %r41; bra.uni BB0_49; BB0_48: add.s32 %r81, %r41, -1; min.s32 %r95, %r31, %r81; BB0_49: setp.ge.s32 %p52, %r31, %r41; mov.f32 %f68, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; @%p54 bra BB0_51; add.s32 %r82, %r95, %r4; mad.lo.s32 %r83, %r82, %r40, %r1; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f68, [%rd39]; add.s64 %rd40, %rd2, %rd38; ld.global.nc.f32 %f69, [%rd40]; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f70, [%rd41]; BB0_51: add.s32 %r35, %r2, 2; @%p43 bra BB0_53; rem.s32 %r84, %r35, %r41; add.s32 %r85, %r84, %r41; rem.s32 %r96, %r85, %r41; bra.uni BB0_54; 
BB0_53: add.s32 %r86, %r41, -1; min.s32 %r96, %r35, %r86; BB0_54: add.s32 %r87, %r96, %r4; mad.lo.s32 %r39, %r87, %r40, %r1; setp.ge.s32 %p56, %r35, %r41; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p58 bra BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, %rd3, %rd42; ld.global.nc.f32 %f74, [%rd43]; add.s64 %rd44, %rd2, %rd42; ld.global.nc.f32 %f75, [%rd44]; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f76, [%rd45]; BB0_56: mul.f32 %f176, %f69, %f69; fma.rn.f32 %f177, %f68, %f68, %f176; fma.rn.f32 %f77, %f70, %f70, %f177; setp.neu.f32 %p59, %f77, 0f00000000; @%p59 bra BB0_58; mul.f32 %f181, %f63, %f63; fma.rn.f32 %f182, %f62, %f62, %f181; fma.rn.f32 %f183, %f64, %f64, %f182; setp.eq.f32 %p60, %f183, 0f00000000; mov.f32 %f262, 0f00000000; mov.f32 %f263, %f262; mov.f32 %f264, %f262; @%p60 bra BB0_70; BB0_58: mul.f32 %f184, %f57, %f57; fma.rn.f32 %f185, %f56, %f56, %f184; fma.rn.f32 %f78, %f58, %f58, %f185; setp.neu.f32 %p61, %f78, 0f00000000; mul.f32 %f186, %f75, %f75; fma.rn.f32 %f187, %f74, %f74, %f186; fma.rn.f32 %f79, %f76, %f76, %f187; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, %f77, 0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_60; mul.f32 %f188, %f63, %f63; fma.rn.f32 %f189, %f62, %f62, %f188; fma.rn.f32 %f190, %f64, %f64, %f189; setp.neu.f32 %p66, %f190, 0f00000000; @%p66 bra BB0_69; bra.uni BB0_60; BB0_69: sub.f32 %f219, %f68, %f62; mul.f32 %f262, %f219, 0f3F000000; sub.f32 %f220, %f69, %f63; mul.f32 %f263, %f220, 0f3F000000; sub.f32 %f221, %f70, %f64; mul.f32 %f264, %f221, 0f3F000000; bra.uni BB0_70; BB0_60: or.pred %p68, %p59, %p61; @%p68 bra BB0_62; bra.uni BB0_61; BB0_62: mul.f32 %f191, %f63, %f63; fma.rn.f32 %f192, %f62, %f62, %f191; fma.rn.f32 %f83, %f64, %f64, %f192; setp.neu.f32 %p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_64; bra.uni BB0_63; BB0_64: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_66; 
bra.uni BB0_65; BB0_66: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_68; bra.uni BB0_67; BB0_68: sub.f32 %f210, %f68, %f62; sub.f32 %f211, %f69, %f63; sub.f32 %f212, %f70, %f64; sub.f32 %f213, %f56, %f74; mul.f32 %f214, %f213, 0f3DAAAAAB; sub.f32 %f215, %f57, %f75; mul.f32 %f216, %f215, 0f3DAAAAAB; sub.f32 %f217, %f58, %f76; mul.f32 %f218, %f217, 0f3DAAAAAB; fma.rn.f32 %f262, %f210, 0f3F2AAAAB, %f214; fma.rn.f32 %f263, %f211, 0f3F2AAAAB, %f216; fma.rn.f32 %f264, %f212, 0f3F2AAAAB, %f218; bra.uni BB0_70; BB0_61: sub.f32 %f262, %f1, %f62; sub.f32 %f263, %f2, %f63; sub.f32 %f264, %f3, %f64; bra.uni BB0_70; BB0_63: sub.f32 %f262, %f68, %f1; sub.f32 %f263, %f69, %f2; sub.f32 %f264, %f70, %f3; bra.uni BB0_70; BB0_65: mul.f32 %f193, %f62, 0fC0000000; fma.rn.f32 %f194, %f56, 0f3F000000, %f193; add.f32 %f195, %f63, %f63; mul.f32 %f196, %f57, 0f3F000000; sub.f32 %f197, %f196, %f195; add.f32 %f198, %f64, %f64; mul.f32 %f199, %f58, 0f3F000000; sub.f32 %f200, %f199, %f198; fma.rn.f32 %f262, %f1, 0f3FC00000, %f194; fma.rn.f32 %f263, %f2, 0f3FC00000, %f197; fma.rn.f32 %f264, %f3, 0f3FC00000, %f200; bra.uni BB0_70; BB0_67: mul.f32 %f201, %f74, 0fBF000000; fma.rn.f32 %f202, %f68, 0f40000000, %f201; mul.f32 %f203, %f75, 0fBF000000; fma.rn.f32 %f204, %f69, 0f40000000, %f203; mul.f32 %f205, %f76, 0fBF000000; fma.rn.f32 %f206, %f70, 0f40000000, %f205; mul.f32 %f207, %f1, 0f3FC00000; sub.f32 %f262, %f202, %f207; mul.f32 %f208, %f2, 0f3FC00000; sub.f32 %f263, %f204, %f208; mul.f32 %f209, %f3, 0f3FC00000; sub.f32 %f264, %f206, %f209; BB0_70: mul.f32 %f222, %f249, %f263; mul.f32 %f223, %f248, %f264; sub.f32 %f224, %f223, %f222; mul.f32 %f225, %f247, %f264; mul.f32 %f226, %f249, %f262; sub.f32 %f227, %f226, %f225; mul.f32 %f228, %f248, %f262; mul.f32 %f229, %f247, %f263; sub.f32 %f230, %f229, %f228; mul.f32 %f231, %f2, %f227; fma.rn.f32 %f232, %f1, %f224, %f231; fma.rn.f32 %f233, %f3, %f230, %f232; mul.f32 %f234, %f233, %f102; st.global.f32 [%rd4], %f234; 
BB0_72: ret; } ` settopologicalcharge_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<77>; .reg .b16 %rs<11>; .reg .f32 %f<265>; .reg .b32 %r<97>; .reg .b64 %rd<46>; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f102, [settopologicalcharge_param_4]; ld.param.u32 %r40, [settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; ld.param.u32 %r42, [settopologicalcharge_param_7]; ld.param.u8 %rs3, [settopologicalcharge_param_8]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, %ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r43, %r44, %r45; mov.u32 %r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r46, %r47, %r48; mov.u32 %r49, %ntid.z; mov.u32 %r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r49, %r50, %r51; setp.ge.s32 %p3, %r2, %r41; setp.ge.s32 %p4, %r1, %r40; or.pred %p5, %p3, %p4; setp.ge.s32 %p6, %r3, %r42; or.pred %p7, %p5, %p6; @%p7 bra BB0_72; cvta.to.global.u64 %rd9, %rd5; mul.lo.s32 %r4, %r3, %r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 %r53, %r5, %r1; mul.wide.s32 %rd10, %r53, 4; add.s64 %rd11, %rd3, %rd10; add.s64 %rd12, %rd2, %rd10; add.s64 %rd13, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; ld.global.nc.f32 %f2, [%rd12]; mul.f32 %f103, %f2, %f2; fma.rn.f32 %f104, 
%f1, %f1, %f103; ld.global.nc.f32 %f3, [%rd13]; fma.rn.f32 %f105, %f3, %f3, %f104; setp.eq.f32 %p8, %f105, 0f00000000; add.s64 %rd4, %rd9, %rd10; @%p8 bra BB0_71; bra.uni BB0_2; BB0_71: mov.u32 %r88, 0; st.global.u32 [%rd4], %r88; bra.uni BB0_72; BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r6, %r1, -2; @%p9 bra BB0_4; rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r89, %r55, %r40; bra.uni BB0_5; BB0_4: mov.u32 %r56, 0; max.s32 %r89, %r6, %r56; BB0_5: setp.lt.s32 %p11, %r6, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p12 bra BB0_7; add.s32 %r57, %r89, %r5; mul.wide.s32 %rd14, %r57, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f7, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f8, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f9, [%rd17]; BB0_7: add.s32 %r10, %r1, -1; @%p9 bra BB0_9; rem.s32 %r58, %r10, %r40; add.s32 %r59, %r58, %r40; rem.s32 %r90, %r59, %r40; bra.uni BB0_10; BB0_9: mov.u32 %r60, 0; max.s32 %r90, %r10, %r60; BB0_10: setp.lt.s32 %p14, %r10, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_12; add.s32 %r61, %r90, %r5; mul.wide.s32 %rd18, %r61, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f14, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f15, [%rd21]; BB0_12: add.s32 %r14, %r1, 1; @%p9 bra BB0_14; rem.s32 %r62, %r14, %r40; add.s32 %r63, %r62, %r40; rem.s32 %r91, %r63, %r40; bra.uni BB0_15; BB0_14: add.s32 %r64, %r40, -1; min.s32 %r91, %r14, %r64; BB0_15: setp.ge.s32 %p18, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p20, %p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_17; add.s32 %r65, %r91, %r5; mul.wide.s32 %rd22, %r65, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f19, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f20, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; 
BB0_17: add.s32 %r18, %r1, 2; @%p9 bra BB0_19; rem.s32 %r66, %r18, %r40; add.s32 %r67, %r66, %r40; rem.s32 %r92, %r67, %r40; bra.uni BB0_20; BB0_19: add.s32 %r68, %r40, -1; min.s32 %r92, %r18, %r68; BB0_20: add.s32 %r22, %r92, %r5; setp.ge.s32 %p22, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p24, %p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f25, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f26, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; BB0_22: mul.f32 %f118, %f20, %f20; fma.rn.f32 %f119, %f19, %f19, %f118; fma.rn.f32 %f28, %f21, %f21, %f119; setp.neu.f32 %p25, %f28, 0f00000000; @%p25 bra BB0_24; mul.f32 %f123, %f14, %f14; fma.rn.f32 %f124, %f13, %f13, %f123; fma.rn.f32 %f125, %f15, %f15, %f124; setp.eq.f32 %p26, %f125, 0f00000000; mov.f32 %f247, 0f00000000; mov.f32 %f248, %f247; mov.f32 %f249, %f247; @%p26 bra BB0_36; BB0_24: mul.f32 %f126, %f8, %f8; fma.rn.f32 %f127, %f7, %f7, %f126; fma.rn.f32 %f29, %f9, %f9, %f127; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f128, %f26, %f26; fma.rn.f32 %f129, %f25, %f25, %f128; fma.rn.f32 %f30, %f27, %f27, %f129; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_26; mul.f32 %f130, %f14, %f14; fma.rn.f32 %f131, %f13, %f13, %f130; fma.rn.f32 %f132, %f15, %f15, %f131; setp.neu.f32 %p32, %f132, 0f00000000; @%p32 bra BB0_35; bra.uni BB0_26; BB0_35: sub.f32 %f161, %f19, %f13; mul.f32 %f247, %f161, 0f3F000000; sub.f32 %f162, %f20, %f14; mul.f32 %f248, %f162, 0f3F000000; sub.f32 %f163, %f21, %f15; mul.f32 %f249, %f163, 0f3F000000; bra.uni BB0_36; BB0_26: or.pred %p34, %p25, %p27; @%p34 bra BB0_28; bra.uni BB0_27; BB0_28: mul.f32 %f133, %f14, %f14; fma.rn.f32 %f134, %f13, %f13, %f133; fma.rn.f32 %f34, %f15, %f15, %f134; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, %p35, %p28; @%p37 bra BB0_30; bra.uni 
BB0_29; BB0_30: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_32; bra.uni BB0_31; BB0_32: setp.eq.f32 %p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_34; bra.uni BB0_33; BB0_34: sub.f32 %f152, %f19, %f13; sub.f32 %f153, %f20, %f14; sub.f32 %f154, %f21, %f15; sub.f32 %f155, %f7, %f25; mul.f32 %f156, %f155, 0f3DAAAAAB; sub.f32 %f157, %f8, %f26; mul.f32 %f158, %f157, 0f3DAAAAAB; sub.f32 %f159, %f9, %f27; mul.f32 %f160, %f159, 0f3DAAAAAB; fma.rn.f32 %f247, %f152, 0f3F2AAAAB, %f156; fma.rn.f32 %f248, %f153, 0f3F2AAAAB, %f158; fma.rn.f32 %f249, %f154, 0f3F2AAAAB, %f160; bra.uni BB0_36; BB0_27: sub.f32 %f247, %f1, %f13; sub.f32 %f248, %f2, %f14; sub.f32 %f249, %f3, %f15; bra.uni BB0_36; BB0_29: sub.f32 %f247, %f19, %f1; sub.f32 %f248, %f20, %f2; sub.f32 %f249, %f21, %f3; bra.uni BB0_36; BB0_31: mul.f32 %f135, %f13, 0fC0000000; fma.rn.f32 %f136, %f7, 0f3F000000, %f135; add.f32 %f137, %f14, %f14; mul.f32 %f138, %f8, 0f3F000000; sub.f32 %f139, %f138, %f137; add.f32 %f140, %f15, %f15; mul.f32 %f141, %f9, 0f3F000000; sub.f32 %f142, %f141, %f140; fma.rn.f32 %f247, %f1, 0f3FC00000, %f136; fma.rn.f32 %f248, %f2, 0f3FC00000, %f139; fma.rn.f32 %f249, %f3, 0f3FC00000, %f142; bra.uni BB0_36; BB0_33: mul.f32 %f143, %f25, 0fBF000000; fma.rn.f32 %f144, %f19, 0f40000000, %f143; mul.f32 %f145, %f26, 0fBF000000; fma.rn.f32 %f146, %f20, 0f40000000, %f145; mul.f32 %f147, %f27, 0fBF000000; fma.rn.f32 %f148, %f21, 0f40000000, %f147; mul.f32 %f149, %f1, 0f3FC00000; sub.f32 %f247, %f144, %f149; mul.f32 %f150, %f2, 0f3FC00000; sub.f32 %f248, %f146, %f150; mul.f32 %f151, %f3, 0f3FC00000; sub.f32 %f249, %f148, %f151; BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra BB0_38; rem.s32 %r69, %r23, %r41; add.s32 %r70, %r69, %r41; rem.s32 %r93, %r70, %r41; bra.uni BB0_39; BB0_38: mov.u32 %r71, 0; max.s32 %r93, %r23, %r71; BB0_39: setp.lt.s32 %p45, %r23, 0; mov.f32 %f56, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 
%f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_41; add.s32 %r72, %r93, %r4; mad.lo.s32 %r73, %r72, %r40, %r1; mul.wide.s32 %rd30, %r73, 4; add.s64 %rd31, %rd3, %rd30; ld.global.nc.f32 %f56, [%rd31]; add.s64 %rd32, %rd2, %rd30; ld.global.nc.f32 %f57, [%rd32]; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f58, [%rd33]; BB0_41: add.s32 %r27, %r2, -1; @%p43 bra BB0_43; rem.s32 %r74, %r27, %r41; add.s32 %r75, %r74, %r41; rem.s32 %r94, %r75, %r41; bra.uni BB0_44; BB0_43: mov.u32 %r76, 0; max.s32 %r94, %r27, %r76; BB0_44: setp.lt.s32 %p48, %r27, 0; mov.f32 %f62, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_46; add.s32 %r77, %r94, %r4; mad.lo.s32 %r78, %r77, %r40, %r1; mul.wide.s32 %rd34, %r78, 4; add.s64 %rd35, %rd3, %rd34; ld.global.nc.f32 %f62, [%rd35]; add.s64 %rd36, %rd2, %rd34; ld.global.nc.f32 %f63, [%rd36]; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f64, [%rd37]; BB0_46: add.s32 %r31, %r2, 1; @%p43 bra BB0_48; rem.s32 %r79, %r31, %r41; add.s32 %r80, %r79, %r41; rem.s32 %r95, %r80, %r41; bra.uni BB0_49; BB0_48: add.s32 %r81, %r41, -1; min.s32 %r95, %r31, %r81; BB0_49: setp.ge.s32 %p52, %r31, %r41; mov.f32 %f68, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; @%p54 bra BB0_51; add.s32 %r82, %r95, %r4; mad.lo.s32 %r83, %r82, %r40, %r1; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f68, [%rd39]; add.s64 %rd40, %rd2, %rd38; ld.global.nc.f32 %f69, [%rd40]; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f70, [%rd41]; BB0_51: add.s32 %r35, %r2, 2; @%p43 bra BB0_53; rem.s32 %r84, %r35, %r41; add.s32 %r85, %r84, %r41; rem.s32 %r96, %r85, %r41; bra.uni BB0_54; BB0_53: add.s32 %r86, %r41, -1; min.s32 %r96, %r35, %r86; BB0_54: add.s32 %r87, %r96, %r4; mad.lo.s32 %r39, %r87, %r40, %r1; setp.ge.s32 %p56, %r35, %r41; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p58 bra BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, 
%rd3, %rd42; ld.global.nc.f32 %f74, [%rd43]; add.s64 %rd44, %rd2, %rd42; ld.global.nc.f32 %f75, [%rd44]; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f76, [%rd45]; BB0_56: mul.f32 %f176, %f69, %f69; fma.rn.f32 %f177, %f68, %f68, %f176; fma.rn.f32 %f77, %f70, %f70, %f177; setp.neu.f32 %p59, %f77, 0f00000000; @%p59 bra BB0_58; mul.f32 %f181, %f63, %f63; fma.rn.f32 %f182, %f62, %f62, %f181; fma.rn.f32 %f183, %f64, %f64, %f182; setp.eq.f32 %p60, %f183, 0f00000000; mov.f32 %f262, 0f00000000; mov.f32 %f263, %f262; mov.f32 %f264, %f262; @%p60 bra BB0_70; BB0_58: mul.f32 %f184, %f57, %f57; fma.rn.f32 %f185, %f56, %f56, %f184; fma.rn.f32 %f78, %f58, %f58, %f185; setp.neu.f32 %p61, %f78, 0f00000000; mul.f32 %f186, %f75, %f75; fma.rn.f32 %f187, %f74, %f74, %f186; fma.rn.f32 %f79, %f76, %f76, %f187; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, %f77, 0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_60; mul.f32 %f188, %f63, %f63; fma.rn.f32 %f189, %f62, %f62, %f188; fma.rn.f32 %f190, %f64, %f64, %f189; setp.neu.f32 %p66, %f190, 0f00000000; @%p66 bra BB0_69; bra.uni BB0_60; BB0_69: sub.f32 %f219, %f68, %f62; mul.f32 %f262, %f219, 0f3F000000; sub.f32 %f220, %f69, %f63; mul.f32 %f263, %f220, 0f3F000000; sub.f32 %f221, %f70, %f64; mul.f32 %f264, %f221, 0f3F000000; bra.uni BB0_70; BB0_60: or.pred %p68, %p59, %p61; @%p68 bra BB0_62; bra.uni BB0_61; BB0_62: mul.f32 %f191, %f63, %f63; fma.rn.f32 %f192, %f62, %f62, %f191; fma.rn.f32 %f83, %f64, %f64, %f192; setp.neu.f32 %p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_64; bra.uni BB0_63; BB0_64: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_66; bra.uni BB0_65; BB0_66: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_68; bra.uni BB0_67; BB0_68: sub.f32 %f210, %f68, %f62; sub.f32 %f211, %f69, %f63; sub.f32 %f212, %f70, %f64; sub.f32 %f213, %f56, %f74; mul.f32 %f214, %f213, 0f3DAAAAAB; sub.f32 %f215, %f57, %f75; mul.f32 %f216, 
%f215, 0f3DAAAAAB; sub.f32 %f217, %f58, %f76; mul.f32 %f218, %f217, 0f3DAAAAAB; fma.rn.f32 %f262, %f210, 0f3F2AAAAB, %f214; fma.rn.f32 %f263, %f211, 0f3F2AAAAB, %f216; fma.rn.f32 %f264, %f212, 0f3F2AAAAB, %f218; bra.uni BB0_70; BB0_61: sub.f32 %f262, %f1, %f62; sub.f32 %f263, %f2, %f63; sub.f32 %f264, %f3, %f64; bra.uni BB0_70; BB0_63: sub.f32 %f262, %f68, %f1; sub.f32 %f263, %f69, %f2; sub.f32 %f264, %f70, %f3; bra.uni BB0_70; BB0_65: mul.f32 %f193, %f62, 0fC0000000; fma.rn.f32 %f194, %f56, 0f3F000000, %f193; add.f32 %f195, %f63, %f63; mul.f32 %f196, %f57, 0f3F000000; sub.f32 %f197, %f196, %f195; add.f32 %f198, %f64, %f64; mul.f32 %f199, %f58, 0f3F000000; sub.f32 %f200, %f199, %f198; fma.rn.f32 %f262, %f1, 0f3FC00000, %f194; fma.rn.f32 %f263, %f2, 0f3FC00000, %f197; fma.rn.f32 %f264, %f3, 0f3FC00000, %f200; bra.uni BB0_70; BB0_67: mul.f32 %f201, %f74, 0fBF000000; fma.rn.f32 %f202, %f68, 0f40000000, %f201; mul.f32 %f203, %f75, 0fBF000000; fma.rn.f32 %f204, %f69, 0f40000000, %f203; mul.f32 %f205, %f76, 0fBF000000; fma.rn.f32 %f206, %f70, 0f40000000, %f205; mul.f32 %f207, %f1, 0f3FC00000; sub.f32 %f262, %f202, %f207; mul.f32 %f208, %f2, 0f3FC00000; sub.f32 %f263, %f204, %f208; mul.f32 %f209, %f3, 0f3FC00000; sub.f32 %f264, %f206, %f209; BB0_70: mul.f32 %f222, %f249, %f263; mul.f32 %f223, %f248, %f264; sub.f32 %f224, %f223, %f222; mul.f32 %f225, %f247, %f264; mul.f32 %f226, %f249, %f262; sub.f32 %f227, %f226, %f225; mul.f32 %f228, %f248, %f262; mul.f32 %f229, %f247, %f263; sub.f32 %f230, %f229, %f228; mul.f32 %f231, %f2, %f227; fma.rn.f32 %f232, %f1, %f224, %f231; fma.rn.f32 %f233, %f3, %f230, %f232; mul.f32 %f234, %f233, %f102; st.global.f32 [%rd4], %f234; BB0_72: ret; } ` settopologicalcharge_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 
settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<77>; .reg .b16 %rs<11>; .reg .f32 %f<265>; .reg .b32 %r<97>; .reg .b64 %rd<46>; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f102, [settopologicalcharge_param_4]; ld.param.u32 %r40, [settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; ld.param.u32 %r42, [settopologicalcharge_param_7]; ld.param.u8 %rs3, [settopologicalcharge_param_8]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, %ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r43, %r44, %r45; mov.u32 %r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r46, %r47, %r48; mov.u32 %r49, %ntid.z; mov.u32 %r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r49, %r50, %r51; setp.ge.s32 %p3, %r2, %r41; setp.ge.s32 %p4, %r1, %r40; or.pred %p5, %p3, %p4; setp.ge.s32 %p6, %r3, %r42; or.pred %p7, %p5, %p6; @%p7 bra BB0_72; cvta.to.global.u64 %rd9, %rd5; mul.lo.s32 %r4, %r3, %r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 %r53, %r5, %r1; mul.wide.s32 %rd10, %r53, 4; add.s64 %rd11, %rd3, %rd10; add.s64 %rd12, %rd2, %rd10; add.s64 %rd13, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; ld.global.nc.f32 %f2, [%rd12]; mul.f32 %f103, %f2, %f2; fma.rn.f32 %f104, %f1, %f1, %f103; ld.global.nc.f32 %f3, [%rd13]; fma.rn.f32 %f105, %f3, %f3, %f104; setp.eq.f32 %p8, %f105, 0f00000000; add.s64 %rd4, %rd9, %rd10; @%p8 bra BB0_71; bra.uni BB0_2; BB0_71: mov.u32 %r88, 0; st.global.u32 [%rd4], %r88; bra.uni BB0_72; BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p9, %rs1, 0; 
add.s32 %r6, %r1, -2; @%p9 bra BB0_4; rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r89, %r55, %r40; bra.uni BB0_5; BB0_4: mov.u32 %r56, 0; max.s32 %r89, %r6, %r56; BB0_5: setp.lt.s32 %p11, %r6, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p12 bra BB0_7; add.s32 %r57, %r89, %r5; mul.wide.s32 %rd14, %r57, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f7, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f8, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f9, [%rd17]; BB0_7: add.s32 %r10, %r1, -1; @%p9 bra BB0_9; rem.s32 %r58, %r10, %r40; add.s32 %r59, %r58, %r40; rem.s32 %r90, %r59, %r40; bra.uni BB0_10; BB0_9: mov.u32 %r60, 0; max.s32 %r90, %r10, %r60; BB0_10: setp.lt.s32 %p14, %r10, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_12; add.s32 %r61, %r90, %r5; mul.wide.s32 %rd18, %r61, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f14, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f15, [%rd21]; BB0_12: add.s32 %r14, %r1, 1; @%p9 bra BB0_14; rem.s32 %r62, %r14, %r40; add.s32 %r63, %r62, %r40; rem.s32 %r91, %r63, %r40; bra.uni BB0_15; BB0_14: add.s32 %r64, %r40, -1; min.s32 %r91, %r14, %r64; BB0_15: setp.ge.s32 %p18, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p20, %p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_17; add.s32 %r65, %r91, %r5; mul.wide.s32 %rd22, %r65, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f19, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f20, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; BB0_17: add.s32 %r18, %r1, 2; @%p9 bra BB0_19; rem.s32 %r66, %r18, %r40; add.s32 %r67, %r66, %r40; rem.s32 %r92, %r67, %r40; bra.uni BB0_20; BB0_19: add.s32 %r68, %r40, -1; min.s32 %r92, %r18, %r68; BB0_20: add.s32 %r22, %r92, %r5; setp.ge.s32 %p22, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p24, 
%p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f25, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f26, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; BB0_22: mul.f32 %f118, %f20, %f20; fma.rn.f32 %f119, %f19, %f19, %f118; fma.rn.f32 %f28, %f21, %f21, %f119; setp.neu.f32 %p25, %f28, 0f00000000; @%p25 bra BB0_24; mul.f32 %f123, %f14, %f14; fma.rn.f32 %f124, %f13, %f13, %f123; fma.rn.f32 %f125, %f15, %f15, %f124; setp.eq.f32 %p26, %f125, 0f00000000; mov.f32 %f247, 0f00000000; mov.f32 %f248, %f247; mov.f32 %f249, %f247; @%p26 bra BB0_36; BB0_24: mul.f32 %f126, %f8, %f8; fma.rn.f32 %f127, %f7, %f7, %f126; fma.rn.f32 %f29, %f9, %f9, %f127; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f128, %f26, %f26; fma.rn.f32 %f129, %f25, %f25, %f128; fma.rn.f32 %f30, %f27, %f27, %f129; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_26; mul.f32 %f130, %f14, %f14; fma.rn.f32 %f131, %f13, %f13, %f130; fma.rn.f32 %f132, %f15, %f15, %f131; setp.neu.f32 %p32, %f132, 0f00000000; @%p32 bra BB0_35; bra.uni BB0_26; BB0_35: sub.f32 %f161, %f19, %f13; mul.f32 %f247, %f161, 0f3F000000; sub.f32 %f162, %f20, %f14; mul.f32 %f248, %f162, 0f3F000000; sub.f32 %f163, %f21, %f15; mul.f32 %f249, %f163, 0f3F000000; bra.uni BB0_36; BB0_26: or.pred %p34, %p25, %p27; @%p34 bra BB0_28; bra.uni BB0_27; BB0_28: mul.f32 %f133, %f14, %f14; fma.rn.f32 %f134, %f13, %f13, %f133; fma.rn.f32 %f34, %f15, %f15, %f134; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, %p35, %p28; @%p37 bra BB0_30; bra.uni BB0_29; BB0_30: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_32; bra.uni BB0_31; BB0_32: setp.eq.f32 %p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_34; bra.uni BB0_33; BB0_34: sub.f32 %f152, %f19, %f13; sub.f32 %f153, %f20, %f14; sub.f32 %f154, %f21, 
%f15; sub.f32 %f155, %f7, %f25; mul.f32 %f156, %f155, 0f3DAAAAAB; sub.f32 %f157, %f8, %f26; mul.f32 %f158, %f157, 0f3DAAAAAB; sub.f32 %f159, %f9, %f27; mul.f32 %f160, %f159, 0f3DAAAAAB; fma.rn.f32 %f247, %f152, 0f3F2AAAAB, %f156; fma.rn.f32 %f248, %f153, 0f3F2AAAAB, %f158; fma.rn.f32 %f249, %f154, 0f3F2AAAAB, %f160; bra.uni BB0_36; BB0_27: sub.f32 %f247, %f1, %f13; sub.f32 %f248, %f2, %f14; sub.f32 %f249, %f3, %f15; bra.uni BB0_36; BB0_29: sub.f32 %f247, %f19, %f1; sub.f32 %f248, %f20, %f2; sub.f32 %f249, %f21, %f3; bra.uni BB0_36; BB0_31: mul.f32 %f135, %f13, 0fC0000000; fma.rn.f32 %f136, %f7, 0f3F000000, %f135; add.f32 %f137, %f14, %f14; mul.f32 %f138, %f8, 0f3F000000; sub.f32 %f139, %f138, %f137; add.f32 %f140, %f15, %f15; mul.f32 %f141, %f9, 0f3F000000; sub.f32 %f142, %f141, %f140; fma.rn.f32 %f247, %f1, 0f3FC00000, %f136; fma.rn.f32 %f248, %f2, 0f3FC00000, %f139; fma.rn.f32 %f249, %f3, 0f3FC00000, %f142; bra.uni BB0_36; BB0_33: mul.f32 %f143, %f25, 0fBF000000; fma.rn.f32 %f144, %f19, 0f40000000, %f143; mul.f32 %f145, %f26, 0fBF000000; fma.rn.f32 %f146, %f20, 0f40000000, %f145; mul.f32 %f147, %f27, 0fBF000000; fma.rn.f32 %f148, %f21, 0f40000000, %f147; mul.f32 %f149, %f1, 0f3FC00000; sub.f32 %f247, %f144, %f149; mul.f32 %f150, %f2, 0f3FC00000; sub.f32 %f248, %f146, %f150; mul.f32 %f151, %f3, 0f3FC00000; sub.f32 %f249, %f148, %f151; BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra BB0_38; rem.s32 %r69, %r23, %r41; add.s32 %r70, %r69, %r41; rem.s32 %r93, %r70, %r41; bra.uni BB0_39; BB0_38: mov.u32 %r71, 0; max.s32 %r93, %r23, %r71; BB0_39: setp.lt.s32 %p45, %r23, 0; mov.f32 %f56, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_41; add.s32 %r72, %r93, %r4; mad.lo.s32 %r73, %r72, %r40, %r1; mul.wide.s32 %rd30, %r73, 4; add.s64 %rd31, %rd3, %rd30; ld.global.nc.f32 %f56, [%rd31]; add.s64 %rd32, %rd2, %rd30; ld.global.nc.f32 %f57, [%rd32]; add.s64 %rd33, %rd1, %rd30; 
ld.global.nc.f32 %f58, [%rd33]; BB0_41: add.s32 %r27, %r2, -1; @%p43 bra BB0_43; rem.s32 %r74, %r27, %r41; add.s32 %r75, %r74, %r41; rem.s32 %r94, %r75, %r41; bra.uni BB0_44; BB0_43: mov.u32 %r76, 0; max.s32 %r94, %r27, %r76; BB0_44: setp.lt.s32 %p48, %r27, 0; mov.f32 %f62, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_46; add.s32 %r77, %r94, %r4; mad.lo.s32 %r78, %r77, %r40, %r1; mul.wide.s32 %rd34, %r78, 4; add.s64 %rd35, %rd3, %rd34; ld.global.nc.f32 %f62, [%rd35]; add.s64 %rd36, %rd2, %rd34; ld.global.nc.f32 %f63, [%rd36]; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f64, [%rd37]; BB0_46: add.s32 %r31, %r2, 1; @%p43 bra BB0_48; rem.s32 %r79, %r31, %r41; add.s32 %r80, %r79, %r41; rem.s32 %r95, %r80, %r41; bra.uni BB0_49; BB0_48: add.s32 %r81, %r41, -1; min.s32 %r95, %r31, %r81; BB0_49: setp.ge.s32 %p52, %r31, %r41; mov.f32 %f68, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; @%p54 bra BB0_51; add.s32 %r82, %r95, %r4; mad.lo.s32 %r83, %r82, %r40, %r1; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f68, [%rd39]; add.s64 %rd40, %rd2, %rd38; ld.global.nc.f32 %f69, [%rd40]; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f70, [%rd41]; BB0_51: add.s32 %r35, %r2, 2; @%p43 bra BB0_53; rem.s32 %r84, %r35, %r41; add.s32 %r85, %r84, %r41; rem.s32 %r96, %r85, %r41; bra.uni BB0_54; BB0_53: add.s32 %r86, %r41, -1; min.s32 %r96, %r35, %r86; BB0_54: add.s32 %r87, %r96, %r4; mad.lo.s32 %r39, %r87, %r40, %r1; setp.ge.s32 %p56, %r35, %r41; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p58 bra BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, %rd3, %rd42; ld.global.nc.f32 %f74, [%rd43]; add.s64 %rd44, %rd2, %rd42; ld.global.nc.f32 %f75, [%rd44]; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f76, [%rd45]; BB0_56: mul.f32 %f176, %f69, %f69; fma.rn.f32 %f177, %f68, %f68, %f176; fma.rn.f32 %f77, %f70, %f70, %f177; setp.neu.f32 
%p59, %f77, 0f00000000; @%p59 bra BB0_58; mul.f32 %f181, %f63, %f63; fma.rn.f32 %f182, %f62, %f62, %f181; fma.rn.f32 %f183, %f64, %f64, %f182; setp.eq.f32 %p60, %f183, 0f00000000; mov.f32 %f262, 0f00000000; mov.f32 %f263, %f262; mov.f32 %f264, %f262; @%p60 bra BB0_70; BB0_58: mul.f32 %f184, %f57, %f57; fma.rn.f32 %f185, %f56, %f56, %f184; fma.rn.f32 %f78, %f58, %f58, %f185; setp.neu.f32 %p61, %f78, 0f00000000; mul.f32 %f186, %f75, %f75; fma.rn.f32 %f187, %f74, %f74, %f186; fma.rn.f32 %f79, %f76, %f76, %f187; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, %f77, 0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_60; mul.f32 %f188, %f63, %f63; fma.rn.f32 %f189, %f62, %f62, %f188; fma.rn.f32 %f190, %f64, %f64, %f189; setp.neu.f32 %p66, %f190, 0f00000000; @%p66 bra BB0_69; bra.uni BB0_60; BB0_69: sub.f32 %f219, %f68, %f62; mul.f32 %f262, %f219, 0f3F000000; sub.f32 %f220, %f69, %f63; mul.f32 %f263, %f220, 0f3F000000; sub.f32 %f221, %f70, %f64; mul.f32 %f264, %f221, 0f3F000000; bra.uni BB0_70; BB0_60: or.pred %p68, %p59, %p61; @%p68 bra BB0_62; bra.uni BB0_61; BB0_62: mul.f32 %f191, %f63, %f63; fma.rn.f32 %f192, %f62, %f62, %f191; fma.rn.f32 %f83, %f64, %f64, %f192; setp.neu.f32 %p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_64; bra.uni BB0_63; BB0_64: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_66; bra.uni BB0_65; BB0_66: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_68; bra.uni BB0_67; BB0_68: sub.f32 %f210, %f68, %f62; sub.f32 %f211, %f69, %f63; sub.f32 %f212, %f70, %f64; sub.f32 %f213, %f56, %f74; mul.f32 %f214, %f213, 0f3DAAAAAB; sub.f32 %f215, %f57, %f75; mul.f32 %f216, %f215, 0f3DAAAAAB; sub.f32 %f217, %f58, %f76; mul.f32 %f218, %f217, 0f3DAAAAAB; fma.rn.f32 %f262, %f210, 0f3F2AAAAB, %f214; fma.rn.f32 %f263, %f211, 0f3F2AAAAB, %f216; fma.rn.f32 %f264, %f212, 0f3F2AAAAB, %f218; bra.uni BB0_70; BB0_61: sub.f32 %f262, %f1, %f62; sub.f32 %f263, %f2, %f63; 
sub.f32 %f264, %f3, %f64; bra.uni BB0_70; BB0_63: sub.f32 %f262, %f68, %f1; sub.f32 %f263, %f69, %f2; sub.f32 %f264, %f70, %f3; bra.uni BB0_70; BB0_65: mul.f32 %f193, %f62, 0fC0000000; fma.rn.f32 %f194, %f56, 0f3F000000, %f193; add.f32 %f195, %f63, %f63; mul.f32 %f196, %f57, 0f3F000000; sub.f32 %f197, %f196, %f195; add.f32 %f198, %f64, %f64; mul.f32 %f199, %f58, 0f3F000000; sub.f32 %f200, %f199, %f198; fma.rn.f32 %f262, %f1, 0f3FC00000, %f194; fma.rn.f32 %f263, %f2, 0f3FC00000, %f197; fma.rn.f32 %f264, %f3, 0f3FC00000, %f200; bra.uni BB0_70; BB0_67: mul.f32 %f201, %f74, 0fBF000000; fma.rn.f32 %f202, %f68, 0f40000000, %f201; mul.f32 %f203, %f75, 0fBF000000; fma.rn.f32 %f204, %f69, 0f40000000, %f203; mul.f32 %f205, %f76, 0fBF000000; fma.rn.f32 %f206, %f70, 0f40000000, %f205; mul.f32 %f207, %f1, 0f3FC00000; sub.f32 %f262, %f202, %f207; mul.f32 %f208, %f2, 0f3FC00000; sub.f32 %f263, %f204, %f208; mul.f32 %f209, %f3, 0f3FC00000; sub.f32 %f264, %f206, %f209; BB0_70: mul.f32 %f222, %f249, %f263; mul.f32 %f223, %f248, %f264; sub.f32 %f224, %f223, %f222; mul.f32 %f225, %f247, %f264; mul.f32 %f226, %f249, %f262; sub.f32 %f227, %f226, %f225; mul.f32 %f228, %f248, %f262; mul.f32 %f229, %f247, %f263; sub.f32 %f230, %f229, %f228; mul.f32 %f231, %f2, %f227; fma.rn.f32 %f232, %f1, %f224, %f231; fma.rn.f32 %f233, %f3, %f230, %f232; mul.f32 %f234, %f233, %f102; st.global.f32 [%rd4], %f234; BB0_72: ret; } ` settopologicalcharge_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<77>; .reg .b16 %rs<11>; .reg .f32 %f<265>; .reg 
.b32 %r<97>; .reg .b64 %rd<46>; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f102, [settopologicalcharge_param_4]; ld.param.u32 %r40, [settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; ld.param.u32 %r42, [settopologicalcharge_param_7]; ld.param.u8 %rs3, [settopologicalcharge_param_8]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, %ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r43, %r44, %r45; mov.u32 %r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r46, %r47, %r48; mov.u32 %r49, %ntid.z; mov.u32 %r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r49, %r50, %r51; setp.ge.s32 %p3, %r2, %r41; setp.ge.s32 %p4, %r1, %r40; or.pred %p5, %p3, %p4; setp.ge.s32 %p6, %r3, %r42; or.pred %p7, %p5, %p6; @%p7 bra BB0_72; cvta.to.global.u64 %rd9, %rd5; mul.lo.s32 %r4, %r3, %r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 %r53, %r5, %r1; mul.wide.s32 %rd10, %r53, 4; add.s64 %rd11, %rd3, %rd10; add.s64 %rd12, %rd2, %rd10; add.s64 %rd13, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; ld.global.nc.f32 %f2, [%rd12]; mul.f32 %f103, %f2, %f2; fma.rn.f32 %f104, %f1, %f1, %f103; ld.global.nc.f32 %f3, [%rd13]; fma.rn.f32 %f105, %f3, %f3, %f104; setp.eq.f32 %p8, %f105, 0f00000000; add.s64 %rd4, %rd9, %rd10; @%p8 bra BB0_71; bra.uni BB0_2; BB0_71: mov.u32 %r88, 0; st.global.u32 [%rd4], %r88; bra.uni BB0_72; BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r6, %r1, -2; @%p9 bra BB0_4; rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r89, %r55, %r40; bra.uni BB0_5; BB0_4: mov.u32 %r56, 0; max.s32 %r89, %r6, %r56; BB0_5: setp.lt.s32 %p11, %r6, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p12 
bra BB0_7; add.s32 %r57, %r89, %r5; mul.wide.s32 %rd14, %r57, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f7, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f8, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f9, [%rd17]; BB0_7: add.s32 %r10, %r1, -1; @%p9 bra BB0_9; rem.s32 %r58, %r10, %r40; add.s32 %r59, %r58, %r40; rem.s32 %r90, %r59, %r40; bra.uni BB0_10; BB0_9: mov.u32 %r60, 0; max.s32 %r90, %r10, %r60; BB0_10: setp.lt.s32 %p14, %r10, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_12; add.s32 %r61, %r90, %r5; mul.wide.s32 %rd18, %r61, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f14, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f15, [%rd21]; BB0_12: add.s32 %r14, %r1, 1; @%p9 bra BB0_14; rem.s32 %r62, %r14, %r40; add.s32 %r63, %r62, %r40; rem.s32 %r91, %r63, %r40; bra.uni BB0_15; BB0_14: add.s32 %r64, %r40, -1; min.s32 %r91, %r14, %r64; BB0_15: setp.ge.s32 %p18, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p20, %p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_17; add.s32 %r65, %r91, %r5; mul.wide.s32 %rd22, %r65, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f19, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f20, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; BB0_17: add.s32 %r18, %r1, 2; @%p9 bra BB0_19; rem.s32 %r66, %r18, %r40; add.s32 %r67, %r66, %r40; rem.s32 %r92, %r67, %r40; bra.uni BB0_20; BB0_19: add.s32 %r68, %r40, -1; min.s32 %r92, %r18, %r68; BB0_20: add.s32 %r22, %r92, %r5; setp.ge.s32 %p22, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p24, %p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f25, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f26, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; BB0_22: mul.f32 %f118, 
%f20, %f20; fma.rn.f32 %f119, %f19, %f19, %f118; fma.rn.f32 %f28, %f21, %f21, %f119; setp.neu.f32 %p25, %f28, 0f00000000; @%p25 bra BB0_24; mul.f32 %f123, %f14, %f14; fma.rn.f32 %f124, %f13, %f13, %f123; fma.rn.f32 %f125, %f15, %f15, %f124; setp.eq.f32 %p26, %f125, 0f00000000; mov.f32 %f247, 0f00000000; mov.f32 %f248, %f247; mov.f32 %f249, %f247; @%p26 bra BB0_36; BB0_24: mul.f32 %f126, %f8, %f8; fma.rn.f32 %f127, %f7, %f7, %f126; fma.rn.f32 %f29, %f9, %f9, %f127; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f128, %f26, %f26; fma.rn.f32 %f129, %f25, %f25, %f128; fma.rn.f32 %f30, %f27, %f27, %f129; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_26; mul.f32 %f130, %f14, %f14; fma.rn.f32 %f131, %f13, %f13, %f130; fma.rn.f32 %f132, %f15, %f15, %f131; setp.neu.f32 %p32, %f132, 0f00000000; @%p32 bra BB0_35; bra.uni BB0_26; BB0_35: sub.f32 %f161, %f19, %f13; mul.f32 %f247, %f161, 0f3F000000; sub.f32 %f162, %f20, %f14; mul.f32 %f248, %f162, 0f3F000000; sub.f32 %f163, %f21, %f15; mul.f32 %f249, %f163, 0f3F000000; bra.uni BB0_36; BB0_26: or.pred %p34, %p25, %p27; @%p34 bra BB0_28; bra.uni BB0_27; BB0_28: mul.f32 %f133, %f14, %f14; fma.rn.f32 %f134, %f13, %f13, %f133; fma.rn.f32 %f34, %f15, %f15, %f134; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, %p35, %p28; @%p37 bra BB0_30; bra.uni BB0_29; BB0_30: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_32; bra.uni BB0_31; BB0_32: setp.eq.f32 %p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_34; bra.uni BB0_33; BB0_34: sub.f32 %f152, %f19, %f13; sub.f32 %f153, %f20, %f14; sub.f32 %f154, %f21, %f15; sub.f32 %f155, %f7, %f25; mul.f32 %f156, %f155, 0f3DAAAAAB; sub.f32 %f157, %f8, %f26; mul.f32 %f158, %f157, 0f3DAAAAAB; sub.f32 %f159, %f9, %f27; mul.f32 %f160, %f159, 0f3DAAAAAB; fma.rn.f32 %f247, %f152, 0f3F2AAAAB, %f156; fma.rn.f32 %f248, %f153, 0f3F2AAAAB, %f158; fma.rn.f32 %f249, %f154, 
0f3F2AAAAB, %f160; bra.uni BB0_36; BB0_27: sub.f32 %f247, %f1, %f13; sub.f32 %f248, %f2, %f14; sub.f32 %f249, %f3, %f15; bra.uni BB0_36; BB0_29: sub.f32 %f247, %f19, %f1; sub.f32 %f248, %f20, %f2; sub.f32 %f249, %f21, %f3; bra.uni BB0_36; BB0_31: mul.f32 %f135, %f13, 0fC0000000; fma.rn.f32 %f136, %f7, 0f3F000000, %f135; add.f32 %f137, %f14, %f14; mul.f32 %f138, %f8, 0f3F000000; sub.f32 %f139, %f138, %f137; add.f32 %f140, %f15, %f15; mul.f32 %f141, %f9, 0f3F000000; sub.f32 %f142, %f141, %f140; fma.rn.f32 %f247, %f1, 0f3FC00000, %f136; fma.rn.f32 %f248, %f2, 0f3FC00000, %f139; fma.rn.f32 %f249, %f3, 0f3FC00000, %f142; bra.uni BB0_36; BB0_33: mul.f32 %f143, %f25, 0fBF000000; fma.rn.f32 %f144, %f19, 0f40000000, %f143; mul.f32 %f145, %f26, 0fBF000000; fma.rn.f32 %f146, %f20, 0f40000000, %f145; mul.f32 %f147, %f27, 0fBF000000; fma.rn.f32 %f148, %f21, 0f40000000, %f147; mul.f32 %f149, %f1, 0f3FC00000; sub.f32 %f247, %f144, %f149; mul.f32 %f150, %f2, 0f3FC00000; sub.f32 %f248, %f146, %f150; mul.f32 %f151, %f3, 0f3FC00000; sub.f32 %f249, %f148, %f151; BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra BB0_38; rem.s32 %r69, %r23, %r41; add.s32 %r70, %r69, %r41; rem.s32 %r93, %r70, %r41; bra.uni BB0_39; BB0_38: mov.u32 %r71, 0; max.s32 %r93, %r23, %r71; BB0_39: setp.lt.s32 %p45, %r23, 0; mov.f32 %f56, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_41; add.s32 %r72, %r93, %r4; mad.lo.s32 %r73, %r72, %r40, %r1; mul.wide.s32 %rd30, %r73, 4; add.s64 %rd31, %rd3, %rd30; ld.global.nc.f32 %f56, [%rd31]; add.s64 %rd32, %rd2, %rd30; ld.global.nc.f32 %f57, [%rd32]; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f58, [%rd33]; BB0_41: add.s32 %r27, %r2, -1; @%p43 bra BB0_43; rem.s32 %r74, %r27, %r41; add.s32 %r75, %r74, %r41; rem.s32 %r94, %r75, %r41; bra.uni BB0_44; BB0_43: mov.u32 %r76, 0; max.s32 %r94, %r27, %r76; BB0_44: setp.lt.s32 %p48, %r27, 0; mov.f32 %f62, 0f00000000; and.pred %p50, %p48, %p43; 
mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_46; add.s32 %r77, %r94, %r4; mad.lo.s32 %r78, %r77, %r40, %r1; mul.wide.s32 %rd34, %r78, 4; add.s64 %rd35, %rd3, %rd34; ld.global.nc.f32 %f62, [%rd35]; add.s64 %rd36, %rd2, %rd34; ld.global.nc.f32 %f63, [%rd36]; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f64, [%rd37]; BB0_46: add.s32 %r31, %r2, 1; @%p43 bra BB0_48; rem.s32 %r79, %r31, %r41; add.s32 %r80, %r79, %r41; rem.s32 %r95, %r80, %r41; bra.uni BB0_49; BB0_48: add.s32 %r81, %r41, -1; min.s32 %r95, %r31, %r81; BB0_49: setp.ge.s32 %p52, %r31, %r41; mov.f32 %f68, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; @%p54 bra BB0_51; add.s32 %r82, %r95, %r4; mad.lo.s32 %r83, %r82, %r40, %r1; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f68, [%rd39]; add.s64 %rd40, %rd2, %rd38; ld.global.nc.f32 %f69, [%rd40]; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f70, [%rd41]; BB0_51: add.s32 %r35, %r2, 2; @%p43 bra BB0_53; rem.s32 %r84, %r35, %r41; add.s32 %r85, %r84, %r41; rem.s32 %r96, %r85, %r41; bra.uni BB0_54; BB0_53: add.s32 %r86, %r41, -1; min.s32 %r96, %r35, %r86; BB0_54: add.s32 %r87, %r96, %r4; mad.lo.s32 %r39, %r87, %r40, %r1; setp.ge.s32 %p56, %r35, %r41; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p58 bra BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, %rd3, %rd42; ld.global.nc.f32 %f74, [%rd43]; add.s64 %rd44, %rd2, %rd42; ld.global.nc.f32 %f75, [%rd44]; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f76, [%rd45]; BB0_56: mul.f32 %f176, %f69, %f69; fma.rn.f32 %f177, %f68, %f68, %f176; fma.rn.f32 %f77, %f70, %f70, %f177; setp.neu.f32 %p59, %f77, 0f00000000; @%p59 bra BB0_58; mul.f32 %f181, %f63, %f63; fma.rn.f32 %f182, %f62, %f62, %f181; fma.rn.f32 %f183, %f64, %f64, %f182; setp.eq.f32 %p60, %f183, 0f00000000; mov.f32 %f262, 0f00000000; mov.f32 %f263, %f262; mov.f32 %f264, %f262; @%p60 bra BB0_70; BB0_58: mul.f32 %f184, %f57, %f57; fma.rn.f32 
%f185, %f56, %f56, %f184; fma.rn.f32 %f78, %f58, %f58, %f185; setp.neu.f32 %p61, %f78, 0f00000000; mul.f32 %f186, %f75, %f75; fma.rn.f32 %f187, %f74, %f74, %f186; fma.rn.f32 %f79, %f76, %f76, %f187; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, %f77, 0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_60; mul.f32 %f188, %f63, %f63; fma.rn.f32 %f189, %f62, %f62, %f188; fma.rn.f32 %f190, %f64, %f64, %f189; setp.neu.f32 %p66, %f190, 0f00000000; @%p66 bra BB0_69; bra.uni BB0_60; BB0_69: sub.f32 %f219, %f68, %f62; mul.f32 %f262, %f219, 0f3F000000; sub.f32 %f220, %f69, %f63; mul.f32 %f263, %f220, 0f3F000000; sub.f32 %f221, %f70, %f64; mul.f32 %f264, %f221, 0f3F000000; bra.uni BB0_70; BB0_60: or.pred %p68, %p59, %p61; @%p68 bra BB0_62; bra.uni BB0_61; BB0_62: mul.f32 %f191, %f63, %f63; fma.rn.f32 %f192, %f62, %f62, %f191; fma.rn.f32 %f83, %f64, %f64, %f192; setp.neu.f32 %p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_64; bra.uni BB0_63; BB0_64: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_66; bra.uni BB0_65; BB0_66: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_68; bra.uni BB0_67; BB0_68: sub.f32 %f210, %f68, %f62; sub.f32 %f211, %f69, %f63; sub.f32 %f212, %f70, %f64; sub.f32 %f213, %f56, %f74; mul.f32 %f214, %f213, 0f3DAAAAAB; sub.f32 %f215, %f57, %f75; mul.f32 %f216, %f215, 0f3DAAAAAB; sub.f32 %f217, %f58, %f76; mul.f32 %f218, %f217, 0f3DAAAAAB; fma.rn.f32 %f262, %f210, 0f3F2AAAAB, %f214; fma.rn.f32 %f263, %f211, 0f3F2AAAAB, %f216; fma.rn.f32 %f264, %f212, 0f3F2AAAAB, %f218; bra.uni BB0_70; BB0_61: sub.f32 %f262, %f1, %f62; sub.f32 %f263, %f2, %f63; sub.f32 %f264, %f3, %f64; bra.uni BB0_70; BB0_63: sub.f32 %f262, %f68, %f1; sub.f32 %f263, %f69, %f2; sub.f32 %f264, %f70, %f3; bra.uni BB0_70; BB0_65: mul.f32 %f193, %f62, 0fC0000000; fma.rn.f32 %f194, %f56, 0f3F000000, %f193; add.f32 %f195, %f63, %f63; mul.f32 %f196, %f57, 0f3F000000; sub.f32 %f197, %f196, 
%f195; add.f32 %f198, %f64, %f64; mul.f32 %f199, %f58, 0f3F000000; sub.f32 %f200, %f199, %f198; fma.rn.f32 %f262, %f1, 0f3FC00000, %f194; fma.rn.f32 %f263, %f2, 0f3FC00000, %f197; fma.rn.f32 %f264, %f3, 0f3FC00000, %f200; bra.uni BB0_70; BB0_67: mul.f32 %f201, %f74, 0fBF000000; fma.rn.f32 %f202, %f68, 0f40000000, %f201; mul.f32 %f203, %f75, 0fBF000000; fma.rn.f32 %f204, %f69, 0f40000000, %f203; mul.f32 %f205, %f76, 0fBF000000; fma.rn.f32 %f206, %f70, 0f40000000, %f205; mul.f32 %f207, %f1, 0f3FC00000; sub.f32 %f262, %f202, %f207; mul.f32 %f208, %f2, 0f3FC00000; sub.f32 %f263, %f204, %f208; mul.f32 %f209, %f3, 0f3FC00000; sub.f32 %f264, %f206, %f209; BB0_70: mul.f32 %f222, %f249, %f263; mul.f32 %f223, %f248, %f264; sub.f32 %f224, %f223, %f222; mul.f32 %f225, %f247, %f264; mul.f32 %f226, %f249, %f262; sub.f32 %f227, %f226, %f225; mul.f32 %f228, %f248, %f262; mul.f32 %f229, %f247, %f263; sub.f32 %f230, %f229, %f228; mul.f32 %f231, %f2, %f227; fma.rn.f32 %f232, %f1, %f224, %f231; fma.rn.f32 %f233, %f3, %f230, %f232; mul.f32 %f234, %f233, %f102; st.global.f32 [%rd4], %f234; BB0_72: ret; } ` settopologicalcharge_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<77>; .reg .b16 %rs<11>; .reg .f32 %f<265>; .reg .b32 %r<97>; .reg .b64 %rd<46>; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f102, [settopologicalcharge_param_4]; ld.param.u32 %r40, 
[settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; ld.param.u32 %r42, [settopologicalcharge_param_7]; ld.param.u8 %rs3, [settopologicalcharge_param_8]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, %ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r43, %r44, %r45; mov.u32 %r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r46, %r47, %r48; mov.u32 %r49, %ntid.z; mov.u32 %r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r49, %r50, %r51; setp.ge.s32 %p3, %r2, %r41; setp.ge.s32 %p4, %r1, %r40; or.pred %p5, %p3, %p4; setp.ge.s32 %p6, %r3, %r42; or.pred %p7, %p5, %p6; @%p7 bra BB0_72; cvta.to.global.u64 %rd9, %rd5; mul.lo.s32 %r4, %r3, %r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 %r53, %r5, %r1; mul.wide.s32 %rd10, %r53, 4; add.s64 %rd11, %rd3, %rd10; add.s64 %rd12, %rd2, %rd10; add.s64 %rd13, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; ld.global.nc.f32 %f2, [%rd12]; mul.f32 %f103, %f2, %f2; fma.rn.f32 %f104, %f1, %f1, %f103; ld.global.nc.f32 %f3, [%rd13]; fma.rn.f32 %f105, %f3, %f3, %f104; setp.eq.f32 %p8, %f105, 0f00000000; add.s64 %rd4, %rd9, %rd10; @%p8 bra BB0_71; bra.uni BB0_2; BB0_71: mov.u32 %r88, 0; st.global.u32 [%rd4], %r88; bra.uni BB0_72; BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r6, %r1, -2; @%p9 bra BB0_4; rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r89, %r55, %r40; bra.uni BB0_5; BB0_4: mov.u32 %r56, 0; max.s32 %r89, %r6, %r56; BB0_5: setp.lt.s32 %p11, %r6, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p12 bra BB0_7; add.s32 %r57, %r89, %r5; mul.wide.s32 %rd14, %r57, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f7, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f8, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f9, [%rd17]; BB0_7: add.s32 %r10, %r1, -1; @%p9 bra BB0_9; rem.s32 %r58, 
%r10, %r40; add.s32 %r59, %r58, %r40; rem.s32 %r90, %r59, %r40; bra.uni BB0_10; BB0_9: mov.u32 %r60, 0; max.s32 %r90, %r10, %r60; BB0_10: setp.lt.s32 %p14, %r10, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_12; add.s32 %r61, %r90, %r5; mul.wide.s32 %rd18, %r61, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f14, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f15, [%rd21]; BB0_12: add.s32 %r14, %r1, 1; @%p9 bra BB0_14; rem.s32 %r62, %r14, %r40; add.s32 %r63, %r62, %r40; rem.s32 %r91, %r63, %r40; bra.uni BB0_15; BB0_14: add.s32 %r64, %r40, -1; min.s32 %r91, %r14, %r64; BB0_15: setp.ge.s32 %p18, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p20, %p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_17; add.s32 %r65, %r91, %r5; mul.wide.s32 %rd22, %r65, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f19, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f20, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; BB0_17: add.s32 %r18, %r1, 2; @%p9 bra BB0_19; rem.s32 %r66, %r18, %r40; add.s32 %r67, %r66, %r40; rem.s32 %r92, %r67, %r40; bra.uni BB0_20; BB0_19: add.s32 %r68, %r40, -1; min.s32 %r92, %r18, %r68; BB0_20: add.s32 %r22, %r92, %r5; setp.ge.s32 %p22, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p24, %p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f25, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f26, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; BB0_22: mul.f32 %f118, %f20, %f20; fma.rn.f32 %f119, %f19, %f19, %f118; fma.rn.f32 %f28, %f21, %f21, %f119; setp.neu.f32 %p25, %f28, 0f00000000; @%p25 bra BB0_24; mul.f32 %f123, %f14, %f14; fma.rn.f32 %f124, %f13, %f13, %f123; fma.rn.f32 %f125, %f15, %f15, %f124; setp.eq.f32 %p26, %f125, 0f00000000; mov.f32 %f247, 0f00000000; 
mov.f32 %f248, %f247; mov.f32 %f249, %f247; @%p26 bra BB0_36; BB0_24: mul.f32 %f126, %f8, %f8; fma.rn.f32 %f127, %f7, %f7, %f126; fma.rn.f32 %f29, %f9, %f9, %f127; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f128, %f26, %f26; fma.rn.f32 %f129, %f25, %f25, %f128; fma.rn.f32 %f30, %f27, %f27, %f129; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_26; mul.f32 %f130, %f14, %f14; fma.rn.f32 %f131, %f13, %f13, %f130; fma.rn.f32 %f132, %f15, %f15, %f131; setp.neu.f32 %p32, %f132, 0f00000000; @%p32 bra BB0_35; bra.uni BB0_26; BB0_35: sub.f32 %f161, %f19, %f13; mul.f32 %f247, %f161, 0f3F000000; sub.f32 %f162, %f20, %f14; mul.f32 %f248, %f162, 0f3F000000; sub.f32 %f163, %f21, %f15; mul.f32 %f249, %f163, 0f3F000000; bra.uni BB0_36; BB0_26: or.pred %p34, %p25, %p27; @%p34 bra BB0_28; bra.uni BB0_27; BB0_28: mul.f32 %f133, %f14, %f14; fma.rn.f32 %f134, %f13, %f13, %f133; fma.rn.f32 %f34, %f15, %f15, %f134; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, %p35, %p28; @%p37 bra BB0_30; bra.uni BB0_29; BB0_30: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_32; bra.uni BB0_31; BB0_32: setp.eq.f32 %p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_34; bra.uni BB0_33; BB0_34: sub.f32 %f152, %f19, %f13; sub.f32 %f153, %f20, %f14; sub.f32 %f154, %f21, %f15; sub.f32 %f155, %f7, %f25; mul.f32 %f156, %f155, 0f3DAAAAAB; sub.f32 %f157, %f8, %f26; mul.f32 %f158, %f157, 0f3DAAAAAB; sub.f32 %f159, %f9, %f27; mul.f32 %f160, %f159, 0f3DAAAAAB; fma.rn.f32 %f247, %f152, 0f3F2AAAAB, %f156; fma.rn.f32 %f248, %f153, 0f3F2AAAAB, %f158; fma.rn.f32 %f249, %f154, 0f3F2AAAAB, %f160; bra.uni BB0_36; BB0_27: sub.f32 %f247, %f1, %f13; sub.f32 %f248, %f2, %f14; sub.f32 %f249, %f3, %f15; bra.uni BB0_36; BB0_29: sub.f32 %f247, %f19, %f1; sub.f32 %f248, %f20, %f2; sub.f32 %f249, %f21, %f3; bra.uni BB0_36; BB0_31: mul.f32 %f135, %f13, 0fC0000000; fma.rn.f32 %f136, %f7, 
0f3F000000, %f135; add.f32 %f137, %f14, %f14; mul.f32 %f138, %f8, 0f3F000000; sub.f32 %f139, %f138, %f137; add.f32 %f140, %f15, %f15; mul.f32 %f141, %f9, 0f3F000000; sub.f32 %f142, %f141, %f140; fma.rn.f32 %f247, %f1, 0f3FC00000, %f136; fma.rn.f32 %f248, %f2, 0f3FC00000, %f139; fma.rn.f32 %f249, %f3, 0f3FC00000, %f142; bra.uni BB0_36; BB0_33: mul.f32 %f143, %f25, 0fBF000000; fma.rn.f32 %f144, %f19, 0f40000000, %f143; mul.f32 %f145, %f26, 0fBF000000; fma.rn.f32 %f146, %f20, 0f40000000, %f145; mul.f32 %f147, %f27, 0fBF000000; fma.rn.f32 %f148, %f21, 0f40000000, %f147; mul.f32 %f149, %f1, 0f3FC00000; sub.f32 %f247, %f144, %f149; mul.f32 %f150, %f2, 0f3FC00000; sub.f32 %f248, %f146, %f150; mul.f32 %f151, %f3, 0f3FC00000; sub.f32 %f249, %f148, %f151; BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra BB0_38; rem.s32 %r69, %r23, %r41; add.s32 %r70, %r69, %r41; rem.s32 %r93, %r70, %r41; bra.uni BB0_39; BB0_38: mov.u32 %r71, 0; max.s32 %r93, %r23, %r71; BB0_39: setp.lt.s32 %p45, %r23, 0; mov.f32 %f56, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_41; add.s32 %r72, %r93, %r4; mad.lo.s32 %r73, %r72, %r40, %r1; mul.wide.s32 %rd30, %r73, 4; add.s64 %rd31, %rd3, %rd30; ld.global.nc.f32 %f56, [%rd31]; add.s64 %rd32, %rd2, %rd30; ld.global.nc.f32 %f57, [%rd32]; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f58, [%rd33]; BB0_41: add.s32 %r27, %r2, -1; @%p43 bra BB0_43; rem.s32 %r74, %r27, %r41; add.s32 %r75, %r74, %r41; rem.s32 %r94, %r75, %r41; bra.uni BB0_44; BB0_43: mov.u32 %r76, 0; max.s32 %r94, %r27, %r76; BB0_44: setp.lt.s32 %p48, %r27, 0; mov.f32 %f62, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_46; add.s32 %r77, %r94, %r4; mad.lo.s32 %r78, %r77, %r40, %r1; mul.wide.s32 %rd34, %r78, 4; add.s64 %rd35, %rd3, %rd34; ld.global.nc.f32 %f62, [%rd35]; add.s64 %rd36, %rd2, %rd34; ld.global.nc.f32 %f63, [%rd36]; add.s64 %rd37, %rd1, %rd34; 
ld.global.nc.f32 %f64, [%rd37]; BB0_46: add.s32 %r31, %r2, 1; @%p43 bra BB0_48; rem.s32 %r79, %r31, %r41; add.s32 %r80, %r79, %r41; rem.s32 %r95, %r80, %r41; bra.uni BB0_49; BB0_48: add.s32 %r81, %r41, -1; min.s32 %r95, %r31, %r81; BB0_49: setp.ge.s32 %p52, %r31, %r41; mov.f32 %f68, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; @%p54 bra BB0_51; add.s32 %r82, %r95, %r4; mad.lo.s32 %r83, %r82, %r40, %r1; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f68, [%rd39]; add.s64 %rd40, %rd2, %rd38; ld.global.nc.f32 %f69, [%rd40]; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f70, [%rd41]; BB0_51: add.s32 %r35, %r2, 2; @%p43 bra BB0_53; rem.s32 %r84, %r35, %r41; add.s32 %r85, %r84, %r41; rem.s32 %r96, %r85, %r41; bra.uni BB0_54; BB0_53: add.s32 %r86, %r41, -1; min.s32 %r96, %r35, %r86; BB0_54: add.s32 %r87, %r96, %r4; mad.lo.s32 %r39, %r87, %r40, %r1; setp.ge.s32 %p56, %r35, %r41; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p58 bra BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, %rd3, %rd42; ld.global.nc.f32 %f74, [%rd43]; add.s64 %rd44, %rd2, %rd42; ld.global.nc.f32 %f75, [%rd44]; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f76, [%rd45]; BB0_56: mul.f32 %f176, %f69, %f69; fma.rn.f32 %f177, %f68, %f68, %f176; fma.rn.f32 %f77, %f70, %f70, %f177; setp.neu.f32 %p59, %f77, 0f00000000; @%p59 bra BB0_58; mul.f32 %f181, %f63, %f63; fma.rn.f32 %f182, %f62, %f62, %f181; fma.rn.f32 %f183, %f64, %f64, %f182; setp.eq.f32 %p60, %f183, 0f00000000; mov.f32 %f262, 0f00000000; mov.f32 %f263, %f262; mov.f32 %f264, %f262; @%p60 bra BB0_70; BB0_58: mul.f32 %f184, %f57, %f57; fma.rn.f32 %f185, %f56, %f56, %f184; fma.rn.f32 %f78, %f58, %f58, %f185; setp.neu.f32 %p61, %f78, 0f00000000; mul.f32 %f186, %f75, %f75; fma.rn.f32 %f187, %f74, %f74, %f186; fma.rn.f32 %f79, %f76, %f76, %f187; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, %f77, 
0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_60; mul.f32 %f188, %f63, %f63; fma.rn.f32 %f189, %f62, %f62, %f188; fma.rn.f32 %f190, %f64, %f64, %f189; setp.neu.f32 %p66, %f190, 0f00000000; @%p66 bra BB0_69; bra.uni BB0_60; BB0_69: sub.f32 %f219, %f68, %f62; mul.f32 %f262, %f219, 0f3F000000; sub.f32 %f220, %f69, %f63; mul.f32 %f263, %f220, 0f3F000000; sub.f32 %f221, %f70, %f64; mul.f32 %f264, %f221, 0f3F000000; bra.uni BB0_70; BB0_60: or.pred %p68, %p59, %p61; @%p68 bra BB0_62; bra.uni BB0_61; BB0_62: mul.f32 %f191, %f63, %f63; fma.rn.f32 %f192, %f62, %f62, %f191; fma.rn.f32 %f83, %f64, %f64, %f192; setp.neu.f32 %p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_64; bra.uni BB0_63; BB0_64: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_66; bra.uni BB0_65; BB0_66: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_68; bra.uni BB0_67; BB0_68: sub.f32 %f210, %f68, %f62; sub.f32 %f211, %f69, %f63; sub.f32 %f212, %f70, %f64; sub.f32 %f213, %f56, %f74; mul.f32 %f214, %f213, 0f3DAAAAAB; sub.f32 %f215, %f57, %f75; mul.f32 %f216, %f215, 0f3DAAAAAB; sub.f32 %f217, %f58, %f76; mul.f32 %f218, %f217, 0f3DAAAAAB; fma.rn.f32 %f262, %f210, 0f3F2AAAAB, %f214; fma.rn.f32 %f263, %f211, 0f3F2AAAAB, %f216; fma.rn.f32 %f264, %f212, 0f3F2AAAAB, %f218; bra.uni BB0_70; BB0_61: sub.f32 %f262, %f1, %f62; sub.f32 %f263, %f2, %f63; sub.f32 %f264, %f3, %f64; bra.uni BB0_70; BB0_63: sub.f32 %f262, %f68, %f1; sub.f32 %f263, %f69, %f2; sub.f32 %f264, %f70, %f3; bra.uni BB0_70; BB0_65: mul.f32 %f193, %f62, 0fC0000000; fma.rn.f32 %f194, %f56, 0f3F000000, %f193; add.f32 %f195, %f63, %f63; mul.f32 %f196, %f57, 0f3F000000; sub.f32 %f197, %f196, %f195; add.f32 %f198, %f64, %f64; mul.f32 %f199, %f58, 0f3F000000; sub.f32 %f200, %f199, %f198; fma.rn.f32 %f262, %f1, 0f3FC00000, %f194; fma.rn.f32 %f263, %f2, 0f3FC00000, %f197; fma.rn.f32 %f264, %f3, 0f3FC00000, %f200; bra.uni BB0_70; BB0_67: mul.f32 %f201, %f74, 0fBF000000; fma.rn.f32 
%f202, %f68, 0f40000000, %f201; mul.f32 %f203, %f75, 0fBF000000; fma.rn.f32 %f204, %f69, 0f40000000, %f203; mul.f32 %f205, %f76, 0fBF000000; fma.rn.f32 %f206, %f70, 0f40000000, %f205; mul.f32 %f207, %f1, 0f3FC00000; sub.f32 %f262, %f202, %f207; mul.f32 %f208, %f2, 0f3FC00000; sub.f32 %f263, %f204, %f208; mul.f32 %f209, %f3, 0f3FC00000; sub.f32 %f264, %f206, %f209; BB0_70: mul.f32 %f222, %f249, %f263; mul.f32 %f223, %f248, %f264; sub.f32 %f224, %f223, %f222; mul.f32 %f225, %f247, %f264; mul.f32 %f226, %f249, %f262; sub.f32 %f227, %f226, %f225; mul.f32 %f228, %f248, %f262; mul.f32 %f229, %f247, %f263; sub.f32 %f230, %f229, %f228; mul.f32 %f231, %f2, %f227; fma.rn.f32 %f232, %f1, %f224, %f231; fma.rn.f32 %f233, %f3, %f230, %f232; mul.f32 %f234, %f233, %f102; st.global.f32 [%rd4], %f234; BB0_72: ret; } ` settopologicalcharge_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<77>; .reg .b16 %rs<11>; .reg .f32 %f<265>; .reg .b32 %r<97>; .reg .b64 %rd<46>; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f102, [settopologicalcharge_param_4]; ld.param.u32 %r40, [settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; ld.param.u32 %r42, [settopologicalcharge_param_7]; ld.param.u8 %rs3, [settopologicalcharge_param_8]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, 
%ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r43, %r44, %r45; mov.u32 %r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r46, %r47, %r48; mov.u32 %r49, %ntid.z; mov.u32 %r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r49, %r50, %r51; setp.ge.s32 %p3, %r2, %r41; setp.ge.s32 %p4, %r1, %r40; or.pred %p5, %p3, %p4; setp.ge.s32 %p6, %r3, %r42; or.pred %p7, %p5, %p6; @%p7 bra BB0_72; cvta.to.global.u64 %rd9, %rd5; mul.lo.s32 %r4, %r3, %r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 %r53, %r5, %r1; mul.wide.s32 %rd10, %r53, 4; add.s64 %rd11, %rd3, %rd10; add.s64 %rd12, %rd2, %rd10; add.s64 %rd13, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; ld.global.nc.f32 %f2, [%rd12]; mul.f32 %f103, %f2, %f2; fma.rn.f32 %f104, %f1, %f1, %f103; ld.global.nc.f32 %f3, [%rd13]; fma.rn.f32 %f105, %f3, %f3, %f104; setp.eq.f32 %p8, %f105, 0f00000000; add.s64 %rd4, %rd9, %rd10; @%p8 bra BB0_71; bra.uni BB0_2; BB0_71: mov.u32 %r88, 0; st.global.u32 [%rd4], %r88; bra.uni BB0_72; BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r6, %r1, -2; @%p9 bra BB0_4; rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r89, %r55, %r40; bra.uni BB0_5; BB0_4: mov.u32 %r56, 0; max.s32 %r89, %r6, %r56; BB0_5: setp.lt.s32 %p11, %r6, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p12 bra BB0_7; add.s32 %r57, %r89, %r5; mul.wide.s32 %rd14, %r57, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f7, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f8, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f9, [%rd17]; BB0_7: add.s32 %r10, %r1, -1; @%p9 bra BB0_9; rem.s32 %r58, %r10, %r40; add.s32 %r59, %r58, %r40; rem.s32 %r90, %r59, %r40; bra.uni BB0_10; BB0_9: mov.u32 %r60, 0; max.s32 %r90, %r10, %r60; BB0_10: setp.lt.s32 %p14, %r10, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_12; add.s32 %r61, %r90, 
%r5; mul.wide.s32 %rd18, %r61, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f14, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f15, [%rd21]; BB0_12: add.s32 %r14, %r1, 1; @%p9 bra BB0_14; rem.s32 %r62, %r14, %r40; add.s32 %r63, %r62, %r40; rem.s32 %r91, %r63, %r40; bra.uni BB0_15; BB0_14: add.s32 %r64, %r40, -1; min.s32 %r91, %r14, %r64; BB0_15: setp.ge.s32 %p18, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p20, %p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_17; add.s32 %r65, %r91, %r5; mul.wide.s32 %rd22, %r65, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f19, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f20, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; BB0_17: add.s32 %r18, %r1, 2; @%p9 bra BB0_19; rem.s32 %r66, %r18, %r40; add.s32 %r67, %r66, %r40; rem.s32 %r92, %r67, %r40; bra.uni BB0_20; BB0_19: add.s32 %r68, %r40, -1; min.s32 %r92, %r18, %r68; BB0_20: add.s32 %r22, %r92, %r5; setp.ge.s32 %p22, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p24, %p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f25, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f26, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; BB0_22: mul.f32 %f118, %f20, %f20; fma.rn.f32 %f119, %f19, %f19, %f118; fma.rn.f32 %f28, %f21, %f21, %f119; setp.neu.f32 %p25, %f28, 0f00000000; @%p25 bra BB0_24; mul.f32 %f123, %f14, %f14; fma.rn.f32 %f124, %f13, %f13, %f123; fma.rn.f32 %f125, %f15, %f15, %f124; setp.eq.f32 %p26, %f125, 0f00000000; mov.f32 %f247, 0f00000000; mov.f32 %f248, %f247; mov.f32 %f249, %f247; @%p26 bra BB0_36; BB0_24: mul.f32 %f126, %f8, %f8; fma.rn.f32 %f127, %f7, %f7, %f126; fma.rn.f32 %f29, %f9, %f9, %f127; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f128, %f26, %f26; fma.rn.f32 %f129, %f25, %f25, %f128; fma.rn.f32 %f30, %f27, %f27, 
%f129; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_26; mul.f32 %f130, %f14, %f14; fma.rn.f32 %f131, %f13, %f13, %f130; fma.rn.f32 %f132, %f15, %f15, %f131; setp.neu.f32 %p32, %f132, 0f00000000; @%p32 bra BB0_35; bra.uni BB0_26; BB0_35: sub.f32 %f161, %f19, %f13; mul.f32 %f247, %f161, 0f3F000000; sub.f32 %f162, %f20, %f14; mul.f32 %f248, %f162, 0f3F000000; sub.f32 %f163, %f21, %f15; mul.f32 %f249, %f163, 0f3F000000; bra.uni BB0_36; BB0_26: or.pred %p34, %p25, %p27; @%p34 bra BB0_28; bra.uni BB0_27; BB0_28: mul.f32 %f133, %f14, %f14; fma.rn.f32 %f134, %f13, %f13, %f133; fma.rn.f32 %f34, %f15, %f15, %f134; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, %p35, %p28; @%p37 bra BB0_30; bra.uni BB0_29; BB0_30: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_32; bra.uni BB0_31; BB0_32: setp.eq.f32 %p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_34; bra.uni BB0_33; BB0_34: sub.f32 %f152, %f19, %f13; sub.f32 %f153, %f20, %f14; sub.f32 %f154, %f21, %f15; sub.f32 %f155, %f7, %f25; mul.f32 %f156, %f155, 0f3DAAAAAB; sub.f32 %f157, %f8, %f26; mul.f32 %f158, %f157, 0f3DAAAAAB; sub.f32 %f159, %f9, %f27; mul.f32 %f160, %f159, 0f3DAAAAAB; fma.rn.f32 %f247, %f152, 0f3F2AAAAB, %f156; fma.rn.f32 %f248, %f153, 0f3F2AAAAB, %f158; fma.rn.f32 %f249, %f154, 0f3F2AAAAB, %f160; bra.uni BB0_36; BB0_27: sub.f32 %f247, %f1, %f13; sub.f32 %f248, %f2, %f14; sub.f32 %f249, %f3, %f15; bra.uni BB0_36; BB0_29: sub.f32 %f247, %f19, %f1; sub.f32 %f248, %f20, %f2; sub.f32 %f249, %f21, %f3; bra.uni BB0_36; BB0_31: mul.f32 %f135, %f13, 0fC0000000; fma.rn.f32 %f136, %f7, 0f3F000000, %f135; add.f32 %f137, %f14, %f14; mul.f32 %f138, %f8, 0f3F000000; sub.f32 %f139, %f138, %f137; add.f32 %f140, %f15, %f15; mul.f32 %f141, %f9, 0f3F000000; sub.f32 %f142, %f141, %f140; fma.rn.f32 %f247, %f1, 0f3FC00000, %f136; fma.rn.f32 %f248, %f2, 0f3FC00000, %f139; fma.rn.f32 %f249, %f3, 
0f3FC00000, %f142; bra.uni BB0_36; BB0_33: mul.f32 %f143, %f25, 0fBF000000; fma.rn.f32 %f144, %f19, 0f40000000, %f143; mul.f32 %f145, %f26, 0fBF000000; fma.rn.f32 %f146, %f20, 0f40000000, %f145; mul.f32 %f147, %f27, 0fBF000000; fma.rn.f32 %f148, %f21, 0f40000000, %f147; mul.f32 %f149, %f1, 0f3FC00000; sub.f32 %f247, %f144, %f149; mul.f32 %f150, %f2, 0f3FC00000; sub.f32 %f248, %f146, %f150; mul.f32 %f151, %f3, 0f3FC00000; sub.f32 %f249, %f148, %f151; BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra BB0_38; rem.s32 %r69, %r23, %r41; add.s32 %r70, %r69, %r41; rem.s32 %r93, %r70, %r41; bra.uni BB0_39; BB0_38: mov.u32 %r71, 0; max.s32 %r93, %r23, %r71; BB0_39: setp.lt.s32 %p45, %r23, 0; mov.f32 %f56, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_41; add.s32 %r72, %r93, %r4; mad.lo.s32 %r73, %r72, %r40, %r1; mul.wide.s32 %rd30, %r73, 4; add.s64 %rd31, %rd3, %rd30; ld.global.nc.f32 %f56, [%rd31]; add.s64 %rd32, %rd2, %rd30; ld.global.nc.f32 %f57, [%rd32]; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f58, [%rd33]; BB0_41: add.s32 %r27, %r2, -1; @%p43 bra BB0_43; rem.s32 %r74, %r27, %r41; add.s32 %r75, %r74, %r41; rem.s32 %r94, %r75, %r41; bra.uni BB0_44; BB0_43: mov.u32 %r76, 0; max.s32 %r94, %r27, %r76; BB0_44: setp.lt.s32 %p48, %r27, 0; mov.f32 %f62, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_46; add.s32 %r77, %r94, %r4; mad.lo.s32 %r78, %r77, %r40, %r1; mul.wide.s32 %rd34, %r78, 4; add.s64 %rd35, %rd3, %rd34; ld.global.nc.f32 %f62, [%rd35]; add.s64 %rd36, %rd2, %rd34; ld.global.nc.f32 %f63, [%rd36]; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f64, [%rd37]; BB0_46: add.s32 %r31, %r2, 1; @%p43 bra BB0_48; rem.s32 %r79, %r31, %r41; add.s32 %r80, %r79, %r41; rem.s32 %r95, %r80, %r41; bra.uni BB0_49; BB0_48: add.s32 %r81, %r41, -1; min.s32 %r95, %r31, %r81; BB0_49: setp.ge.s32 %p52, %r31, %r41; mov.f32 %f68, 0f00000000; and.pred %p54, 
%p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; @%p54 bra BB0_51; add.s32 %r82, %r95, %r4; mad.lo.s32 %r83, %r82, %r40, %r1; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f68, [%rd39]; add.s64 %rd40, %rd2, %rd38; ld.global.nc.f32 %f69, [%rd40]; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f70, [%rd41]; BB0_51: add.s32 %r35, %r2, 2; @%p43 bra BB0_53; rem.s32 %r84, %r35, %r41; add.s32 %r85, %r84, %r41; rem.s32 %r96, %r85, %r41; bra.uni BB0_54; BB0_53: add.s32 %r86, %r41, -1; min.s32 %r96, %r35, %r86; BB0_54: add.s32 %r87, %r96, %r4; mad.lo.s32 %r39, %r87, %r40, %r1; setp.ge.s32 %p56, %r35, %r41; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p58 bra BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, %rd3, %rd42; ld.global.nc.f32 %f74, [%rd43]; add.s64 %rd44, %rd2, %rd42; ld.global.nc.f32 %f75, [%rd44]; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f76, [%rd45]; BB0_56: mul.f32 %f176, %f69, %f69; fma.rn.f32 %f177, %f68, %f68, %f176; fma.rn.f32 %f77, %f70, %f70, %f177; setp.neu.f32 %p59, %f77, 0f00000000; @%p59 bra BB0_58; mul.f32 %f181, %f63, %f63; fma.rn.f32 %f182, %f62, %f62, %f181; fma.rn.f32 %f183, %f64, %f64, %f182; setp.eq.f32 %p60, %f183, 0f00000000; mov.f32 %f262, 0f00000000; mov.f32 %f263, %f262; mov.f32 %f264, %f262; @%p60 bra BB0_70; BB0_58: mul.f32 %f184, %f57, %f57; fma.rn.f32 %f185, %f56, %f56, %f184; fma.rn.f32 %f78, %f58, %f58, %f185; setp.neu.f32 %p61, %f78, 0f00000000; mul.f32 %f186, %f75, %f75; fma.rn.f32 %f187, %f74, %f74, %f186; fma.rn.f32 %f79, %f76, %f76, %f187; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, %f77, 0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_60; mul.f32 %f188, %f63, %f63; fma.rn.f32 %f189, %f62, %f62, %f188; fma.rn.f32 %f190, %f64, %f64, %f189; setp.neu.f32 %p66, %f190, 0f00000000; @%p66 bra BB0_69; bra.uni BB0_60; BB0_69: sub.f32 %f219, %f68, %f62; mul.f32 %f262, %f219, 0f3F000000; sub.f32 %f220, %f69, 
%f63; mul.f32 %f263, %f220, 0f3F000000; sub.f32 %f221, %f70, %f64; mul.f32 %f264, %f221, 0f3F000000; bra.uni BB0_70; BB0_60: or.pred %p68, %p59, %p61; @%p68 bra BB0_62; bra.uni BB0_61; BB0_62: mul.f32 %f191, %f63, %f63; fma.rn.f32 %f192, %f62, %f62, %f191; fma.rn.f32 %f83, %f64, %f64, %f192; setp.neu.f32 %p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_64; bra.uni BB0_63; BB0_64: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_66; bra.uni BB0_65; BB0_66: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_68; bra.uni BB0_67; BB0_68: sub.f32 %f210, %f68, %f62; sub.f32 %f211, %f69, %f63; sub.f32 %f212, %f70, %f64; sub.f32 %f213, %f56, %f74; mul.f32 %f214, %f213, 0f3DAAAAAB; sub.f32 %f215, %f57, %f75; mul.f32 %f216, %f215, 0f3DAAAAAB; sub.f32 %f217, %f58, %f76; mul.f32 %f218, %f217, 0f3DAAAAAB; fma.rn.f32 %f262, %f210, 0f3F2AAAAB, %f214; fma.rn.f32 %f263, %f211, 0f3F2AAAAB, %f216; fma.rn.f32 %f264, %f212, 0f3F2AAAAB, %f218; bra.uni BB0_70; BB0_61: sub.f32 %f262, %f1, %f62; sub.f32 %f263, %f2, %f63; sub.f32 %f264, %f3, %f64; bra.uni BB0_70; BB0_63: sub.f32 %f262, %f68, %f1; sub.f32 %f263, %f69, %f2; sub.f32 %f264, %f70, %f3; bra.uni BB0_70; BB0_65: mul.f32 %f193, %f62, 0fC0000000; fma.rn.f32 %f194, %f56, 0f3F000000, %f193; add.f32 %f195, %f63, %f63; mul.f32 %f196, %f57, 0f3F000000; sub.f32 %f197, %f196, %f195; add.f32 %f198, %f64, %f64; mul.f32 %f199, %f58, 0f3F000000; sub.f32 %f200, %f199, %f198; fma.rn.f32 %f262, %f1, 0f3FC00000, %f194; fma.rn.f32 %f263, %f2, 0f3FC00000, %f197; fma.rn.f32 %f264, %f3, 0f3FC00000, %f200; bra.uni BB0_70; BB0_67: mul.f32 %f201, %f74, 0fBF000000; fma.rn.f32 %f202, %f68, 0f40000000, %f201; mul.f32 %f203, %f75, 0fBF000000; fma.rn.f32 %f204, %f69, 0f40000000, %f203; mul.f32 %f205, %f76, 0fBF000000; fma.rn.f32 %f206, %f70, 0f40000000, %f205; mul.f32 %f207, %f1, 0f3FC00000; sub.f32 %f262, %f202, %f207; mul.f32 %f208, %f2, 0f3FC00000; sub.f32 %f263, %f204, %f208; mul.f32 %f209, 
%f3, 0f3FC00000; sub.f32 %f264, %f206, %f209; BB0_70: mul.f32 %f222, %f249, %f263; mul.f32 %f223, %f248, %f264; sub.f32 %f224, %f223, %f222; mul.f32 %f225, %f247, %f264; mul.f32 %f226, %f249, %f262; sub.f32 %f227, %f226, %f225; mul.f32 %f228, %f248, %f262; mul.f32 %f229, %f247, %f263; sub.f32 %f230, %f229, %f228; mul.f32 %f231, %f2, %f227; fma.rn.f32 %f232, %f1, %f224, %f231; fma.rn.f32 %f233, %f3, %f230, %f232; mul.f32 %f234, %f233, %f102; st.global.f32 [%rd4], %f234; BB0_72: ret; } ` settopologicalcharge_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<77>; .reg .b16 %rs<11>; .reg .f32 %f<265>; .reg .b32 %r<97>; .reg .b64 %rd<46>; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f102, [settopologicalcharge_param_4]; ld.param.u32 %r40, [settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; ld.param.u32 %r42, [settopologicalcharge_param_7]; ld.param.u8 %rs3, [settopologicalcharge_param_8]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, %ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r43, %r44, %r45; mov.u32 %r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r46, %r47, %r48; mov.u32 %r49, %ntid.z; mov.u32 %r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r49, %r50, %r51; setp.ge.s32 %p3, %r2, %r41; 
setp.ge.s32 %p4, %r1, %r40; or.pred %p5, %p3, %p4; setp.ge.s32 %p6, %r3, %r42; or.pred %p7, %p5, %p6; @%p7 bra BB0_72; cvta.to.global.u64 %rd9, %rd5; mul.lo.s32 %r4, %r3, %r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 %r53, %r5, %r1; mul.wide.s32 %rd10, %r53, 4; add.s64 %rd11, %rd3, %rd10; add.s64 %rd12, %rd2, %rd10; add.s64 %rd13, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; ld.global.nc.f32 %f2, [%rd12]; mul.f32 %f103, %f2, %f2; fma.rn.f32 %f104, %f1, %f1, %f103; ld.global.nc.f32 %f3, [%rd13]; fma.rn.f32 %f105, %f3, %f3, %f104; setp.eq.f32 %p8, %f105, 0f00000000; add.s64 %rd4, %rd9, %rd10; @%p8 bra BB0_71; bra.uni BB0_2; BB0_71: mov.u32 %r88, 0; st.global.u32 [%rd4], %r88; bra.uni BB0_72; BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r6, %r1, -2; @%p9 bra BB0_4; rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r89, %r55, %r40; bra.uni BB0_5; BB0_4: mov.u32 %r56, 0; max.s32 %r89, %r6, %r56; BB0_5: setp.lt.s32 %p11, %r6, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p12 bra BB0_7; add.s32 %r57, %r89, %r5; mul.wide.s32 %rd14, %r57, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f7, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f8, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f9, [%rd17]; BB0_7: add.s32 %r10, %r1, -1; @%p9 bra BB0_9; rem.s32 %r58, %r10, %r40; add.s32 %r59, %r58, %r40; rem.s32 %r90, %r59, %r40; bra.uni BB0_10; BB0_9: mov.u32 %r60, 0; max.s32 %r90, %r10, %r60; BB0_10: setp.lt.s32 %p14, %r10, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_12; add.s32 %r61, %r90, %r5; mul.wide.s32 %rd18, %r61, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f14, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f15, [%rd21]; BB0_12: add.s32 %r14, %r1, 1; @%p9 bra BB0_14; rem.s32 %r62, %r14, %r40; add.s32 %r63, %r62, %r40; rem.s32 
%r91, %r63, %r40; bra.uni BB0_15; BB0_14: add.s32 %r64, %r40, -1; min.s32 %r91, %r14, %r64; BB0_15: setp.ge.s32 %p18, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p20, %p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_17; add.s32 %r65, %r91, %r5; mul.wide.s32 %rd22, %r65, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f19, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f20, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; BB0_17: add.s32 %r18, %r1, 2; @%p9 bra BB0_19; rem.s32 %r66, %r18, %r40; add.s32 %r67, %r66, %r40; rem.s32 %r92, %r67, %r40; bra.uni BB0_20; BB0_19: add.s32 %r68, %r40, -1; min.s32 %r92, %r18, %r68; BB0_20: add.s32 %r22, %r92, %r5; setp.ge.s32 %p22, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p24, %p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f25, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f26, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; BB0_22: mul.f32 %f118, %f20, %f20; fma.rn.f32 %f119, %f19, %f19, %f118; fma.rn.f32 %f28, %f21, %f21, %f119; setp.neu.f32 %p25, %f28, 0f00000000; @%p25 bra BB0_24; mul.f32 %f123, %f14, %f14; fma.rn.f32 %f124, %f13, %f13, %f123; fma.rn.f32 %f125, %f15, %f15, %f124; setp.eq.f32 %p26, %f125, 0f00000000; mov.f32 %f247, 0f00000000; mov.f32 %f248, %f247; mov.f32 %f249, %f247; @%p26 bra BB0_36; BB0_24: mul.f32 %f126, %f8, %f8; fma.rn.f32 %f127, %f7, %f7, %f126; fma.rn.f32 %f29, %f9, %f9, %f127; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f128, %f26, %f26; fma.rn.f32 %f129, %f25, %f25, %f128; fma.rn.f32 %f30, %f27, %f27, %f129; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_26; mul.f32 %f130, %f14, %f14; fma.rn.f32 %f131, %f13, %f13, %f130; fma.rn.f32 %f132, %f15, %f15, %f131; setp.neu.f32 %p32, %f132, 0f00000000; @%p32 bra BB0_35; bra.uni 
BB0_26; BB0_35: sub.f32 %f161, %f19, %f13; mul.f32 %f247, %f161, 0f3F000000; sub.f32 %f162, %f20, %f14; mul.f32 %f248, %f162, 0f3F000000; sub.f32 %f163, %f21, %f15; mul.f32 %f249, %f163, 0f3F000000; bra.uni BB0_36; BB0_26: or.pred %p34, %p25, %p27; @%p34 bra BB0_28; bra.uni BB0_27; BB0_28: mul.f32 %f133, %f14, %f14; fma.rn.f32 %f134, %f13, %f13, %f133; fma.rn.f32 %f34, %f15, %f15, %f134; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, %p35, %p28; @%p37 bra BB0_30; bra.uni BB0_29; BB0_30: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_32; bra.uni BB0_31; BB0_32: setp.eq.f32 %p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_34; bra.uni BB0_33; BB0_34: sub.f32 %f152, %f19, %f13; sub.f32 %f153, %f20, %f14; sub.f32 %f154, %f21, %f15; sub.f32 %f155, %f7, %f25; mul.f32 %f156, %f155, 0f3DAAAAAB; sub.f32 %f157, %f8, %f26; mul.f32 %f158, %f157, 0f3DAAAAAB; sub.f32 %f159, %f9, %f27; mul.f32 %f160, %f159, 0f3DAAAAAB; fma.rn.f32 %f247, %f152, 0f3F2AAAAB, %f156; fma.rn.f32 %f248, %f153, 0f3F2AAAAB, %f158; fma.rn.f32 %f249, %f154, 0f3F2AAAAB, %f160; bra.uni BB0_36; BB0_27: sub.f32 %f247, %f1, %f13; sub.f32 %f248, %f2, %f14; sub.f32 %f249, %f3, %f15; bra.uni BB0_36; BB0_29: sub.f32 %f247, %f19, %f1; sub.f32 %f248, %f20, %f2; sub.f32 %f249, %f21, %f3; bra.uni BB0_36; BB0_31: mul.f32 %f135, %f13, 0fC0000000; fma.rn.f32 %f136, %f7, 0f3F000000, %f135; add.f32 %f137, %f14, %f14; mul.f32 %f138, %f8, 0f3F000000; sub.f32 %f139, %f138, %f137; add.f32 %f140, %f15, %f15; mul.f32 %f141, %f9, 0f3F000000; sub.f32 %f142, %f141, %f140; fma.rn.f32 %f247, %f1, 0f3FC00000, %f136; fma.rn.f32 %f248, %f2, 0f3FC00000, %f139; fma.rn.f32 %f249, %f3, 0f3FC00000, %f142; bra.uni BB0_36; BB0_33: mul.f32 %f143, %f25, 0fBF000000; fma.rn.f32 %f144, %f19, 0f40000000, %f143; mul.f32 %f145, %f26, 0fBF000000; fma.rn.f32 %f146, %f20, 0f40000000, %f145; mul.f32 %f147, %f27, 0fBF000000; fma.rn.f32 %f148, %f21, 0f40000000, %f147; mul.f32 %f149, %f1, 0f3FC00000; sub.f32 
%f247, %f144, %f149; mul.f32 %f150, %f2, 0f3FC00000; sub.f32 %f248, %f146, %f150; mul.f32 %f151, %f3, 0f3FC00000; sub.f32 %f249, %f148, %f151; BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra BB0_38; rem.s32 %r69, %r23, %r41; add.s32 %r70, %r69, %r41; rem.s32 %r93, %r70, %r41; bra.uni BB0_39; BB0_38: mov.u32 %r71, 0; max.s32 %r93, %r23, %r71; BB0_39: setp.lt.s32 %p45, %r23, 0; mov.f32 %f56, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_41; add.s32 %r72, %r93, %r4; mad.lo.s32 %r73, %r72, %r40, %r1; mul.wide.s32 %rd30, %r73, 4; add.s64 %rd31, %rd3, %rd30; ld.global.nc.f32 %f56, [%rd31]; add.s64 %rd32, %rd2, %rd30; ld.global.nc.f32 %f57, [%rd32]; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f58, [%rd33]; BB0_41: add.s32 %r27, %r2, -1; @%p43 bra BB0_43; rem.s32 %r74, %r27, %r41; add.s32 %r75, %r74, %r41; rem.s32 %r94, %r75, %r41; bra.uni BB0_44; BB0_43: mov.u32 %r76, 0; max.s32 %r94, %r27, %r76; BB0_44: setp.lt.s32 %p48, %r27, 0; mov.f32 %f62, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_46; add.s32 %r77, %r94, %r4; mad.lo.s32 %r78, %r77, %r40, %r1; mul.wide.s32 %rd34, %r78, 4; add.s64 %rd35, %rd3, %rd34; ld.global.nc.f32 %f62, [%rd35]; add.s64 %rd36, %rd2, %rd34; ld.global.nc.f32 %f63, [%rd36]; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f64, [%rd37]; BB0_46: add.s32 %r31, %r2, 1; @%p43 bra BB0_48; rem.s32 %r79, %r31, %r41; add.s32 %r80, %r79, %r41; rem.s32 %r95, %r80, %r41; bra.uni BB0_49; BB0_48: add.s32 %r81, %r41, -1; min.s32 %r95, %r31, %r81; BB0_49: setp.ge.s32 %p52, %r31, %r41; mov.f32 %f68, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; @%p54 bra BB0_51; add.s32 %r82, %r95, %r4; mad.lo.s32 %r83, %r82, %r40, %r1; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f68, [%rd39]; add.s64 %rd40, %rd2, %rd38; ld.global.nc.f32 %f69, [%rd40]; add.s64 %rd41, %rd1, %rd38; 
ld.global.nc.f32 %f70, [%rd41]; BB0_51: add.s32 %r35, %r2, 2; @%p43 bra BB0_53; rem.s32 %r84, %r35, %r41; add.s32 %r85, %r84, %r41; rem.s32 %r96, %r85, %r41; bra.uni BB0_54; BB0_53: add.s32 %r86, %r41, -1; min.s32 %r96, %r35, %r86; BB0_54: add.s32 %r87, %r96, %r4; mad.lo.s32 %r39, %r87, %r40, %r1; setp.ge.s32 %p56, %r35, %r41; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p58 bra BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, %rd3, %rd42; ld.global.nc.f32 %f74, [%rd43]; add.s64 %rd44, %rd2, %rd42; ld.global.nc.f32 %f75, [%rd44]; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f76, [%rd45]; BB0_56: mul.f32 %f176, %f69, %f69; fma.rn.f32 %f177, %f68, %f68, %f176; fma.rn.f32 %f77, %f70, %f70, %f177; setp.neu.f32 %p59, %f77, 0f00000000; @%p59 bra BB0_58; mul.f32 %f181, %f63, %f63; fma.rn.f32 %f182, %f62, %f62, %f181; fma.rn.f32 %f183, %f64, %f64, %f182; setp.eq.f32 %p60, %f183, 0f00000000; mov.f32 %f262, 0f00000000; mov.f32 %f263, %f262; mov.f32 %f264, %f262; @%p60 bra BB0_70; BB0_58: mul.f32 %f184, %f57, %f57; fma.rn.f32 %f185, %f56, %f56, %f184; fma.rn.f32 %f78, %f58, %f58, %f185; setp.neu.f32 %p61, %f78, 0f00000000; mul.f32 %f186, %f75, %f75; fma.rn.f32 %f187, %f74, %f74, %f186; fma.rn.f32 %f79, %f76, %f76, %f187; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, %f77, 0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_60; mul.f32 %f188, %f63, %f63; fma.rn.f32 %f189, %f62, %f62, %f188; fma.rn.f32 %f190, %f64, %f64, %f189; setp.neu.f32 %p66, %f190, 0f00000000; @%p66 bra BB0_69; bra.uni BB0_60; BB0_69: sub.f32 %f219, %f68, %f62; mul.f32 %f262, %f219, 0f3F000000; sub.f32 %f220, %f69, %f63; mul.f32 %f263, %f220, 0f3F000000; sub.f32 %f221, %f70, %f64; mul.f32 %f264, %f221, 0f3F000000; bra.uni BB0_70; BB0_60: or.pred %p68, %p59, %p61; @%p68 bra BB0_62; bra.uni BB0_61; BB0_62: mul.f32 %f191, %f63, %f63; fma.rn.f32 %f192, %f62, %f62, %f191; fma.rn.f32 %f83, %f64, %f64, %f192; setp.neu.f32 
%p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_64; bra.uni BB0_63; BB0_64: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_66; bra.uni BB0_65; BB0_66: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_68; bra.uni BB0_67; BB0_68: sub.f32 %f210, %f68, %f62; sub.f32 %f211, %f69, %f63; sub.f32 %f212, %f70, %f64; sub.f32 %f213, %f56, %f74; mul.f32 %f214, %f213, 0f3DAAAAAB; sub.f32 %f215, %f57, %f75; mul.f32 %f216, %f215, 0f3DAAAAAB; sub.f32 %f217, %f58, %f76; mul.f32 %f218, %f217, 0f3DAAAAAB; fma.rn.f32 %f262, %f210, 0f3F2AAAAB, %f214; fma.rn.f32 %f263, %f211, 0f3F2AAAAB, %f216; fma.rn.f32 %f264, %f212, 0f3F2AAAAB, %f218; bra.uni BB0_70; BB0_61: sub.f32 %f262, %f1, %f62; sub.f32 %f263, %f2, %f63; sub.f32 %f264, %f3, %f64; bra.uni BB0_70; BB0_63: sub.f32 %f262, %f68, %f1; sub.f32 %f263, %f69, %f2; sub.f32 %f264, %f70, %f3; bra.uni BB0_70; BB0_65: mul.f32 %f193, %f62, 0fC0000000; fma.rn.f32 %f194, %f56, 0f3F000000, %f193; add.f32 %f195, %f63, %f63; mul.f32 %f196, %f57, 0f3F000000; sub.f32 %f197, %f196, %f195; add.f32 %f198, %f64, %f64; mul.f32 %f199, %f58, 0f3F000000; sub.f32 %f200, %f199, %f198; fma.rn.f32 %f262, %f1, 0f3FC00000, %f194; fma.rn.f32 %f263, %f2, 0f3FC00000, %f197; fma.rn.f32 %f264, %f3, 0f3FC00000, %f200; bra.uni BB0_70; BB0_67: mul.f32 %f201, %f74, 0fBF000000; fma.rn.f32 %f202, %f68, 0f40000000, %f201; mul.f32 %f203, %f75, 0fBF000000; fma.rn.f32 %f204, %f69, 0f40000000, %f203; mul.f32 %f205, %f76, 0fBF000000; fma.rn.f32 %f206, %f70, 0f40000000, %f205; mul.f32 %f207, %f1, 0f3FC00000; sub.f32 %f262, %f202, %f207; mul.f32 %f208, %f2, 0f3FC00000; sub.f32 %f263, %f204, %f208; mul.f32 %f209, %f3, 0f3FC00000; sub.f32 %f264, %f206, %f209; BB0_70: mul.f32 %f222, %f249, %f263; mul.f32 %f223, %f248, %f264; sub.f32 %f224, %f223, %f222; mul.f32 %f225, %f247, %f264; mul.f32 %f226, %f249, %f262; sub.f32 %f227, %f226, %f225; mul.f32 %f228, %f248, %f262; mul.f32 %f229, %f247, %f263; sub.f32 %f230, 
%f229, %f228; mul.f32 %f231, %f2, %f227; fma.rn.f32 %f232, %f1, %f224, %f231; fma.rn.f32 %f233, %f3, %f230, %f232; mul.f32 %f234, %f233, %f102; st.global.f32 [%rd4], %f234; BB0_72: ret; } ` settopologicalcharge_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<77>; .reg .b16 %rs<11>; .reg .f32 %f<265>; .reg .b32 %r<97>; .reg .b64 %rd<46>; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f102, [settopologicalcharge_param_4]; ld.param.u32 %r40, [settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; ld.param.u32 %r42, [settopologicalcharge_param_7]; ld.param.u8 %rs3, [settopologicalcharge_param_8]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, %ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r43, %r44, %r45; mov.u32 %r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r46, %r47, %r48; mov.u32 %r49, %ntid.z; mov.u32 %r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r49, %r50, %r51; setp.ge.s32 %p3, %r2, %r41; setp.ge.s32 %p4, %r1, %r40; or.pred %p5, %p3, %p4; setp.ge.s32 %p6, %r3, %r42; or.pred %p7, %p5, %p6; @%p7 bra BB0_72; cvta.to.global.u64 %rd9, %rd5; mul.lo.s32 %r4, %r3, %r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 %r53, %r5, %r1; mul.wide.s32 %rd10, %r53, 4; add.s64 %rd11, 
%rd3, %rd10; add.s64 %rd12, %rd2, %rd10; add.s64 %rd13, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; ld.global.nc.f32 %f2, [%rd12]; mul.f32 %f103, %f2, %f2; fma.rn.f32 %f104, %f1, %f1, %f103; ld.global.nc.f32 %f3, [%rd13]; fma.rn.f32 %f105, %f3, %f3, %f104; setp.eq.f32 %p8, %f105, 0f00000000; add.s64 %rd4, %rd9, %rd10; @%p8 bra BB0_71; bra.uni BB0_2; BB0_71: mov.u32 %r88, 0; st.global.u32 [%rd4], %r88; bra.uni BB0_72; BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r6, %r1, -2; @%p9 bra BB0_4; rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r89, %r55, %r40; bra.uni BB0_5; BB0_4: mov.u32 %r56, 0; max.s32 %r89, %r6, %r56; BB0_5: setp.lt.s32 %p11, %r6, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p12 bra BB0_7; add.s32 %r57, %r89, %r5; mul.wide.s32 %rd14, %r57, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f7, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f8, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f9, [%rd17]; BB0_7: add.s32 %r10, %r1, -1; @%p9 bra BB0_9; rem.s32 %r58, %r10, %r40; add.s32 %r59, %r58, %r40; rem.s32 %r90, %r59, %r40; bra.uni BB0_10; BB0_9: mov.u32 %r60, 0; max.s32 %r90, %r10, %r60; BB0_10: setp.lt.s32 %p14, %r10, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_12; add.s32 %r61, %r90, %r5; mul.wide.s32 %rd18, %r61, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f14, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f15, [%rd21]; BB0_12: add.s32 %r14, %r1, 1; @%p9 bra BB0_14; rem.s32 %r62, %r14, %r40; add.s32 %r63, %r62, %r40; rem.s32 %r91, %r63, %r40; bra.uni BB0_15; BB0_14: add.s32 %r64, %r40, -1; min.s32 %r91, %r14, %r64; BB0_15: setp.ge.s32 %p18, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p20, %p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_17; add.s32 %r65, %r91, %r5; mul.wide.s32 %rd22, %r65, 4; 
add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f19, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f20, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; BB0_17: add.s32 %r18, %r1, 2; @%p9 bra BB0_19; rem.s32 %r66, %r18, %r40; add.s32 %r67, %r66, %r40; rem.s32 %r92, %r67, %r40; bra.uni BB0_20; BB0_19: add.s32 %r68, %r40, -1; min.s32 %r92, %r18, %r68; BB0_20: add.s32 %r22, %r92, %r5; setp.ge.s32 %p22, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p24, %p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f25, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f26, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; BB0_22: mul.f32 %f118, %f20, %f20; fma.rn.f32 %f119, %f19, %f19, %f118; fma.rn.f32 %f28, %f21, %f21, %f119; setp.neu.f32 %p25, %f28, 0f00000000; @%p25 bra BB0_24; mul.f32 %f123, %f14, %f14; fma.rn.f32 %f124, %f13, %f13, %f123; fma.rn.f32 %f125, %f15, %f15, %f124; setp.eq.f32 %p26, %f125, 0f00000000; mov.f32 %f247, 0f00000000; mov.f32 %f248, %f247; mov.f32 %f249, %f247; @%p26 bra BB0_36; BB0_24: mul.f32 %f126, %f8, %f8; fma.rn.f32 %f127, %f7, %f7, %f126; fma.rn.f32 %f29, %f9, %f9, %f127; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f128, %f26, %f26; fma.rn.f32 %f129, %f25, %f25, %f128; fma.rn.f32 %f30, %f27, %f27, %f129; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_26; mul.f32 %f130, %f14, %f14; fma.rn.f32 %f131, %f13, %f13, %f130; fma.rn.f32 %f132, %f15, %f15, %f131; setp.neu.f32 %p32, %f132, 0f00000000; @%p32 bra BB0_35; bra.uni BB0_26; BB0_35: sub.f32 %f161, %f19, %f13; mul.f32 %f247, %f161, 0f3F000000; sub.f32 %f162, %f20, %f14; mul.f32 %f248, %f162, 0f3F000000; sub.f32 %f163, %f21, %f15; mul.f32 %f249, %f163, 0f3F000000; bra.uni BB0_36; BB0_26: or.pred %p34, %p25, %p27; @%p34 bra BB0_28; bra.uni BB0_27; BB0_28: mul.f32 
%f133, %f14, %f14; fma.rn.f32 %f134, %f13, %f13, %f133; fma.rn.f32 %f34, %f15, %f15, %f134; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, %p35, %p28; @%p37 bra BB0_30; bra.uni BB0_29; BB0_30: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_32; bra.uni BB0_31; BB0_32: setp.eq.f32 %p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_34; bra.uni BB0_33; BB0_34: sub.f32 %f152, %f19, %f13; sub.f32 %f153, %f20, %f14; sub.f32 %f154, %f21, %f15; sub.f32 %f155, %f7, %f25; mul.f32 %f156, %f155, 0f3DAAAAAB; sub.f32 %f157, %f8, %f26; mul.f32 %f158, %f157, 0f3DAAAAAB; sub.f32 %f159, %f9, %f27; mul.f32 %f160, %f159, 0f3DAAAAAB; fma.rn.f32 %f247, %f152, 0f3F2AAAAB, %f156; fma.rn.f32 %f248, %f153, 0f3F2AAAAB, %f158; fma.rn.f32 %f249, %f154, 0f3F2AAAAB, %f160; bra.uni BB0_36; BB0_27: sub.f32 %f247, %f1, %f13; sub.f32 %f248, %f2, %f14; sub.f32 %f249, %f3, %f15; bra.uni BB0_36; BB0_29: sub.f32 %f247, %f19, %f1; sub.f32 %f248, %f20, %f2; sub.f32 %f249, %f21, %f3; bra.uni BB0_36; BB0_31: mul.f32 %f135, %f13, 0fC0000000; fma.rn.f32 %f136, %f7, 0f3F000000, %f135; add.f32 %f137, %f14, %f14; mul.f32 %f138, %f8, 0f3F000000; sub.f32 %f139, %f138, %f137; add.f32 %f140, %f15, %f15; mul.f32 %f141, %f9, 0f3F000000; sub.f32 %f142, %f141, %f140; fma.rn.f32 %f247, %f1, 0f3FC00000, %f136; fma.rn.f32 %f248, %f2, 0f3FC00000, %f139; fma.rn.f32 %f249, %f3, 0f3FC00000, %f142; bra.uni BB0_36; BB0_33: mul.f32 %f143, %f25, 0fBF000000; fma.rn.f32 %f144, %f19, 0f40000000, %f143; mul.f32 %f145, %f26, 0fBF000000; fma.rn.f32 %f146, %f20, 0f40000000, %f145; mul.f32 %f147, %f27, 0fBF000000; fma.rn.f32 %f148, %f21, 0f40000000, %f147; mul.f32 %f149, %f1, 0f3FC00000; sub.f32 %f247, %f144, %f149; mul.f32 %f150, %f2, 0f3FC00000; sub.f32 %f248, %f146, %f150; mul.f32 %f151, %f3, 0f3FC00000; sub.f32 %f249, %f148, %f151; BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra BB0_38; rem.s32 %r69, %r23, %r41; add.s32 %r70, %r69, %r41; rem.s32 
%r93, %r70, %r41; bra.uni BB0_39; BB0_38: mov.u32 %r71, 0; max.s32 %r93, %r23, %r71; BB0_39: setp.lt.s32 %p45, %r23, 0; mov.f32 %f56, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_41; add.s32 %r72, %r93, %r4; mad.lo.s32 %r73, %r72, %r40, %r1; mul.wide.s32 %rd30, %r73, 4; add.s64 %rd31, %rd3, %rd30; ld.global.nc.f32 %f56, [%rd31]; add.s64 %rd32, %rd2, %rd30; ld.global.nc.f32 %f57, [%rd32]; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f58, [%rd33]; BB0_41: add.s32 %r27, %r2, -1; @%p43 bra BB0_43; rem.s32 %r74, %r27, %r41; add.s32 %r75, %r74, %r41; rem.s32 %r94, %r75, %r41; bra.uni BB0_44; BB0_43: mov.u32 %r76, 0; max.s32 %r94, %r27, %r76; BB0_44: setp.lt.s32 %p48, %r27, 0; mov.f32 %f62, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_46; add.s32 %r77, %r94, %r4; mad.lo.s32 %r78, %r77, %r40, %r1; mul.wide.s32 %rd34, %r78, 4; add.s64 %rd35, %rd3, %rd34; ld.global.nc.f32 %f62, [%rd35]; add.s64 %rd36, %rd2, %rd34; ld.global.nc.f32 %f63, [%rd36]; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f64, [%rd37]; BB0_46: add.s32 %r31, %r2, 1; @%p43 bra BB0_48; rem.s32 %r79, %r31, %r41; add.s32 %r80, %r79, %r41; rem.s32 %r95, %r80, %r41; bra.uni BB0_49; BB0_48: add.s32 %r81, %r41, -1; min.s32 %r95, %r31, %r81; BB0_49: setp.ge.s32 %p52, %r31, %r41; mov.f32 %f68, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; @%p54 bra BB0_51; add.s32 %r82, %r95, %r4; mad.lo.s32 %r83, %r82, %r40, %r1; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f68, [%rd39]; add.s64 %rd40, %rd2, %rd38; ld.global.nc.f32 %f69, [%rd40]; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f70, [%rd41]; BB0_51: add.s32 %r35, %r2, 2; @%p43 bra BB0_53; rem.s32 %r84, %r35, %r41; add.s32 %r85, %r84, %r41; rem.s32 %r96, %r85, %r41; bra.uni BB0_54; BB0_53: add.s32 %r86, %r41, -1; min.s32 %r96, %r35, %r86; BB0_54: add.s32 %r87, %r96, %r4; mad.lo.s32 %r39, %r87, %r40, %r1; 
setp.ge.s32 %p56, %r35, %r41; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p58 bra BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, %rd3, %rd42; ld.global.nc.f32 %f74, [%rd43]; add.s64 %rd44, %rd2, %rd42; ld.global.nc.f32 %f75, [%rd44]; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f76, [%rd45]; BB0_56: mul.f32 %f176, %f69, %f69; fma.rn.f32 %f177, %f68, %f68, %f176; fma.rn.f32 %f77, %f70, %f70, %f177; setp.neu.f32 %p59, %f77, 0f00000000; @%p59 bra BB0_58; mul.f32 %f181, %f63, %f63; fma.rn.f32 %f182, %f62, %f62, %f181; fma.rn.f32 %f183, %f64, %f64, %f182; setp.eq.f32 %p60, %f183, 0f00000000; mov.f32 %f262, 0f00000000; mov.f32 %f263, %f262; mov.f32 %f264, %f262; @%p60 bra BB0_70; BB0_58: mul.f32 %f184, %f57, %f57; fma.rn.f32 %f185, %f56, %f56, %f184; fma.rn.f32 %f78, %f58, %f58, %f185; setp.neu.f32 %p61, %f78, 0f00000000; mul.f32 %f186, %f75, %f75; fma.rn.f32 %f187, %f74, %f74, %f186; fma.rn.f32 %f79, %f76, %f76, %f187; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, %f77, 0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_60; mul.f32 %f188, %f63, %f63; fma.rn.f32 %f189, %f62, %f62, %f188; fma.rn.f32 %f190, %f64, %f64, %f189; setp.neu.f32 %p66, %f190, 0f00000000; @%p66 bra BB0_69; bra.uni BB0_60; BB0_69: sub.f32 %f219, %f68, %f62; mul.f32 %f262, %f219, 0f3F000000; sub.f32 %f220, %f69, %f63; mul.f32 %f263, %f220, 0f3F000000; sub.f32 %f221, %f70, %f64; mul.f32 %f264, %f221, 0f3F000000; bra.uni BB0_70; BB0_60: or.pred %p68, %p59, %p61; @%p68 bra BB0_62; bra.uni BB0_61; BB0_62: mul.f32 %f191, %f63, %f63; fma.rn.f32 %f192, %f62, %f62, %f191; fma.rn.f32 %f83, %f64, %f64, %f192; setp.neu.f32 %p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_64; bra.uni BB0_63; BB0_64: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_66; bra.uni BB0_65; BB0_66: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_68; bra.uni BB0_67; BB0_68: 
sub.f32 %f210, %f68, %f62; sub.f32 %f211, %f69, %f63; sub.f32 %f212, %f70, %f64; sub.f32 %f213, %f56, %f74; mul.f32 %f214, %f213, 0f3DAAAAAB; sub.f32 %f215, %f57, %f75; mul.f32 %f216, %f215, 0f3DAAAAAB; sub.f32 %f217, %f58, %f76; mul.f32 %f218, %f217, 0f3DAAAAAB; fma.rn.f32 %f262, %f210, 0f3F2AAAAB, %f214; fma.rn.f32 %f263, %f211, 0f3F2AAAAB, %f216; fma.rn.f32 %f264, %f212, 0f3F2AAAAB, %f218; bra.uni BB0_70; BB0_61: sub.f32 %f262, %f1, %f62; sub.f32 %f263, %f2, %f63; sub.f32 %f264, %f3, %f64; bra.uni BB0_70; BB0_63: sub.f32 %f262, %f68, %f1; sub.f32 %f263, %f69, %f2; sub.f32 %f264, %f70, %f3; bra.uni BB0_70; BB0_65: mul.f32 %f193, %f62, 0fC0000000; fma.rn.f32 %f194, %f56, 0f3F000000, %f193; add.f32 %f195, %f63, %f63; mul.f32 %f196, %f57, 0f3F000000; sub.f32 %f197, %f196, %f195; add.f32 %f198, %f64, %f64; mul.f32 %f199, %f58, 0f3F000000; sub.f32 %f200, %f199, %f198; fma.rn.f32 %f262, %f1, 0f3FC00000, %f194; fma.rn.f32 %f263, %f2, 0f3FC00000, %f197; fma.rn.f32 %f264, %f3, 0f3FC00000, %f200; bra.uni BB0_70; BB0_67: mul.f32 %f201, %f74, 0fBF000000; fma.rn.f32 %f202, %f68, 0f40000000, %f201; mul.f32 %f203, %f75, 0fBF000000; fma.rn.f32 %f204, %f69, 0f40000000, %f203; mul.f32 %f205, %f76, 0fBF000000; fma.rn.f32 %f206, %f70, 0f40000000, %f205; mul.f32 %f207, %f1, 0f3FC00000; sub.f32 %f262, %f202, %f207; mul.f32 %f208, %f2, 0f3FC00000; sub.f32 %f263, %f204, %f208; mul.f32 %f209, %f3, 0f3FC00000; sub.f32 %f264, %f206, %f209; BB0_70: mul.f32 %f222, %f249, %f263; mul.f32 %f223, %f248, %f264; sub.f32 %f224, %f223, %f222; mul.f32 %f225, %f247, %f264; mul.f32 %f226, %f249, %f262; sub.f32 %f227, %f226, %f225; mul.f32 %f228, %f248, %f262; mul.f32 %f229, %f247, %f263; sub.f32 %f230, %f229, %f228; mul.f32 %f231, %f2, %f227; fma.rn.f32 %f232, %f1, %f224, %f231; fma.rn.f32 %f233, %f3, %f230, %f232; mul.f32 %f234, %f233, %f102; st.global.f32 [%rd4], %f234; BB0_72: ret; } ` settopologicalcharge_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl settopologicalcharge .visible 
.entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<77>; .reg .b16 %rs<11>; .reg .f32 %f<265>; .reg .b32 %r<97>; .reg .b64 %rd<46>; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f102, [settopologicalcharge_param_4]; ld.param.u32 %r40, [settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; ld.param.u32 %r42, [settopologicalcharge_param_7]; ld.param.u8 %rs3, [settopologicalcharge_param_8]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, %ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r43, %r44, %r45; mov.u32 %r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r46, %r47, %r48; mov.u32 %r49, %ntid.z; mov.u32 %r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r49, %r50, %r51; setp.ge.s32 %p3, %r2, %r41; setp.ge.s32 %p4, %r1, %r40; or.pred %p5, %p3, %p4; setp.ge.s32 %p6, %r3, %r42; or.pred %p7, %p5, %p6; @%p7 bra BB0_72; cvta.to.global.u64 %rd9, %rd5; mul.lo.s32 %r4, %r3, %r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 %r53, %r5, %r1; mul.wide.s32 %rd10, %r53, 4; add.s64 %rd11, %rd3, %rd10; add.s64 %rd12, %rd2, %rd10; add.s64 %rd13, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; ld.global.nc.f32 %f2, [%rd12]; mul.f32 %f103, %f2, %f2; fma.rn.f32 %f104, %f1, %f1, %f103; ld.global.nc.f32 %f3, [%rd13]; fma.rn.f32 %f105, %f3, %f3, %f104; setp.eq.f32 %p8, %f105, 0f00000000; add.s64 %rd4, 
%rd9, %rd10; @%p8 bra BB0_71; bra.uni BB0_2; BB0_71: mov.u32 %r88, 0; st.global.u32 [%rd4], %r88; bra.uni BB0_72; BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r6, %r1, -2; @%p9 bra BB0_4; rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r89, %r55, %r40; bra.uni BB0_5; BB0_4: mov.u32 %r56, 0; max.s32 %r89, %r6, %r56; BB0_5: setp.lt.s32 %p11, %r6, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p12 bra BB0_7; add.s32 %r57, %r89, %r5; mul.wide.s32 %rd14, %r57, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f7, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f8, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f9, [%rd17]; BB0_7: add.s32 %r10, %r1, -1; @%p9 bra BB0_9; rem.s32 %r58, %r10, %r40; add.s32 %r59, %r58, %r40; rem.s32 %r90, %r59, %r40; bra.uni BB0_10; BB0_9: mov.u32 %r60, 0; max.s32 %r90, %r10, %r60; BB0_10: setp.lt.s32 %p14, %r10, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_12; add.s32 %r61, %r90, %r5; mul.wide.s32 %rd18, %r61, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f14, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f15, [%rd21]; BB0_12: add.s32 %r14, %r1, 1; @%p9 bra BB0_14; rem.s32 %r62, %r14, %r40; add.s32 %r63, %r62, %r40; rem.s32 %r91, %r63, %r40; bra.uni BB0_15; BB0_14: add.s32 %r64, %r40, -1; min.s32 %r91, %r14, %r64; BB0_15: setp.ge.s32 %p18, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p20, %p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_17; add.s32 %r65, %r91, %r5; mul.wide.s32 %rd22, %r65, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f19, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f20, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; BB0_17: add.s32 %r18, %r1, 2; @%p9 bra BB0_19; rem.s32 %r66, %r18, %r40; add.s32 %r67, %r66, %r40; rem.s32 %r92, %r67, %r40; bra.uni 
BB0_20; BB0_19: add.s32 %r68, %r40, -1; min.s32 %r92, %r18, %r68; BB0_20: add.s32 %r22, %r92, %r5; setp.ge.s32 %p22, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p24, %p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f25, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f26, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; BB0_22: mul.f32 %f118, %f20, %f20; fma.rn.f32 %f119, %f19, %f19, %f118; fma.rn.f32 %f28, %f21, %f21, %f119; setp.neu.f32 %p25, %f28, 0f00000000; @%p25 bra BB0_24; mul.f32 %f123, %f14, %f14; fma.rn.f32 %f124, %f13, %f13, %f123; fma.rn.f32 %f125, %f15, %f15, %f124; setp.eq.f32 %p26, %f125, 0f00000000; mov.f32 %f247, 0f00000000; mov.f32 %f248, %f247; mov.f32 %f249, %f247; @%p26 bra BB0_36; BB0_24: mul.f32 %f126, %f8, %f8; fma.rn.f32 %f127, %f7, %f7, %f126; fma.rn.f32 %f29, %f9, %f9, %f127; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f128, %f26, %f26; fma.rn.f32 %f129, %f25, %f25, %f128; fma.rn.f32 %f30, %f27, %f27, %f129; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_26; mul.f32 %f130, %f14, %f14; fma.rn.f32 %f131, %f13, %f13, %f130; fma.rn.f32 %f132, %f15, %f15, %f131; setp.neu.f32 %p32, %f132, 0f00000000; @%p32 bra BB0_35; bra.uni BB0_26; BB0_35: sub.f32 %f161, %f19, %f13; mul.f32 %f247, %f161, 0f3F000000; sub.f32 %f162, %f20, %f14; mul.f32 %f248, %f162, 0f3F000000; sub.f32 %f163, %f21, %f15; mul.f32 %f249, %f163, 0f3F000000; bra.uni BB0_36; BB0_26: or.pred %p34, %p25, %p27; @%p34 bra BB0_28; bra.uni BB0_27; BB0_28: mul.f32 %f133, %f14, %f14; fma.rn.f32 %f134, %f13, %f13, %f133; fma.rn.f32 %f34, %f15, %f15, %f134; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, %p35, %p28; @%p37 bra BB0_30; bra.uni BB0_29; BB0_30: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_32; bra.uni BB0_31; BB0_32: setp.eq.f32 
%p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_34; bra.uni BB0_33; BB0_34: sub.f32 %f152, %f19, %f13; sub.f32 %f153, %f20, %f14; sub.f32 %f154, %f21, %f15; sub.f32 %f155, %f7, %f25; mul.f32 %f156, %f155, 0f3DAAAAAB; sub.f32 %f157, %f8, %f26; mul.f32 %f158, %f157, 0f3DAAAAAB; sub.f32 %f159, %f9, %f27; mul.f32 %f160, %f159, 0f3DAAAAAB; fma.rn.f32 %f247, %f152, 0f3F2AAAAB, %f156; fma.rn.f32 %f248, %f153, 0f3F2AAAAB, %f158; fma.rn.f32 %f249, %f154, 0f3F2AAAAB, %f160; bra.uni BB0_36; BB0_27: sub.f32 %f247, %f1, %f13; sub.f32 %f248, %f2, %f14; sub.f32 %f249, %f3, %f15; bra.uni BB0_36; BB0_29: sub.f32 %f247, %f19, %f1; sub.f32 %f248, %f20, %f2; sub.f32 %f249, %f21, %f3; bra.uni BB0_36; BB0_31: mul.f32 %f135, %f13, 0fC0000000; fma.rn.f32 %f136, %f7, 0f3F000000, %f135; add.f32 %f137, %f14, %f14; mul.f32 %f138, %f8, 0f3F000000; sub.f32 %f139, %f138, %f137; add.f32 %f140, %f15, %f15; mul.f32 %f141, %f9, 0f3F000000; sub.f32 %f142, %f141, %f140; fma.rn.f32 %f247, %f1, 0f3FC00000, %f136; fma.rn.f32 %f248, %f2, 0f3FC00000, %f139; fma.rn.f32 %f249, %f3, 0f3FC00000, %f142; bra.uni BB0_36; BB0_33: mul.f32 %f143, %f25, 0fBF000000; fma.rn.f32 %f144, %f19, 0f40000000, %f143; mul.f32 %f145, %f26, 0fBF000000; fma.rn.f32 %f146, %f20, 0f40000000, %f145; mul.f32 %f147, %f27, 0fBF000000; fma.rn.f32 %f148, %f21, 0f40000000, %f147; mul.f32 %f149, %f1, 0f3FC00000; sub.f32 %f247, %f144, %f149; mul.f32 %f150, %f2, 0f3FC00000; sub.f32 %f248, %f146, %f150; mul.f32 %f151, %f3, 0f3FC00000; sub.f32 %f249, %f148, %f151; BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra BB0_38; rem.s32 %r69, %r23, %r41; add.s32 %r70, %r69, %r41; rem.s32 %r93, %r70, %r41; bra.uni BB0_39; BB0_38: mov.u32 %r71, 0; max.s32 %r93, %r23, %r71; BB0_39: setp.lt.s32 %p45, %r23, 0; mov.f32 %f56, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_41; add.s32 %r72, %r93, %r4; mad.lo.s32 %r73, %r72, %r40, %r1; mul.wide.s32 %rd30, %r73, 
4; add.s64 %rd31, %rd3, %rd30; ld.global.nc.f32 %f56, [%rd31]; add.s64 %rd32, %rd2, %rd30; ld.global.nc.f32 %f57, [%rd32]; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f58, [%rd33]; BB0_41: add.s32 %r27, %r2, -1; @%p43 bra BB0_43; rem.s32 %r74, %r27, %r41; add.s32 %r75, %r74, %r41; rem.s32 %r94, %r75, %r41; bra.uni BB0_44; BB0_43: mov.u32 %r76, 0; max.s32 %r94, %r27, %r76; BB0_44: setp.lt.s32 %p48, %r27, 0; mov.f32 %f62, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_46; add.s32 %r77, %r94, %r4; mad.lo.s32 %r78, %r77, %r40, %r1; mul.wide.s32 %rd34, %r78, 4; add.s64 %rd35, %rd3, %rd34; ld.global.nc.f32 %f62, [%rd35]; add.s64 %rd36, %rd2, %rd34; ld.global.nc.f32 %f63, [%rd36]; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f64, [%rd37]; BB0_46: add.s32 %r31, %r2, 1; @%p43 bra BB0_48; rem.s32 %r79, %r31, %r41; add.s32 %r80, %r79, %r41; rem.s32 %r95, %r80, %r41; bra.uni BB0_49; BB0_48: add.s32 %r81, %r41, -1; min.s32 %r95, %r31, %r81; BB0_49: setp.ge.s32 %p52, %r31, %r41; mov.f32 %f68, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; @%p54 bra BB0_51; add.s32 %r82, %r95, %r4; mad.lo.s32 %r83, %r82, %r40, %r1; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f68, [%rd39]; add.s64 %rd40, %rd2, %rd38; ld.global.nc.f32 %f69, [%rd40]; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f70, [%rd41]; BB0_51: add.s32 %r35, %r2, 2; @%p43 bra BB0_53; rem.s32 %r84, %r35, %r41; add.s32 %r85, %r84, %r41; rem.s32 %r96, %r85, %r41; bra.uni BB0_54; BB0_53: add.s32 %r86, %r41, -1; min.s32 %r96, %r35, %r86; BB0_54: add.s32 %r87, %r96, %r4; mad.lo.s32 %r39, %r87, %r40, %r1; setp.ge.s32 %p56, %r35, %r41; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p58 bra BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, %rd3, %rd42; ld.global.nc.f32 %f74, [%rd43]; add.s64 %rd44, %rd2, %rd42; ld.global.nc.f32 %f75, [%rd44]; add.s64 %rd45, %rd1, %rd42; 
ld.global.nc.f32 %f76, [%rd45]; BB0_56: mul.f32 %f176, %f69, %f69; fma.rn.f32 %f177, %f68, %f68, %f176; fma.rn.f32 %f77, %f70, %f70, %f177; setp.neu.f32 %p59, %f77, 0f00000000; @%p59 bra BB0_58; mul.f32 %f181, %f63, %f63; fma.rn.f32 %f182, %f62, %f62, %f181; fma.rn.f32 %f183, %f64, %f64, %f182; setp.eq.f32 %p60, %f183, 0f00000000; mov.f32 %f262, 0f00000000; mov.f32 %f263, %f262; mov.f32 %f264, %f262; @%p60 bra BB0_70; BB0_58: mul.f32 %f184, %f57, %f57; fma.rn.f32 %f185, %f56, %f56, %f184; fma.rn.f32 %f78, %f58, %f58, %f185; setp.neu.f32 %p61, %f78, 0f00000000; mul.f32 %f186, %f75, %f75; fma.rn.f32 %f187, %f74, %f74, %f186; fma.rn.f32 %f79, %f76, %f76, %f187; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, %f77, 0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_60; mul.f32 %f188, %f63, %f63; fma.rn.f32 %f189, %f62, %f62, %f188; fma.rn.f32 %f190, %f64, %f64, %f189; setp.neu.f32 %p66, %f190, 0f00000000; @%p66 bra BB0_69; bra.uni BB0_60; BB0_69: sub.f32 %f219, %f68, %f62; mul.f32 %f262, %f219, 0f3F000000; sub.f32 %f220, %f69, %f63; mul.f32 %f263, %f220, 0f3F000000; sub.f32 %f221, %f70, %f64; mul.f32 %f264, %f221, 0f3F000000; bra.uni BB0_70; BB0_60: or.pred %p68, %p59, %p61; @%p68 bra BB0_62; bra.uni BB0_61; BB0_62: mul.f32 %f191, %f63, %f63; fma.rn.f32 %f192, %f62, %f62, %f191; fma.rn.f32 %f83, %f64, %f64, %f192; setp.neu.f32 %p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_64; bra.uni BB0_63; BB0_64: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_66; bra.uni BB0_65; BB0_66: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_68; bra.uni BB0_67; BB0_68: sub.f32 %f210, %f68, %f62; sub.f32 %f211, %f69, %f63; sub.f32 %f212, %f70, %f64; sub.f32 %f213, %f56, %f74; mul.f32 %f214, %f213, 0f3DAAAAAB; sub.f32 %f215, %f57, %f75; mul.f32 %f216, %f215, 0f3DAAAAAB; sub.f32 %f217, %f58, %f76; mul.f32 %f218, %f217, 0f3DAAAAAB; fma.rn.f32 %f262, %f210, 0f3F2AAAAB, %f214; fma.rn.f32 
%f263, %f211, 0f3F2AAAAB, %f216; fma.rn.f32 %f264, %f212, 0f3F2AAAAB, %f218; bra.uni BB0_70; BB0_61: sub.f32 %f262, %f1, %f62; sub.f32 %f263, %f2, %f63; sub.f32 %f264, %f3, %f64; bra.uni BB0_70; BB0_63: sub.f32 %f262, %f68, %f1; sub.f32 %f263, %f69, %f2; sub.f32 %f264, %f70, %f3; bra.uni BB0_70; BB0_65: mul.f32 %f193, %f62, 0fC0000000; fma.rn.f32 %f194, %f56, 0f3F000000, %f193; add.f32 %f195, %f63, %f63; mul.f32 %f196, %f57, 0f3F000000; sub.f32 %f197, %f196, %f195; add.f32 %f198, %f64, %f64; mul.f32 %f199, %f58, 0f3F000000; sub.f32 %f200, %f199, %f198; fma.rn.f32 %f262, %f1, 0f3FC00000, %f194; fma.rn.f32 %f263, %f2, 0f3FC00000, %f197; fma.rn.f32 %f264, %f3, 0f3FC00000, %f200; bra.uni BB0_70; BB0_67: mul.f32 %f201, %f74, 0fBF000000; fma.rn.f32 %f202, %f68, 0f40000000, %f201; mul.f32 %f203, %f75, 0fBF000000; fma.rn.f32 %f204, %f69, 0f40000000, %f203; mul.f32 %f205, %f76, 0fBF000000; fma.rn.f32 %f206, %f70, 0f40000000, %f205; mul.f32 %f207, %f1, 0f3FC00000; sub.f32 %f262, %f202, %f207; mul.f32 %f208, %f2, 0f3FC00000; sub.f32 %f263, %f204, %f208; mul.f32 %f209, %f3, 0f3FC00000; sub.f32 %f264, %f206, %f209; BB0_70: mul.f32 %f222, %f249, %f263; mul.f32 %f223, %f248, %f264; sub.f32 %f224, %f223, %f222; mul.f32 %f225, %f247, %f264; mul.f32 %f226, %f249, %f262; sub.f32 %f227, %f226, %f225; mul.f32 %f228, %f248, %f262; mul.f32 %f229, %f247, %f263; sub.f32 %f230, %f229, %f228; mul.f32 %f231, %f2, %f227; fma.rn.f32 %f232, %f1, %f224, %f231; fma.rn.f32 %f233, %f3, %f230, %f232; mul.f32 %f234, %f233, %f102; st.global.f32 [%rd4], %f234; BB0_72: ret; } ` settopologicalcharge_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 
settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<77>; .reg .b16 %rs<11>; .reg .f32 %f<265>; .reg .b32 %r<97>; .reg .b64 %rd<46>; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f102, [settopologicalcharge_param_4]; ld.param.u32 %r40, [settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; ld.param.u32 %r42, [settopologicalcharge_param_7]; ld.param.u8 %rs3, [settopologicalcharge_param_8]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, %ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r43, %r44, %r45; mov.u32 %r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r46, %r47, %r48; mov.u32 %r49, %ntid.z; mov.u32 %r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r49, %r50, %r51; setp.ge.s32 %p3, %r2, %r41; setp.ge.s32 %p4, %r1, %r40; or.pred %p5, %p3, %p4; setp.ge.s32 %p6, %r3, %r42; or.pred %p7, %p5, %p6; @%p7 bra BB0_72; cvta.to.global.u64 %rd9, %rd5; mul.lo.s32 %r4, %r3, %r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 %r53, %r5, %r1; mul.wide.s32 %rd10, %r53, 4; add.s64 %rd11, %rd3, %rd10; add.s64 %rd12, %rd2, %rd10; add.s64 %rd13, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; ld.global.nc.f32 %f2, [%rd12]; mul.f32 %f103, %f2, %f2; fma.rn.f32 %f104, %f1, %f1, %f103; ld.global.nc.f32 %f3, [%rd13]; fma.rn.f32 %f105, %f3, %f3, %f104; setp.eq.f32 %p8, %f105, 0f00000000; add.s64 %rd4, %rd9, %rd10; @%p8 bra BB0_71; bra.uni BB0_2; BB0_71: mov.u32 %r88, 0; st.global.u32 [%rd4], %r88; bra.uni BB0_72; BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r6, %r1, -2; @%p9 bra BB0_4; rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r89, %r55, %r40; bra.uni 
BB0_5; BB0_4: mov.u32 %r56, 0; max.s32 %r89, %r6, %r56; BB0_5: setp.lt.s32 %p11, %r6, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p12 bra BB0_7; add.s32 %r57, %r89, %r5; mul.wide.s32 %rd14, %r57, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f7, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f8, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f9, [%rd17]; BB0_7: add.s32 %r10, %r1, -1; @%p9 bra BB0_9; rem.s32 %r58, %r10, %r40; add.s32 %r59, %r58, %r40; rem.s32 %r90, %r59, %r40; bra.uni BB0_10; BB0_9: mov.u32 %r60, 0; max.s32 %r90, %r10, %r60; BB0_10: setp.lt.s32 %p14, %r10, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_12; add.s32 %r61, %r90, %r5; mul.wide.s32 %rd18, %r61, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f14, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f15, [%rd21]; BB0_12: add.s32 %r14, %r1, 1; @%p9 bra BB0_14; rem.s32 %r62, %r14, %r40; add.s32 %r63, %r62, %r40; rem.s32 %r91, %r63, %r40; bra.uni BB0_15; BB0_14: add.s32 %r64, %r40, -1; min.s32 %r91, %r14, %r64; BB0_15: setp.ge.s32 %p18, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p20, %p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_17; add.s32 %r65, %r91, %r5; mul.wide.s32 %rd22, %r65, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f19, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f20, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; BB0_17: add.s32 %r18, %r1, 2; @%p9 bra BB0_19; rem.s32 %r66, %r18, %r40; add.s32 %r67, %r66, %r40; rem.s32 %r92, %r67, %r40; bra.uni BB0_20; BB0_19: add.s32 %r68, %r40, -1; min.s32 %r92, %r18, %r68; BB0_20: add.s32 %r22, %r92, %r5; setp.ge.s32 %p22, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p24, %p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, 
%rd26; ld.global.nc.f32 %f25, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f26, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; BB0_22: mul.f32 %f118, %f20, %f20; fma.rn.f32 %f119, %f19, %f19, %f118; fma.rn.f32 %f28, %f21, %f21, %f119; setp.neu.f32 %p25, %f28, 0f00000000; @%p25 bra BB0_24; mul.f32 %f123, %f14, %f14; fma.rn.f32 %f124, %f13, %f13, %f123; fma.rn.f32 %f125, %f15, %f15, %f124; setp.eq.f32 %p26, %f125, 0f00000000; mov.f32 %f247, 0f00000000; mov.f32 %f248, %f247; mov.f32 %f249, %f247; @%p26 bra BB0_36; BB0_24: mul.f32 %f126, %f8, %f8; fma.rn.f32 %f127, %f7, %f7, %f126; fma.rn.f32 %f29, %f9, %f9, %f127; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f128, %f26, %f26; fma.rn.f32 %f129, %f25, %f25, %f128; fma.rn.f32 %f30, %f27, %f27, %f129; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_26; mul.f32 %f130, %f14, %f14; fma.rn.f32 %f131, %f13, %f13, %f130; fma.rn.f32 %f132, %f15, %f15, %f131; setp.neu.f32 %p32, %f132, 0f00000000; @%p32 bra BB0_35; bra.uni BB0_26; BB0_35: sub.f32 %f161, %f19, %f13; mul.f32 %f247, %f161, 0f3F000000; sub.f32 %f162, %f20, %f14; mul.f32 %f248, %f162, 0f3F000000; sub.f32 %f163, %f21, %f15; mul.f32 %f249, %f163, 0f3F000000; bra.uni BB0_36; BB0_26: or.pred %p34, %p25, %p27; @%p34 bra BB0_28; bra.uni BB0_27; BB0_28: mul.f32 %f133, %f14, %f14; fma.rn.f32 %f134, %f13, %f13, %f133; fma.rn.f32 %f34, %f15, %f15, %f134; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, %p35, %p28; @%p37 bra BB0_30; bra.uni BB0_29; BB0_30: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_32; bra.uni BB0_31; BB0_32: setp.eq.f32 %p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_34; bra.uni BB0_33; BB0_34: sub.f32 %f152, %f19, %f13; sub.f32 %f153, %f20, %f14; sub.f32 %f154, %f21, %f15; sub.f32 %f155, %f7, %f25; mul.f32 %f156, %f155, 0f3DAAAAAB; sub.f32 %f157, %f8, %f26; mul.f32 %f158, %f157, 
0f3DAAAAAB; sub.f32 %f159, %f9, %f27; mul.f32 %f160, %f159, 0f3DAAAAAB; fma.rn.f32 %f247, %f152, 0f3F2AAAAB, %f156; fma.rn.f32 %f248, %f153, 0f3F2AAAAB, %f158; fma.rn.f32 %f249, %f154, 0f3F2AAAAB, %f160; bra.uni BB0_36; BB0_27: sub.f32 %f247, %f1, %f13; sub.f32 %f248, %f2, %f14; sub.f32 %f249, %f3, %f15; bra.uni BB0_36; BB0_29: sub.f32 %f247, %f19, %f1; sub.f32 %f248, %f20, %f2; sub.f32 %f249, %f21, %f3; bra.uni BB0_36; BB0_31: mul.f32 %f135, %f13, 0fC0000000; fma.rn.f32 %f136, %f7, 0f3F000000, %f135; add.f32 %f137, %f14, %f14; mul.f32 %f138, %f8, 0f3F000000; sub.f32 %f139, %f138, %f137; add.f32 %f140, %f15, %f15; mul.f32 %f141, %f9, 0f3F000000; sub.f32 %f142, %f141, %f140; fma.rn.f32 %f247, %f1, 0f3FC00000, %f136; fma.rn.f32 %f248, %f2, 0f3FC00000, %f139; fma.rn.f32 %f249, %f3, 0f3FC00000, %f142; bra.uni BB0_36; BB0_33: mul.f32 %f143, %f25, 0fBF000000; fma.rn.f32 %f144, %f19, 0f40000000, %f143; mul.f32 %f145, %f26, 0fBF000000; fma.rn.f32 %f146, %f20, 0f40000000, %f145; mul.f32 %f147, %f27, 0fBF000000; fma.rn.f32 %f148, %f21, 0f40000000, %f147; mul.f32 %f149, %f1, 0f3FC00000; sub.f32 %f247, %f144, %f149; mul.f32 %f150, %f2, 0f3FC00000; sub.f32 %f248, %f146, %f150; mul.f32 %f151, %f3, 0f3FC00000; sub.f32 %f249, %f148, %f151; BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra BB0_38; rem.s32 %r69, %r23, %r41; add.s32 %r70, %r69, %r41; rem.s32 %r93, %r70, %r41; bra.uni BB0_39; BB0_38: mov.u32 %r71, 0; max.s32 %r93, %r23, %r71; BB0_39: setp.lt.s32 %p45, %r23, 0; mov.f32 %f56, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_41; add.s32 %r72, %r93, %r4; mad.lo.s32 %r73, %r72, %r40, %r1; mul.wide.s32 %rd30, %r73, 4; add.s64 %rd31, %rd3, %rd30; ld.global.nc.f32 %f56, [%rd31]; add.s64 %rd32, %rd2, %rd30; ld.global.nc.f32 %f57, [%rd32]; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f58, [%rd33]; BB0_41: add.s32 %r27, %r2, -1; @%p43 bra BB0_43; rem.s32 %r74, %r27, %r41; add.s32 %r75, %r74, 
%r41; rem.s32 %r94, %r75, %r41; bra.uni BB0_44; BB0_43: mov.u32 %r76, 0; max.s32 %r94, %r27, %r76; BB0_44: setp.lt.s32 %p48, %r27, 0; mov.f32 %f62, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_46; add.s32 %r77, %r94, %r4; mad.lo.s32 %r78, %r77, %r40, %r1; mul.wide.s32 %rd34, %r78, 4; add.s64 %rd35, %rd3, %rd34; ld.global.nc.f32 %f62, [%rd35]; add.s64 %rd36, %rd2, %rd34; ld.global.nc.f32 %f63, [%rd36]; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f64, [%rd37]; BB0_46: add.s32 %r31, %r2, 1; @%p43 bra BB0_48; rem.s32 %r79, %r31, %r41; add.s32 %r80, %r79, %r41; rem.s32 %r95, %r80, %r41; bra.uni BB0_49; BB0_48: add.s32 %r81, %r41, -1; min.s32 %r95, %r31, %r81; BB0_49: setp.ge.s32 %p52, %r31, %r41; mov.f32 %f68, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; @%p54 bra BB0_51; add.s32 %r82, %r95, %r4; mad.lo.s32 %r83, %r82, %r40, %r1; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f68, [%rd39]; add.s64 %rd40, %rd2, %rd38; ld.global.nc.f32 %f69, [%rd40]; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f70, [%rd41]; BB0_51: add.s32 %r35, %r2, 2; @%p43 bra BB0_53; rem.s32 %r84, %r35, %r41; add.s32 %r85, %r84, %r41; rem.s32 %r96, %r85, %r41; bra.uni BB0_54; BB0_53: add.s32 %r86, %r41, -1; min.s32 %r96, %r35, %r86; BB0_54: add.s32 %r87, %r96, %r4; mad.lo.s32 %r39, %r87, %r40, %r1; setp.ge.s32 %p56, %r35, %r41; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p58 bra BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, %rd3, %rd42; ld.global.nc.f32 %f74, [%rd43]; add.s64 %rd44, %rd2, %rd42; ld.global.nc.f32 %f75, [%rd44]; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f76, [%rd45]; BB0_56: mul.f32 %f176, %f69, %f69; fma.rn.f32 %f177, %f68, %f68, %f176; fma.rn.f32 %f77, %f70, %f70, %f177; setp.neu.f32 %p59, %f77, 0f00000000; @%p59 bra BB0_58; mul.f32 %f181, %f63, %f63; fma.rn.f32 %f182, %f62, %f62, %f181; fma.rn.f32 %f183, %f64, 
%f64, %f182; setp.eq.f32 %p60, %f183, 0f00000000; mov.f32 %f262, 0f00000000; mov.f32 %f263, %f262; mov.f32 %f264, %f262; @%p60 bra BB0_70; BB0_58: mul.f32 %f184, %f57, %f57; fma.rn.f32 %f185, %f56, %f56, %f184; fma.rn.f32 %f78, %f58, %f58, %f185; setp.neu.f32 %p61, %f78, 0f00000000; mul.f32 %f186, %f75, %f75; fma.rn.f32 %f187, %f74, %f74, %f186; fma.rn.f32 %f79, %f76, %f76, %f187; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, %f77, 0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_60; mul.f32 %f188, %f63, %f63; fma.rn.f32 %f189, %f62, %f62, %f188; fma.rn.f32 %f190, %f64, %f64, %f189; setp.neu.f32 %p66, %f190, 0f00000000; @%p66 bra BB0_69; bra.uni BB0_60; BB0_69: sub.f32 %f219, %f68, %f62; mul.f32 %f262, %f219, 0f3F000000; sub.f32 %f220, %f69, %f63; mul.f32 %f263, %f220, 0f3F000000; sub.f32 %f221, %f70, %f64; mul.f32 %f264, %f221, 0f3F000000; bra.uni BB0_70; BB0_60: or.pred %p68, %p59, %p61; @%p68 bra BB0_62; bra.uni BB0_61; BB0_62: mul.f32 %f191, %f63, %f63; fma.rn.f32 %f192, %f62, %f62, %f191; fma.rn.f32 %f83, %f64, %f64, %f192; setp.neu.f32 %p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_64; bra.uni BB0_63; BB0_64: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_66; bra.uni BB0_65; BB0_66: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_68; bra.uni BB0_67; BB0_68: sub.f32 %f210, %f68, %f62; sub.f32 %f211, %f69, %f63; sub.f32 %f212, %f70, %f64; sub.f32 %f213, %f56, %f74; mul.f32 %f214, %f213, 0f3DAAAAAB; sub.f32 %f215, %f57, %f75; mul.f32 %f216, %f215, 0f3DAAAAAB; sub.f32 %f217, %f58, %f76; mul.f32 %f218, %f217, 0f3DAAAAAB; fma.rn.f32 %f262, %f210, 0f3F2AAAAB, %f214; fma.rn.f32 %f263, %f211, 0f3F2AAAAB, %f216; fma.rn.f32 %f264, %f212, 0f3F2AAAAB, %f218; bra.uni BB0_70; BB0_61: sub.f32 %f262, %f1, %f62; sub.f32 %f263, %f2, %f63; sub.f32 %f264, %f3, %f64; bra.uni BB0_70; BB0_63: sub.f32 %f262, %f68, %f1; sub.f32 %f263, %f69, %f2; sub.f32 %f264, %f70, %f3; 
bra.uni BB0_70; BB0_65: mul.f32 %f193, %f62, 0fC0000000; fma.rn.f32 %f194, %f56, 0f3F000000, %f193; add.f32 %f195, %f63, %f63; mul.f32 %f196, %f57, 0f3F000000; sub.f32 %f197, %f196, %f195; add.f32 %f198, %f64, %f64; mul.f32 %f199, %f58, 0f3F000000; sub.f32 %f200, %f199, %f198; fma.rn.f32 %f262, %f1, 0f3FC00000, %f194; fma.rn.f32 %f263, %f2, 0f3FC00000, %f197; fma.rn.f32 %f264, %f3, 0f3FC00000, %f200; bra.uni BB0_70; BB0_67: mul.f32 %f201, %f74, 0fBF000000; fma.rn.f32 %f202, %f68, 0f40000000, %f201; mul.f32 %f203, %f75, 0fBF000000; fma.rn.f32 %f204, %f69, 0f40000000, %f203; mul.f32 %f205, %f76, 0fBF000000; fma.rn.f32 %f206, %f70, 0f40000000, %f205; mul.f32 %f207, %f1, 0f3FC00000; sub.f32 %f262, %f202, %f207; mul.f32 %f208, %f2, 0f3FC00000; sub.f32 %f263, %f204, %f208; mul.f32 %f209, %f3, 0f3FC00000; sub.f32 %f264, %f206, %f209; BB0_70: mul.f32 %f222, %f249, %f263; mul.f32 %f223, %f248, %f264; sub.f32 %f224, %f223, %f222; mul.f32 %f225, %f247, %f264; mul.f32 %f226, %f249, %f262; sub.f32 %f227, %f226, %f225; mul.f32 %f228, %f248, %f262; mul.f32 %f229, %f247, %f263; sub.f32 %f230, %f229, %f228; mul.f32 %f231, %f2, %f227; fma.rn.f32 %f232, %f1, %f224, %f231; fma.rn.f32 %f233, %f3, %f230, %f232; mul.f32 %f234, %f233, %f102; st.global.f32 [%rd4], %f234; BB0_72: ret; } ` settopologicalcharge_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<77>; .reg .b16 %rs<11>; .reg .f32 %f<265>; .reg .b32 %r<97>; .reg .b64 %rd<46>; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, 
[settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f102, [settopologicalcharge_param_4]; ld.param.u32 %r40, [settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; ld.param.u32 %r42, [settopologicalcharge_param_7]; ld.param.u8 %rs3, [settopologicalcharge_param_8]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, %ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r43, %r44, %r45; mov.u32 %r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r46, %r47, %r48; mov.u32 %r49, %ntid.z; mov.u32 %r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r49, %r50, %r51; setp.ge.s32 %p3, %r2, %r41; setp.ge.s32 %p4, %r1, %r40; or.pred %p5, %p3, %p4; setp.ge.s32 %p6, %r3, %r42; or.pred %p7, %p5, %p6; @%p7 bra BB0_72; cvta.to.global.u64 %rd9, %rd5; mul.lo.s32 %r4, %r3, %r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 %r53, %r5, %r1; mul.wide.s32 %rd10, %r53, 4; add.s64 %rd11, %rd3, %rd10; add.s64 %rd12, %rd2, %rd10; add.s64 %rd13, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; ld.global.nc.f32 %f2, [%rd12]; mul.f32 %f103, %f2, %f2; fma.rn.f32 %f104, %f1, %f1, %f103; ld.global.nc.f32 %f3, [%rd13]; fma.rn.f32 %f105, %f3, %f3, %f104; setp.eq.f32 %p8, %f105, 0f00000000; add.s64 %rd4, %rd9, %rd10; @%p8 bra BB0_71; bra.uni BB0_2; BB0_71: mov.u32 %r88, 0; st.global.u32 [%rd4], %r88; bra.uni BB0_72; BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r6, %r1, -2; @%p9 bra BB0_4; rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r89, %r55, %r40; bra.uni BB0_5; BB0_4: mov.u32 %r56, 0; max.s32 %r89, %r6, %r56; BB0_5: setp.lt.s32 %p11, %r6, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p12 bra BB0_7; add.s32 %r57, %r89, %r5; mul.wide.s32 %rd14, %r57, 4; add.s64 %rd15, %rd3, %rd14; 
ld.global.nc.f32 %f7, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f8, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f9, [%rd17]; BB0_7: add.s32 %r10, %r1, -1; @%p9 bra BB0_9; rem.s32 %r58, %r10, %r40; add.s32 %r59, %r58, %r40; rem.s32 %r90, %r59, %r40; bra.uni BB0_10; BB0_9: mov.u32 %r60, 0; max.s32 %r90, %r10, %r60; BB0_10: setp.lt.s32 %p14, %r10, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_12; add.s32 %r61, %r90, %r5; mul.wide.s32 %rd18, %r61, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f14, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f15, [%rd21]; BB0_12: add.s32 %r14, %r1, 1; @%p9 bra BB0_14; rem.s32 %r62, %r14, %r40; add.s32 %r63, %r62, %r40; rem.s32 %r91, %r63, %r40; bra.uni BB0_15; BB0_14: add.s32 %r64, %r40, -1; min.s32 %r91, %r14, %r64; BB0_15: setp.ge.s32 %p18, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p20, %p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_17; add.s32 %r65, %r91, %r5; mul.wide.s32 %rd22, %r65, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f19, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f20, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; BB0_17: add.s32 %r18, %r1, 2; @%p9 bra BB0_19; rem.s32 %r66, %r18, %r40; add.s32 %r67, %r66, %r40; rem.s32 %r92, %r67, %r40; bra.uni BB0_20; BB0_19: add.s32 %r68, %r40, -1; min.s32 %r92, %r18, %r68; BB0_20: add.s32 %r22, %r92, %r5; setp.ge.s32 %p22, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p24, %p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f25, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f26, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; BB0_22: mul.f32 %f118, %f20, %f20; fma.rn.f32 %f119, %f19, %f19, %f118; fma.rn.f32 %f28, %f21, %f21, %f119; 
setp.neu.f32 %p25, %f28, 0f00000000; @%p25 bra BB0_24; mul.f32 %f123, %f14, %f14; fma.rn.f32 %f124, %f13, %f13, %f123; fma.rn.f32 %f125, %f15, %f15, %f124; setp.eq.f32 %p26, %f125, 0f00000000; mov.f32 %f247, 0f00000000; mov.f32 %f248, %f247; mov.f32 %f249, %f247; @%p26 bra BB0_36; BB0_24: mul.f32 %f126, %f8, %f8; fma.rn.f32 %f127, %f7, %f7, %f126; fma.rn.f32 %f29, %f9, %f9, %f127; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f128, %f26, %f26; fma.rn.f32 %f129, %f25, %f25, %f128; fma.rn.f32 %f30, %f27, %f27, %f129; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_26; mul.f32 %f130, %f14, %f14; fma.rn.f32 %f131, %f13, %f13, %f130; fma.rn.f32 %f132, %f15, %f15, %f131; setp.neu.f32 %p32, %f132, 0f00000000; @%p32 bra BB0_35; bra.uni BB0_26; BB0_35: sub.f32 %f161, %f19, %f13; mul.f32 %f247, %f161, 0f3F000000; sub.f32 %f162, %f20, %f14; mul.f32 %f248, %f162, 0f3F000000; sub.f32 %f163, %f21, %f15; mul.f32 %f249, %f163, 0f3F000000; bra.uni BB0_36; BB0_26: or.pred %p34, %p25, %p27; @%p34 bra BB0_28; bra.uni BB0_27; BB0_28: mul.f32 %f133, %f14, %f14; fma.rn.f32 %f134, %f13, %f13, %f133; fma.rn.f32 %f34, %f15, %f15, %f134; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, %p35, %p28; @%p37 bra BB0_30; bra.uni BB0_29; BB0_30: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_32; bra.uni BB0_31; BB0_32: setp.eq.f32 %p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_34; bra.uni BB0_33; BB0_34: sub.f32 %f152, %f19, %f13; sub.f32 %f153, %f20, %f14; sub.f32 %f154, %f21, %f15; sub.f32 %f155, %f7, %f25; mul.f32 %f156, %f155, 0f3DAAAAAB; sub.f32 %f157, %f8, %f26; mul.f32 %f158, %f157, 0f3DAAAAAB; sub.f32 %f159, %f9, %f27; mul.f32 %f160, %f159, 0f3DAAAAAB; fma.rn.f32 %f247, %f152, 0f3F2AAAAB, %f156; fma.rn.f32 %f248, %f153, 0f3F2AAAAB, %f158; fma.rn.f32 %f249, %f154, 0f3F2AAAAB, %f160; bra.uni BB0_36; BB0_27: sub.f32 %f247, %f1, %f13; sub.f32 %f248, %f2, 
%f14; sub.f32 %f249, %f3, %f15; bra.uni BB0_36; BB0_29: sub.f32 %f247, %f19, %f1; sub.f32 %f248, %f20, %f2; sub.f32 %f249, %f21, %f3; bra.uni BB0_36; BB0_31: mul.f32 %f135, %f13, 0fC0000000; fma.rn.f32 %f136, %f7, 0f3F000000, %f135; add.f32 %f137, %f14, %f14; mul.f32 %f138, %f8, 0f3F000000; sub.f32 %f139, %f138, %f137; add.f32 %f140, %f15, %f15; mul.f32 %f141, %f9, 0f3F000000; sub.f32 %f142, %f141, %f140; fma.rn.f32 %f247, %f1, 0f3FC00000, %f136; fma.rn.f32 %f248, %f2, 0f3FC00000, %f139; fma.rn.f32 %f249, %f3, 0f3FC00000, %f142; bra.uni BB0_36; BB0_33: mul.f32 %f143, %f25, 0fBF000000; fma.rn.f32 %f144, %f19, 0f40000000, %f143; mul.f32 %f145, %f26, 0fBF000000; fma.rn.f32 %f146, %f20, 0f40000000, %f145; mul.f32 %f147, %f27, 0fBF000000; fma.rn.f32 %f148, %f21, 0f40000000, %f147; mul.f32 %f149, %f1, 0f3FC00000; sub.f32 %f247, %f144, %f149; mul.f32 %f150, %f2, 0f3FC00000; sub.f32 %f248, %f146, %f150; mul.f32 %f151, %f3, 0f3FC00000; sub.f32 %f249, %f148, %f151; BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra BB0_38; rem.s32 %r69, %r23, %r41; add.s32 %r70, %r69, %r41; rem.s32 %r93, %r70, %r41; bra.uni BB0_39; BB0_38: mov.u32 %r71, 0; max.s32 %r93, %r23, %r71; BB0_39: setp.lt.s32 %p45, %r23, 0; mov.f32 %f56, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_41; add.s32 %r72, %r93, %r4; mad.lo.s32 %r73, %r72, %r40, %r1; mul.wide.s32 %rd30, %r73, 4; add.s64 %rd31, %rd3, %rd30; ld.global.nc.f32 %f56, [%rd31]; add.s64 %rd32, %rd2, %rd30; ld.global.nc.f32 %f57, [%rd32]; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f58, [%rd33]; BB0_41: add.s32 %r27, %r2, -1; @%p43 bra BB0_43; rem.s32 %r74, %r27, %r41; add.s32 %r75, %r74, %r41; rem.s32 %r94, %r75, %r41; bra.uni BB0_44; BB0_43: mov.u32 %r76, 0; max.s32 %r94, %r27, %r76; BB0_44: setp.lt.s32 %p48, %r27, 0; mov.f32 %f62, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_46; add.s32 %r77, %r94, %r4; 
mad.lo.s32 %r78, %r77, %r40, %r1; mul.wide.s32 %rd34, %r78, 4; add.s64 %rd35, %rd3, %rd34; ld.global.nc.f32 %f62, [%rd35]; add.s64 %rd36, %rd2, %rd34; ld.global.nc.f32 %f63, [%rd36]; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f64, [%rd37]; BB0_46: add.s32 %r31, %r2, 1; @%p43 bra BB0_48; rem.s32 %r79, %r31, %r41; add.s32 %r80, %r79, %r41; rem.s32 %r95, %r80, %r41; bra.uni BB0_49; BB0_48: add.s32 %r81, %r41, -1; min.s32 %r95, %r31, %r81; BB0_49: setp.ge.s32 %p52, %r31, %r41; mov.f32 %f68, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; @%p54 bra BB0_51; add.s32 %r82, %r95, %r4; mad.lo.s32 %r83, %r82, %r40, %r1; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f68, [%rd39]; add.s64 %rd40, %rd2, %rd38; ld.global.nc.f32 %f69, [%rd40]; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f70, [%rd41]; BB0_51: add.s32 %r35, %r2, 2; @%p43 bra BB0_53; rem.s32 %r84, %r35, %r41; add.s32 %r85, %r84, %r41; rem.s32 %r96, %r85, %r41; bra.uni BB0_54; BB0_53: add.s32 %r86, %r41, -1; min.s32 %r96, %r35, %r86; BB0_54: add.s32 %r87, %r96, %r4; mad.lo.s32 %r39, %r87, %r40, %r1; setp.ge.s32 %p56, %r35, %r41; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p58 bra BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, %rd3, %rd42; ld.global.nc.f32 %f74, [%rd43]; add.s64 %rd44, %rd2, %rd42; ld.global.nc.f32 %f75, [%rd44]; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f76, [%rd45]; BB0_56: mul.f32 %f176, %f69, %f69; fma.rn.f32 %f177, %f68, %f68, %f176; fma.rn.f32 %f77, %f70, %f70, %f177; setp.neu.f32 %p59, %f77, 0f00000000; @%p59 bra BB0_58; mul.f32 %f181, %f63, %f63; fma.rn.f32 %f182, %f62, %f62, %f181; fma.rn.f32 %f183, %f64, %f64, %f182; setp.eq.f32 %p60, %f183, 0f00000000; mov.f32 %f262, 0f00000000; mov.f32 %f263, %f262; mov.f32 %f264, %f262; @%p60 bra BB0_70; BB0_58: mul.f32 %f184, %f57, %f57; fma.rn.f32 %f185, %f56, %f56, %f184; fma.rn.f32 %f78, %f58, %f58, %f185; setp.neu.f32 %p61, %f78, 
0f00000000; mul.f32 %f186, %f75, %f75; fma.rn.f32 %f187, %f74, %f74, %f186; fma.rn.f32 %f79, %f76, %f76, %f187; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, %f77, 0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_60; mul.f32 %f188, %f63, %f63; fma.rn.f32 %f189, %f62, %f62, %f188; fma.rn.f32 %f190, %f64, %f64, %f189; setp.neu.f32 %p66, %f190, 0f00000000; @%p66 bra BB0_69; bra.uni BB0_60; BB0_69: sub.f32 %f219, %f68, %f62; mul.f32 %f262, %f219, 0f3F000000; sub.f32 %f220, %f69, %f63; mul.f32 %f263, %f220, 0f3F000000; sub.f32 %f221, %f70, %f64; mul.f32 %f264, %f221, 0f3F000000; bra.uni BB0_70; BB0_60: or.pred %p68, %p59, %p61; @%p68 bra BB0_62; bra.uni BB0_61; BB0_62: mul.f32 %f191, %f63, %f63; fma.rn.f32 %f192, %f62, %f62, %f191; fma.rn.f32 %f83, %f64, %f64, %f192; setp.neu.f32 %p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_64; bra.uni BB0_63; BB0_64: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_66; bra.uni BB0_65; BB0_66: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_68; bra.uni BB0_67; BB0_68: sub.f32 %f210, %f68, %f62; sub.f32 %f211, %f69, %f63; sub.f32 %f212, %f70, %f64; sub.f32 %f213, %f56, %f74; mul.f32 %f214, %f213, 0f3DAAAAAB; sub.f32 %f215, %f57, %f75; mul.f32 %f216, %f215, 0f3DAAAAAB; sub.f32 %f217, %f58, %f76; mul.f32 %f218, %f217, 0f3DAAAAAB; fma.rn.f32 %f262, %f210, 0f3F2AAAAB, %f214; fma.rn.f32 %f263, %f211, 0f3F2AAAAB, %f216; fma.rn.f32 %f264, %f212, 0f3F2AAAAB, %f218; bra.uni BB0_70; BB0_61: sub.f32 %f262, %f1, %f62; sub.f32 %f263, %f2, %f63; sub.f32 %f264, %f3, %f64; bra.uni BB0_70; BB0_63: sub.f32 %f262, %f68, %f1; sub.f32 %f263, %f69, %f2; sub.f32 %f264, %f70, %f3; bra.uni BB0_70; BB0_65: mul.f32 %f193, %f62, 0fC0000000; fma.rn.f32 %f194, %f56, 0f3F000000, %f193; add.f32 %f195, %f63, %f63; mul.f32 %f196, %f57, 0f3F000000; sub.f32 %f197, %f196, %f195; add.f32 %f198, %f64, %f64; mul.f32 %f199, %f58, 0f3F000000; sub.f32 %f200, %f199, 
%f198; fma.rn.f32 %f262, %f1, 0f3FC00000, %f194; fma.rn.f32 %f263, %f2, 0f3FC00000, %f197; fma.rn.f32 %f264, %f3, 0f3FC00000, %f200; bra.uni BB0_70; BB0_67: mul.f32 %f201, %f74, 0fBF000000; fma.rn.f32 %f202, %f68, 0f40000000, %f201; mul.f32 %f203, %f75, 0fBF000000; fma.rn.f32 %f204, %f69, 0f40000000, %f203; mul.f32 %f205, %f76, 0fBF000000; fma.rn.f32 %f206, %f70, 0f40000000, %f205; mul.f32 %f207, %f1, 0f3FC00000; sub.f32 %f262, %f202, %f207; mul.f32 %f208, %f2, 0f3FC00000; sub.f32 %f263, %f204, %f208; mul.f32 %f209, %f3, 0f3FC00000; sub.f32 %f264, %f206, %f209; BB0_70: mul.f32 %f222, %f249, %f263; mul.f32 %f223, %f248, %f264; sub.f32 %f224, %f223, %f222; mul.f32 %f225, %f247, %f264; mul.f32 %f226, %f249, %f262; sub.f32 %f227, %f226, %f225; mul.f32 %f228, %f248, %f262; mul.f32 %f229, %f247, %f263; sub.f32 %f230, %f229, %f228; mul.f32 %f231, %f2, %f227; fma.rn.f32 %f232, %f1, %f224, %f231; fma.rn.f32 %f233, %f3, %f230, %f232; mul.f32 %f234, %f233, %f102; st.global.f32 [%rd4], %f234; BB0_72: ret; } ` settopologicalcharge_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl settopologicalcharge .visible .entry settopologicalcharge( .param .u64 settopologicalcharge_param_0, .param .u64 settopologicalcharge_param_1, .param .u64 settopologicalcharge_param_2, .param .u64 settopologicalcharge_param_3, .param .f32 settopologicalcharge_param_4, .param .u32 settopologicalcharge_param_5, .param .u32 settopologicalcharge_param_6, .param .u32 settopologicalcharge_param_7, .param .u8 settopologicalcharge_param_8 ) { .reg .pred %p<77>; .reg .b16 %rs<11>; .reg .f32 %f<265>; .reg .b32 %r<97>; .reg .b64 %rd<46>; ld.param.u64 %rd5, [settopologicalcharge_param_0]; ld.param.u64 %rd6, [settopologicalcharge_param_1]; ld.param.u64 %rd7, [settopologicalcharge_param_2]; ld.param.u64 %rd8, [settopologicalcharge_param_3]; ld.param.f32 %f102, [settopologicalcharge_param_4]; ld.param.u32 %r40, [settopologicalcharge_param_5]; ld.param.u32 %r41, [settopologicalcharge_param_6]; 
ld.param.u32 %r42, [settopologicalcharge_param_7]; ld.param.u8 %rs3, [settopologicalcharge_param_8]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r43, %ntid.x; mov.u32 %r44, %ctaid.x; mov.u32 %r45, %tid.x; mad.lo.s32 %r1, %r43, %r44, %r45; mov.u32 %r46, %ntid.y; mov.u32 %r47, %ctaid.y; mov.u32 %r48, %tid.y; mad.lo.s32 %r2, %r46, %r47, %r48; mov.u32 %r49, %ntid.z; mov.u32 %r50, %ctaid.z; mov.u32 %r51, %tid.z; mad.lo.s32 %r3, %r49, %r50, %r51; setp.ge.s32 %p3, %r2, %r41; setp.ge.s32 %p4, %r1, %r40; or.pred %p5, %p3, %p4; setp.ge.s32 %p6, %r3, %r42; or.pred %p7, %p5, %p6; @%p7 bra BB0_72; cvta.to.global.u64 %rd9, %rd5; mul.lo.s32 %r4, %r3, %r41; add.s32 %r52, %r4, %r2; mul.lo.s32 %r5, %r52, %r40; add.s32 %r53, %r5, %r1; mul.wide.s32 %rd10, %r53, 4; add.s64 %rd11, %rd3, %rd10; add.s64 %rd12, %rd2, %rd10; add.s64 %rd13, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; ld.global.nc.f32 %f2, [%rd12]; mul.f32 %f103, %f2, %f2; fma.rn.f32 %f104, %f1, %f1, %f103; ld.global.nc.f32 %f3, [%rd13]; fma.rn.f32 %f105, %f3, %f3, %f104; setp.eq.f32 %p8, %f105, 0f00000000; add.s64 %rd4, %rd9, %rd10; @%p8 bra BB0_71; bra.uni BB0_2; BB0_71: mov.u32 %r88, 0; st.global.u32 [%rd4], %r88; bra.uni BB0_72; BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r6, %r1, -2; @%p9 bra BB0_4; rem.s32 %r54, %r6, %r40; add.s32 %r55, %r54, %r40; rem.s32 %r89, %r55, %r40; bra.uni BB0_5; BB0_4: mov.u32 %r56, 0; max.s32 %r89, %r6, %r56; BB0_5: setp.lt.s32 %p11, %r6, 0; mov.f32 %f7, 0f00000000; and.pred %p12, %p11, %p9; mov.f32 %f8, %f7; mov.f32 %f9, %f7; @%p12 bra BB0_7; add.s32 %r57, %r89, %r5; mul.wide.s32 %rd14, %r57, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f7, [%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f8, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f9, [%rd17]; BB0_7: add.s32 %r10, %r1, -1; @%p9 bra BB0_9; rem.s32 %r58, %r10, %r40; add.s32 %r59, %r58, %r40; rem.s32 %r90, %r59, %r40; bra.uni BB0_10; BB0_9: 
mov.u32 %r60, 0; max.s32 %r90, %r10, %r60; BB0_10: setp.lt.s32 %p14, %r10, 0; mov.f32 %f13, 0f00000000; and.pred %p16, %p14, %p9; mov.f32 %f14, %f13; mov.f32 %f15, %f13; @%p16 bra BB0_12; add.s32 %r61, %r90, %r5; mul.wide.s32 %rd18, %r61, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f13, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f14, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f15, [%rd21]; BB0_12: add.s32 %r14, %r1, 1; @%p9 bra BB0_14; rem.s32 %r62, %r14, %r40; add.s32 %r63, %r62, %r40; rem.s32 %r91, %r63, %r40; bra.uni BB0_15; BB0_14: add.s32 %r64, %r40, -1; min.s32 %r91, %r14, %r64; BB0_15: setp.ge.s32 %p18, %r14, %r40; mov.f32 %f19, 0f00000000; and.pred %p20, %p18, %p9; mov.f32 %f20, %f19; mov.f32 %f21, %f19; @%p20 bra BB0_17; add.s32 %r65, %r91, %r5; mul.wide.s32 %rd22, %r65, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f19, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f20, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f21, [%rd25]; BB0_17: add.s32 %r18, %r1, 2; @%p9 bra BB0_19; rem.s32 %r66, %r18, %r40; add.s32 %r67, %r66, %r40; rem.s32 %r92, %r67, %r40; bra.uni BB0_20; BB0_19: add.s32 %r68, %r40, -1; min.s32 %r92, %r18, %r68; BB0_20: add.s32 %r22, %r92, %r5; setp.ge.s32 %p22, %r18, %r40; mov.f32 %f25, 0f00000000; and.pred %p24, %p22, %p9; mov.f32 %f26, %f25; mov.f32 %f27, %f25; @%p24 bra BB0_22; mul.wide.s32 %rd26, %r22, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f25, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f26, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f27, [%rd29]; BB0_22: mul.f32 %f118, %f20, %f20; fma.rn.f32 %f119, %f19, %f19, %f118; fma.rn.f32 %f28, %f21, %f21, %f119; setp.neu.f32 %p25, %f28, 0f00000000; @%p25 bra BB0_24; mul.f32 %f123, %f14, %f14; fma.rn.f32 %f124, %f13, %f13, %f123; fma.rn.f32 %f125, %f15, %f15, %f124; setp.eq.f32 %p26, %f125, 0f00000000; mov.f32 %f247, 0f00000000; mov.f32 %f248, %f247; mov.f32 %f249, %f247; @%p26 bra BB0_36; BB0_24: mul.f32 %f126, 
%f8, %f8; fma.rn.f32 %f127, %f7, %f7, %f126; fma.rn.f32 %f29, %f9, %f9, %f127; setp.neu.f32 %p27, %f29, 0f00000000; mul.f32 %f128, %f26, %f26; fma.rn.f32 %f129, %f25, %f25, %f128; fma.rn.f32 %f30, %f27, %f27, %f129; setp.neu.f32 %p28, %f30, 0f00000000; and.pred %p29, %p27, %p28; setp.eq.f32 %p30, %f28, 0f00000000; or.pred %p31, %p29, %p30; @%p31 bra BB0_26; mul.f32 %f130, %f14, %f14; fma.rn.f32 %f131, %f13, %f13, %f130; fma.rn.f32 %f132, %f15, %f15, %f131; setp.neu.f32 %p32, %f132, 0f00000000; @%p32 bra BB0_35; bra.uni BB0_26; BB0_35: sub.f32 %f161, %f19, %f13; mul.f32 %f247, %f161, 0f3F000000; sub.f32 %f162, %f20, %f14; mul.f32 %f248, %f162, 0f3F000000; sub.f32 %f163, %f21, %f15; mul.f32 %f249, %f163, 0f3F000000; bra.uni BB0_36; BB0_26: or.pred %p34, %p25, %p27; @%p34 bra BB0_28; bra.uni BB0_27; BB0_28: mul.f32 %f133, %f14, %f14; fma.rn.f32 %f134, %f13, %f13, %f133; fma.rn.f32 %f34, %f15, %f15, %f134; setp.neu.f32 %p35, %f34, 0f00000000; or.pred %p37, %p35, %p28; @%p37 bra BB0_30; bra.uni BB0_29; BB0_30: setp.eq.f32 %p38, %f29, 0f00000000; or.pred %p39, %p38, %p25; @%p39 bra BB0_32; bra.uni BB0_31; BB0_32: setp.eq.f32 %p41, %f30, 0f00000000; or.pred %p42, %p41, %p35; @%p42 bra BB0_34; bra.uni BB0_33; BB0_34: sub.f32 %f152, %f19, %f13; sub.f32 %f153, %f20, %f14; sub.f32 %f154, %f21, %f15; sub.f32 %f155, %f7, %f25; mul.f32 %f156, %f155, 0f3DAAAAAB; sub.f32 %f157, %f8, %f26; mul.f32 %f158, %f157, 0f3DAAAAAB; sub.f32 %f159, %f9, %f27; mul.f32 %f160, %f159, 0f3DAAAAAB; fma.rn.f32 %f247, %f152, 0f3F2AAAAB, %f156; fma.rn.f32 %f248, %f153, 0f3F2AAAAB, %f158; fma.rn.f32 %f249, %f154, 0f3F2AAAAB, %f160; bra.uni BB0_36; BB0_27: sub.f32 %f247, %f1, %f13; sub.f32 %f248, %f2, %f14; sub.f32 %f249, %f3, %f15; bra.uni BB0_36; BB0_29: sub.f32 %f247, %f19, %f1; sub.f32 %f248, %f20, %f2; sub.f32 %f249, %f21, %f3; bra.uni BB0_36; BB0_31: mul.f32 %f135, %f13, 0fC0000000; fma.rn.f32 %f136, %f7, 0f3F000000, %f135; add.f32 %f137, %f14, %f14; mul.f32 %f138, %f8, 0f3F000000; sub.f32 %f139, 
%f138, %f137; add.f32 %f140, %f15, %f15; mul.f32 %f141, %f9, 0f3F000000; sub.f32 %f142, %f141, %f140; fma.rn.f32 %f247, %f1, 0f3FC00000, %f136; fma.rn.f32 %f248, %f2, 0f3FC00000, %f139; fma.rn.f32 %f249, %f3, 0f3FC00000, %f142; bra.uni BB0_36; BB0_33: mul.f32 %f143, %f25, 0fBF000000; fma.rn.f32 %f144, %f19, 0f40000000, %f143; mul.f32 %f145, %f26, 0fBF000000; fma.rn.f32 %f146, %f20, 0f40000000, %f145; mul.f32 %f147, %f27, 0fBF000000; fma.rn.f32 %f148, %f21, 0f40000000, %f147; mul.f32 %f149, %f1, 0f3FC00000; sub.f32 %f247, %f144, %f149; mul.f32 %f150, %f2, 0f3FC00000; sub.f32 %f248, %f146, %f150; mul.f32 %f151, %f3, 0f3FC00000; sub.f32 %f249, %f148, %f151; BB0_36: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p43, %rs2, 0; add.s32 %r23, %r2, -2; @%p43 bra BB0_38; rem.s32 %r69, %r23, %r41; add.s32 %r70, %r69, %r41; rem.s32 %r93, %r70, %r41; bra.uni BB0_39; BB0_38: mov.u32 %r71, 0; max.s32 %r93, %r23, %r71; BB0_39: setp.lt.s32 %p45, %r23, 0; mov.f32 %f56, 0f00000000; and.pred %p46, %p45, %p43; mov.f32 %f57, %f56; mov.f32 %f58, %f56; @%p46 bra BB0_41; add.s32 %r72, %r93, %r4; mad.lo.s32 %r73, %r72, %r40, %r1; mul.wide.s32 %rd30, %r73, 4; add.s64 %rd31, %rd3, %rd30; ld.global.nc.f32 %f56, [%rd31]; add.s64 %rd32, %rd2, %rd30; ld.global.nc.f32 %f57, [%rd32]; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f58, [%rd33]; BB0_41: add.s32 %r27, %r2, -1; @%p43 bra BB0_43; rem.s32 %r74, %r27, %r41; add.s32 %r75, %r74, %r41; rem.s32 %r94, %r75, %r41; bra.uni BB0_44; BB0_43: mov.u32 %r76, 0; max.s32 %r94, %r27, %r76; BB0_44: setp.lt.s32 %p48, %r27, 0; mov.f32 %f62, 0f00000000; and.pred %p50, %p48, %p43; mov.f32 %f63, %f62; mov.f32 %f64, %f62; @%p50 bra BB0_46; add.s32 %r77, %r94, %r4; mad.lo.s32 %r78, %r77, %r40, %r1; mul.wide.s32 %rd34, %r78, 4; add.s64 %rd35, %rd3, %rd34; ld.global.nc.f32 %f62, [%rd35]; add.s64 %rd36, %rd2, %rd34; ld.global.nc.f32 %f63, [%rd36]; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f64, [%rd37]; BB0_46: add.s32 %r31, %r2, 1; @%p43 bra BB0_48; rem.s32 %r79, %r31, 
%r41; add.s32 %r80, %r79, %r41; rem.s32 %r95, %r80, %r41; bra.uni BB0_49; BB0_48: add.s32 %r81, %r41, -1; min.s32 %r95, %r31, %r81; BB0_49: setp.ge.s32 %p52, %r31, %r41; mov.f32 %f68, 0f00000000; and.pred %p54, %p52, %p43; mov.f32 %f69, %f68; mov.f32 %f70, %f68; @%p54 bra BB0_51; add.s32 %r82, %r95, %r4; mad.lo.s32 %r83, %r82, %r40, %r1; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f68, [%rd39]; add.s64 %rd40, %rd2, %rd38; ld.global.nc.f32 %f69, [%rd40]; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f70, [%rd41]; BB0_51: add.s32 %r35, %r2, 2; @%p43 bra BB0_53; rem.s32 %r84, %r35, %r41; add.s32 %r85, %r84, %r41; rem.s32 %r96, %r85, %r41; bra.uni BB0_54; BB0_53: add.s32 %r86, %r41, -1; min.s32 %r96, %r35, %r86; BB0_54: add.s32 %r87, %r96, %r4; mad.lo.s32 %r39, %r87, %r40, %r1; setp.ge.s32 %p56, %r35, %r41; mov.f32 %f74, 0f00000000; and.pred %p58, %p56, %p43; mov.f32 %f75, %f74; mov.f32 %f76, %f74; @%p58 bra BB0_56; mul.wide.s32 %rd42, %r39, 4; add.s64 %rd43, %rd3, %rd42; ld.global.nc.f32 %f74, [%rd43]; add.s64 %rd44, %rd2, %rd42; ld.global.nc.f32 %f75, [%rd44]; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f76, [%rd45]; BB0_56: mul.f32 %f176, %f69, %f69; fma.rn.f32 %f177, %f68, %f68, %f176; fma.rn.f32 %f77, %f70, %f70, %f177; setp.neu.f32 %p59, %f77, 0f00000000; @%p59 bra BB0_58; mul.f32 %f181, %f63, %f63; fma.rn.f32 %f182, %f62, %f62, %f181; fma.rn.f32 %f183, %f64, %f64, %f182; setp.eq.f32 %p60, %f183, 0f00000000; mov.f32 %f262, 0f00000000; mov.f32 %f263, %f262; mov.f32 %f264, %f262; @%p60 bra BB0_70; BB0_58: mul.f32 %f184, %f57, %f57; fma.rn.f32 %f185, %f56, %f56, %f184; fma.rn.f32 %f78, %f58, %f58, %f185; setp.neu.f32 %p61, %f78, 0f00000000; mul.f32 %f186, %f75, %f75; fma.rn.f32 %f187, %f74, %f74, %f186; fma.rn.f32 %f79, %f76, %f76, %f187; setp.neu.f32 %p62, %f79, 0f00000000; and.pred %p63, %p61, %p62; setp.eq.f32 %p64, %f77, 0f00000000; or.pred %p65, %p63, %p64; @%p65 bra BB0_60; mul.f32 %f188, %f63, %f63; fma.rn.f32 %f189, %f62, 
%f62, %f188; fma.rn.f32 %f190, %f64, %f64, %f189; setp.neu.f32 %p66, %f190, 0f00000000; @%p66 bra BB0_69; bra.uni BB0_60; BB0_69: sub.f32 %f219, %f68, %f62; mul.f32 %f262, %f219, 0f3F000000; sub.f32 %f220, %f69, %f63; mul.f32 %f263, %f220, 0f3F000000; sub.f32 %f221, %f70, %f64; mul.f32 %f264, %f221, 0f3F000000; bra.uni BB0_70; BB0_60: or.pred %p68, %p59, %p61; @%p68 bra BB0_62; bra.uni BB0_61; BB0_62: mul.f32 %f191, %f63, %f63; fma.rn.f32 %f192, %f62, %f62, %f191; fma.rn.f32 %f83, %f64, %f64, %f192; setp.neu.f32 %p69, %f83, 0f00000000; or.pred %p71, %p69, %p62; @%p71 bra BB0_64; bra.uni BB0_63; BB0_64: setp.eq.f32 %p72, %f78, 0f00000000; or.pred %p73, %p72, %p59; @%p73 bra BB0_66; bra.uni BB0_65; BB0_66: setp.eq.f32 %p75, %f79, 0f00000000; or.pred %p76, %p75, %p69; @%p76 bra BB0_68; bra.uni BB0_67; BB0_68: sub.f32 %f210, %f68, %f62; sub.f32 %f211, %f69, %f63; sub.f32 %f212, %f70, %f64; sub.f32 %f213, %f56, %f74; mul.f32 %f214, %f213, 0f3DAAAAAB; sub.f32 %f215, %f57, %f75; mul.f32 %f216, %f215, 0f3DAAAAAB; sub.f32 %f217, %f58, %f76; mul.f32 %f218, %f217, 0f3DAAAAAB; fma.rn.f32 %f262, %f210, 0f3F2AAAAB, %f214; fma.rn.f32 %f263, %f211, 0f3F2AAAAB, %f216; fma.rn.f32 %f264, %f212, 0f3F2AAAAB, %f218; bra.uni BB0_70; BB0_61: sub.f32 %f262, %f1, %f62; sub.f32 %f263, %f2, %f63; sub.f32 %f264, %f3, %f64; bra.uni BB0_70; BB0_63: sub.f32 %f262, %f68, %f1; sub.f32 %f263, %f69, %f2; sub.f32 %f264, %f70, %f3; bra.uni BB0_70; BB0_65: mul.f32 %f193, %f62, 0fC0000000; fma.rn.f32 %f194, %f56, 0f3F000000, %f193; add.f32 %f195, %f63, %f63; mul.f32 %f196, %f57, 0f3F000000; sub.f32 %f197, %f196, %f195; add.f32 %f198, %f64, %f64; mul.f32 %f199, %f58, 0f3F000000; sub.f32 %f200, %f199, %f198; fma.rn.f32 %f262, %f1, 0f3FC00000, %f194; fma.rn.f32 %f263, %f2, 0f3FC00000, %f197; fma.rn.f32 %f264, %f3, 0f3FC00000, %f200; bra.uni BB0_70; BB0_67: mul.f32 %f201, %f74, 0fBF000000; fma.rn.f32 %f202, %f68, 0f40000000, %f201; mul.f32 %f203, %f75, 0fBF000000; fma.rn.f32 %f204, %f69, 0f40000000, %f203; 
mul.f32 %f205, %f76, 0fBF000000; fma.rn.f32 %f206, %f70, 0f40000000, %f205; mul.f32 %f207, %f1, 0f3FC00000; sub.f32 %f262, %f202, %f207; mul.f32 %f208, %f2, 0f3FC00000; sub.f32 %f263, %f204, %f208; mul.f32 %f209, %f3, 0f3FC00000; sub.f32 %f264, %f206, %f209; BB0_70: mul.f32 %f222, %f249, %f263; mul.f32 %f223, %f248, %f264; sub.f32 %f224, %f223, %f222; mul.f32 %f225, %f247, %f264; mul.f32 %f226, %f249, %f262; sub.f32 %f227, %f226, %f225; mul.f32 %f228, %f248, %f262; mul.f32 %f229, %f247, %f263; sub.f32 %f230, %f229, %f228; mul.f32 %f231, %f2, %f227; fma.rn.f32 %f232, %f1, %f224, %f231; fma.rn.f32 %f233, %f3, %f230, %f232; mul.f32 %f234, %f233, %f102; st.global.f32 [%rd4], %f234; BB0_72: ret; } ` ) mumax3-3.10/cuda/topologicalchargelattice.cu000066400000000000000000000073021371432437400211620ustar00rootroot00000000000000#include #include #include #include "exchange.h" #include "float3.h" #include "stencil.h" // Returns the topological charge contribution on an elementary triangle ijk // Order of arguments is important here to preserve the same measure of chirality // Note: the result is zero if an argument is zero, or when two arguments are the same __device__ inline float triangleCharge(float3 mi, float3 mj, float3 mk) { float numer = dot(mi, cross(mj, mk)); float denom = 1.0f + dot(mi, mj) + dot(mi, mk) + dot(mj, mk); return 2.0f * atan2(numer, denom); } // Set s to the toplogogical charge density for lattices based on the solid angle // subtended by triangle associated with three spins: a,b,c // // s = 2 atan[(a . b x c /(1 + a.b + a.c + b.c)] / (dx dy) // // After M Boettcher et al, New J Phys 20, 103014 (2018), adapted from // B. Berg and M. Luescher, Nucl. Phys. B 190, 412 (1981), and implemented by // Joo-Von Kim. // // A unit cell comprises two triangles, but s is a site-dependent quantity so we // double-count and average over four triangles. 
extern "C" __global__ void settopologicalchargelattice(float* __restrict__ s, float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz, float icxcy, int Nx, int Ny, int Nz, uint8_t PBC) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int iz = blockIdx.z * blockDim.z + threadIdx.z; if (ix >= Nx || iy >= Ny || iz >= Nz) { return; } int i0 = idx(ix, iy, iz); // central cell index float3 m0 = make_float3(mx[i0], my[i0], mz[i0]); // central cell magnetization if(is0(m0)) { s[i0] = 0.0f; return; } // indices of the 4 neighbors (counter clockwise) int i1 = idx(hclampx(ix+1), iy, iz); // (i+1,j) int i2 = idx(ix, hclampy(iy+1), iz); // (i,j+1) int i3 = idx(lclampx(ix-1), iy, iz); // (i-1,j) int i4 = idx(ix, lclampy(iy-1), iz); // (i,j-1) // magnetization of the 4 neighbors float3 m1 = make_float3(mx[i1], my[i1], mz[i1]); float3 m2 = make_float3(mx[i2], my[i2], mz[i2]); float3 m3 = make_float3(mx[i3], my[i3], mz[i3]); float3 m4 = make_float3(mx[i4], my[i4], mz[i4]); // local topological charge (accumulator) float topcharge = 0.0; // charge contribution from the upper right triangle // if diagonally opposite neighbor is not zero, use a weight of 1/2 to avoid counting charges twice if ((ix+1=0 || PBCx) && (iy+1=0 || PBCx) && (iy-1>=0 || PBCy)) { int i_ = idx(lclampx(ix-1), lclampy(iy-1), iz); float3 m_ = make_float3(mx[i_], my[i_], mz[i_]); float weight = is0(m_) ? 1 : 0.5; topcharge += weight * triangleCharge(m0, m3, m4); } // bottom right if ((ix+1=0 || PBCy)) { int i_ = idx(hclampx(ix+1), lclampy(iy-1), iz); float3 m_ = make_float3(mx[i_], my[i_], mz[i_]); float weight = is0(m_) ? 
1 : 0.5; topcharge += weight * triangleCharge(m0, m4, m1); } s[i0] = icxcy * topcharge; } mumax3-3.10/cuda/topologicalchargelattice.go000066400000000000000000000007731371432437400211650ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/data" "github.com/mumax/3/util" ) // Topological charge according to Berg and Lüscher func SetTopologicalChargeLattice(s *data.Slice, m *data.Slice, mesh *data.Mesh) { cellsize := mesh.CellSize() N := s.Size() util.Argument(m.Size() == N) cfg := make3DConf(N) icxcy := float32(1.0 / (cellsize[X] * cellsize[Y])) k_settopologicalchargelattice_async(s.DevPtr(X), m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z), icxcy, N[X], N[Y], N[Z], mesh.PBC_code(), cfg) } mumax3-3.10/cuda/topologicalchargelattice_wrapper.go000066400000000000000000000544331371432437400227270ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for settopologicalchargelattice kernel var settopologicalchargelattice_code cu.Function // Stores the arguments for settopologicalchargelattice kernel invocation type settopologicalchargelattice_args_t struct { arg_s unsafe.Pointer arg_mx unsafe.Pointer arg_my unsafe.Pointer arg_mz unsafe.Pointer arg_icxcy float32 arg_Nx int arg_Ny int arg_Nz int arg_PBC byte argptr [9]unsafe.Pointer sync.Mutex } // Stores the arguments for settopologicalchargelattice kernel invocation var settopologicalchargelattice_args settopologicalchargelattice_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
settopologicalchargelattice_args.argptr[0] = unsafe.Pointer(&settopologicalchargelattice_args.arg_s) settopologicalchargelattice_args.argptr[1] = unsafe.Pointer(&settopologicalchargelattice_args.arg_mx) settopologicalchargelattice_args.argptr[2] = unsafe.Pointer(&settopologicalchargelattice_args.arg_my) settopologicalchargelattice_args.argptr[3] = unsafe.Pointer(&settopologicalchargelattice_args.arg_mz) settopologicalchargelattice_args.argptr[4] = unsafe.Pointer(&settopologicalchargelattice_args.arg_icxcy) settopologicalchargelattice_args.argptr[5] = unsafe.Pointer(&settopologicalchargelattice_args.arg_Nx) settopologicalchargelattice_args.argptr[6] = unsafe.Pointer(&settopologicalchargelattice_args.arg_Ny) settopologicalchargelattice_args.argptr[7] = unsafe.Pointer(&settopologicalchargelattice_args.arg_Nz) settopologicalchargelattice_args.argptr[8] = unsafe.Pointer(&settopologicalchargelattice_args.arg_PBC) } // Wrapper for settopologicalchargelattice CUDA kernel, asynchronous. func k_settopologicalchargelattice_async(s unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, icxcy float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) { if Synchronous { // debug Sync() timer.Start("settopologicalchargelattice") } settopologicalchargelattice_args.Lock() defer settopologicalchargelattice_args.Unlock() if settopologicalchargelattice_code == 0 { settopologicalchargelattice_code = fatbinLoad(settopologicalchargelattice_map, "settopologicalchargelattice") } settopologicalchargelattice_args.arg_s = s settopologicalchargelattice_args.arg_mx = mx settopologicalchargelattice_args.arg_my = my settopologicalchargelattice_args.arg_mz = mz settopologicalchargelattice_args.arg_icxcy = icxcy settopologicalchargelattice_args.arg_Nx = Nx settopologicalchargelattice_args.arg_Ny = Ny settopologicalchargelattice_args.arg_Nz = Nz settopologicalchargelattice_args.arg_PBC = PBC args := settopologicalchargelattice_args.argptr[:] 
cu.LaunchKernel(settopologicalchargelattice_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("settopologicalchargelattice") } } // maps compute capability on PTX code for settopologicalchargelattice kernel. var settopologicalchargelattice_map = map[int]string{0: "", 70: settopologicalchargelattice_ptx_70} // settopologicalchargelattice PTX code for various compute capabilities. const ( settopologicalchargelattice_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl settopologicalchargelattice .visible .entry settopologicalchargelattice( .param .u64 settopologicalchargelattice_param_0, .param .u64 settopologicalchargelattice_param_1, .param .u64 settopologicalchargelattice_param_2, .param .u64 settopologicalchargelattice_param_3, .param .f32 settopologicalchargelattice_param_4, .param .u32 settopologicalchargelattice_param_5, .param .u32 settopologicalchargelattice_param_6, .param .u32 settopologicalchargelattice_param_7, .param .u8 settopologicalchargelattice_param_8 ) { .reg .pred %p<83>; .reg .b16 %rs<13>; .reg .f32 %f<295>; .reg .b32 %r<171>; .reg .b64 %rd<46>; ld.param.u64 %rd5, [settopologicalchargelattice_param_0]; ld.param.u64 %rd6, [settopologicalchargelattice_param_1]; ld.param.u64 %rd7, [settopologicalchargelattice_param_2]; ld.param.u64 %rd8, [settopologicalchargelattice_param_3]; ld.param.f32 %f52, [settopologicalchargelattice_param_4]; ld.param.u32 %r58, [settopologicalchargelattice_param_5]; ld.param.u32 %r59, [settopologicalchargelattice_param_6]; ld.param.u32 %r60, [settopologicalchargelattice_param_7]; ld.param.u8 %rs3, [settopologicalchargelattice_param_8]; cvta.to.global.u64 %rd1, %rd8; cvta.to.global.u64 %rd2, %rd7; cvta.to.global.u64 %rd3, %rd6; mov.u32 %r61, %ntid.x; mov.u32 %r62, %ctaid.x; mov.u32 %r63, %tid.x; mad.lo.s32 %r1, %r61, %r62, %r63; mov.u32 %r64, %ntid.y; mov.u32 %r65, %ctaid.y; mov.u32 %r66, %tid.y; mad.lo.s32 %r2, %r64, 
%r65, %r66; mov.u32 %r67, %ntid.z; mov.u32 %r68, %ctaid.z; mov.u32 %r69, %tid.z; mad.lo.s32 %r3, %r67, %r68, %r69; setp.ge.s32 %p3, %r2, %r59; setp.ge.s32 %p4, %r1, %r58; or.pred %p5, %p3, %p4; setp.ge.s32 %p6, %r3, %r60; or.pred %p7, %p5, %p6; @%p7 bra BB0_72; cvta.to.global.u64 %rd9, %rd5; mul.lo.s32 %r4, %r3, %r59; add.s32 %r70, %r4, %r2; mul.lo.s32 %r5, %r70, %r58; add.s32 %r71, %r5, %r1; mul.wide.s32 %rd10, %r71, 4; add.s64 %rd11, %rd3, %rd10; add.s64 %rd12, %rd2, %rd10; add.s64 %rd13, %rd1, %rd10; ld.global.nc.f32 %f1, [%rd11]; ld.global.nc.f32 %f2, [%rd12]; mul.f32 %f53, %f2, %f2; fma.rn.f32 %f54, %f1, %f1, %f53; ld.global.nc.f32 %f3, [%rd13]; fma.rn.f32 %f55, %f3, %f3, %f54; setp.eq.f32 %p8, %f55, 0f00000000; add.s64 %rd4, %rd9, %rd10; @%p8 bra BB0_71; bra.uni BB0_2; BB0_71: mov.u32 %r158, 0; st.global.u32 [%rd4], %r158; bra.uni BB0_72; BB0_2: and.b16 %rs1, %rs3, 1; setp.eq.s16 %p9, %rs1, 0; add.s32 %r6, %r1, 1; @%p9 bra BB0_4; rem.s32 %r72, %r6, %r58; add.s32 %r73, %r72, %r58; rem.s32 %r159, %r73, %r58; bra.uni BB0_5; BB0_4: add.s32 %r74, %r58, -1; min.s32 %r159, %r6, %r74; BB0_5: and.b16 %rs2, %rs3, 2; setp.eq.s16 %p10, %rs2, 0; add.s32 %r10, %r2, 1; @%p10 bra BB0_7; rem.s32 %r75, %r10, %r59; add.s32 %r76, %r75, %r59; rem.s32 %r160, %r76, %r59; bra.uni BB0_8; BB0_7: add.s32 %r77, %r59, -1; min.s32 %r160, %r10, %r77; BB0_8: add.s32 %r14, %r1, -1; @%p9 bra BB0_10; rem.s32 %r78, %r14, %r58; add.s32 %r79, %r78, %r58; rem.s32 %r161, %r79, %r58; bra.uni BB0_11; BB0_10: mov.u32 %r80, 0; max.s32 %r161, %r14, %r80; BB0_11: add.s32 %r18, %r159, %r5; add.s32 %r81, %r160, %r4; mad.lo.s32 %r19, %r81, %r58, %r1; add.s32 %r20, %r161, %r5; add.s32 %r21, %r2, -1; @%p10 bra BB0_13; rem.s32 %r82, %r21, %r59; add.s32 %r83, %r82, %r59; rem.s32 %r162, %r83, %r59; bra.uni BB0_14; BB0_13: mov.u32 %r84, 0; max.s32 %r162, %r21, %r84; BB0_14: add.s32 %r85, %r162, %r4; mad.lo.s32 %r86, %r85, %r58, %r1; mul.wide.s32 %rd14, %r18, 4; add.s64 %rd15, %rd3, %rd14; ld.global.nc.f32 %f4, 
[%rd15]; add.s64 %rd16, %rd2, %rd14; ld.global.nc.f32 %f5, [%rd16]; add.s64 %rd17, %rd1, %rd14; ld.global.nc.f32 %f6, [%rd17]; mul.wide.s32 %rd18, %r19, 4; add.s64 %rd19, %rd3, %rd18; ld.global.nc.f32 %f7, [%rd19]; add.s64 %rd20, %rd2, %rd18; ld.global.nc.f32 %f8, [%rd20]; add.s64 %rd21, %rd1, %rd18; ld.global.nc.f32 %f9, [%rd21]; mul.wide.s32 %rd22, %r20, 4; add.s64 %rd23, %rd3, %rd22; ld.global.nc.f32 %f10, [%rd23]; add.s64 %rd24, %rd2, %rd22; ld.global.nc.f32 %f11, [%rd24]; add.s64 %rd25, %rd1, %rd22; ld.global.nc.f32 %f12, [%rd25]; mul.wide.s32 %rd26, %r86, 4; add.s64 %rd27, %rd3, %rd26; ld.global.nc.f32 %f13, [%rd27]; add.s64 %rd28, %rd2, %rd26; ld.global.nc.f32 %f14, [%rd28]; add.s64 %rd29, %rd1, %rd26; ld.global.nc.f32 %f15, [%rd29]; setp.ne.s16 %p14, %rs1, 0; setp.ge.s32 %p15, %r6, %r58; setp.lt.s32 %p16, %r6, %r58; or.pred %p1, %p16, %p14; mov.f32 %f290, 0f00000000; and.pred %p17, %p15, %p9; @%p17 bra BB0_28; setp.ge.s32 %p18, %r10, %r59; and.pred %p20, %p18, %p10; @%p20 bra BB0_28; @%p10 bra BB0_18; rem.s32 %r87, %r10, %r59; add.s32 %r88, %r87, %r59; rem.s32 %r163, %r88, %r59; bra.uni BB0_19; BB0_18: add.s32 %r89, %r59, -1; min.s32 %r163, %r10, %r89; BB0_19: @%p9 bra BB0_21; rem.s32 %r90, %r6, %r58; add.s32 %r91, %r90, %r58; rem.s32 %r164, %r91, %r58; bra.uni BB0_22; BB0_21: add.s32 %r92, %r58, -1; min.s32 %r164, %r6, %r92; BB0_22: add.s32 %r93, %r163, %r4; mad.lo.s32 %r94, %r93, %r58, %r164; mul.wide.s32 %rd30, %r94, 4; add.s64 %rd31, %rd3, %rd30; add.s64 %rd32, %rd2, %rd30; add.s64 %rd33, %rd1, %rd30; ld.global.nc.f32 %f58, [%rd31]; ld.global.nc.f32 %f59, [%rd32]; mul.f32 %f60, %f59, %f59; fma.rn.f32 %f61, %f58, %f58, %f60; ld.global.nc.f32 %f62, [%rd33]; fma.rn.f32 %f16, %f62, %f62, %f61; mul.f32 %f63, %f6, %f8; mul.f32 %f64, %f5, %f9; sub.f32 %f65, %f64, %f63; mul.f32 %f66, %f4, %f9; mul.f32 %f67, %f6, %f7; sub.f32 %f68, %f67, %f66; mul.f32 %f69, %f5, %f7; mul.f32 %f70, %f4, %f8; sub.f32 %f71, %f70, %f69; mul.f32 %f72, %f2, %f68; fma.rn.f32 %f73, %f1, 
%f65, %f72; fma.rn.f32 %f74, %f3, %f71, %f73; mul.f32 %f75, %f2, %f5; fma.rn.f32 %f76, %f1, %f4, %f75; fma.rn.f32 %f77, %f3, %f6, %f76; add.f32 %f78, %f77, 0f3F800000; mul.f32 %f79, %f2, %f8; fma.rn.f32 %f80, %f1, %f7, %f79; fma.rn.f32 %f81, %f3, %f9, %f80; add.f32 %f82, %f78, %f81; mul.f32 %f83, %f5, %f8; fma.rn.f32 %f84, %f4, %f7, %f83; fma.rn.f32 %f85, %f6, %f9, %f84; add.f32 %f86, %f85, %f82; abs.f32 %f17, %f86; abs.f32 %f18, %f74; setp.eq.f32 %p23, %f17, 0f00000000; setp.eq.f32 %p24, %f18, 0f00000000; and.pred %p25, %p23, %p24; mov.b32 %r31, %f86; mov.b32 %r95, %f74; and.b32 %r32, %r95, -2147483648; @%p25 bra BB0_26; bra.uni BB0_23; BB0_26: shr.s32 %r102, %r31, 31; and.b32 %r103, %r102, 1078530011; or.b32 %r104, %r103, %r32; mov.b32 %f287, %r104; bra.uni BB0_27; BB0_23: setp.eq.f32 %p26, %f17, 0f7F800000; setp.eq.f32 %p27, %f18, 0f7F800000; and.pred %p28, %p26, %p27; @%p28 bra BB0_25; bra.uni BB0_24; BB0_25: shr.s32 %r98, %r31, 31; and.b32 %r99, %r98, 13483017; add.s32 %r100, %r99, 1061752795; or.b32 %r101, %r100, %r32; mov.b32 %f287, %r101; bra.uni BB0_27; BB0_24: max.f32 %f87, %f18, %f17; min.f32 %f88, %f18, %f17; div.rn.f32 %f89, %f88, %f87; mul.rn.f32 %f90, %f89, %f89; mov.f32 %f91, 0fC0B59883; mov.f32 %f92, 0fBF52C7EA; fma.rn.f32 %f93, %f90, %f92, %f91; mov.f32 %f94, 0fC0D21907; fma.rn.f32 %f95, %f93, %f90, %f94; mul.f32 %f96, %f90, %f95; mul.f32 %f97, %f89, %f96; add.f32 %f98, %f90, 0f41355DC0; mov.f32 %f99, 0f41E6BD60; fma.rn.f32 %f100, %f98, %f90, %f99; mov.f32 %f101, 0f419D92C8; fma.rn.f32 %f102, %f100, %f90, %f101; rcp.rn.f32 %f103, %f102; fma.rn.f32 %f104, %f97, %f103, %f89; mov.f32 %f105, 0f3FC90FDB; sub.f32 %f106, %f105, %f104; setp.gt.f32 %p29, %f18, %f17; selp.f32 %f107, %f106, %f104, %p29; mov.f32 %f108, 0f40490FDB; sub.f32 %f109, %f108, %f107; setp.lt.s32 %p30, %r31, 0; selp.f32 %f110, %f109, %f107, %p30; mov.b32 %r96, %f110; or.b32 %r97, %r96, %r32; mov.b32 %f111, %r97; add.f32 %f112, %f17, %f18; setp.gtu.f32 %p31, %f112, 0f7F800000; selp.f32 
%f287, %f112, %f111, %p31; BB0_27: add.f32 %f113, %f287, %f287; setp.eq.f32 %p32, %f16, 0f00000000; selp.f32 %f114, 0f3F800000, 0f3F000000, %p32; fma.rn.f32 %f290, %f114, %f113, 0f00000000; BB0_28: setp.lt.s32 %p33, %r14, 0; setp.gt.s32 %p34, %r14, -1; or.pred %p2, %p34, %p14; and.pred %p37, %p33, %p9; @%p37 bra BB0_42; setp.ge.s32 %p38, %r10, %r59; and.pred %p40, %p38, %p10; @%p40 bra BB0_42; @%p10 bra BB0_32; rem.s32 %r105, %r10, %r59; add.s32 %r106, %r105, %r59; rem.s32 %r165, %r106, %r59; bra.uni BB0_33; BB0_32: add.s32 %r107, %r59, -1; min.s32 %r165, %r10, %r107; BB0_33: @%p9 bra BB0_35; rem.s32 %r108, %r14, %r58; add.s32 %r109, %r108, %r58; rem.s32 %r166, %r109, %r58; bra.uni BB0_36; BB0_35: mov.u32 %r110, 0; max.s32 %r166, %r14, %r110; BB0_36: add.s32 %r111, %r165, %r4; mad.lo.s32 %r112, %r111, %r58, %r166; mul.wide.s32 %rd34, %r112, 4; add.s64 %rd35, %rd3, %rd34; add.s64 %rd36, %rd2, %rd34; add.s64 %rd37, %rd1, %rd34; ld.global.nc.f32 %f115, [%rd35]; ld.global.nc.f32 %f116, [%rd36]; mul.f32 %f117, %f116, %f116; fma.rn.f32 %f118, %f115, %f115, %f117; ld.global.nc.f32 %f119, [%rd37]; fma.rn.f32 %f25, %f119, %f119, %f118; mul.f32 %f120, %f9, %f11; mul.f32 %f121, %f8, %f12; sub.f32 %f122, %f121, %f120; mul.f32 %f123, %f7, %f12; mul.f32 %f124, %f9, %f10; sub.f32 %f125, %f124, %f123; mul.f32 %f126, %f8, %f10; mul.f32 %f127, %f7, %f11; sub.f32 %f128, %f127, %f126; mul.f32 %f129, %f2, %f125; fma.rn.f32 %f130, %f1, %f122, %f129; fma.rn.f32 %f131, %f3, %f128, %f130; mul.f32 %f132, %f2, %f8; fma.rn.f32 %f133, %f1, %f7, %f132; fma.rn.f32 %f134, %f3, %f9, %f133; add.f32 %f135, %f134, 0f3F800000; mul.f32 %f136, %f2, %f11; fma.rn.f32 %f137, %f1, %f10, %f136; fma.rn.f32 %f138, %f3, %f12, %f137; add.f32 %f139, %f135, %f138; mul.f32 %f140, %f8, %f11; fma.rn.f32 %f141, %f7, %f10, %f140; fma.rn.f32 %f142, %f9, %f12, %f141; add.f32 %f143, %f142, %f139; abs.f32 %f26, %f143; abs.f32 %f27, %f131; setp.eq.f32 %p43, %f26, 0f00000000; setp.eq.f32 %p44, %f27, 0f00000000; and.pred 
%p45, %p43, %p44; mov.b32 %r39, %f143; mov.b32 %r113, %f131; and.b32 %r40, %r113, -2147483648; @%p45 bra BB0_40; bra.uni BB0_37; BB0_40: shr.s32 %r120, %r39, 31; and.b32 %r121, %r120, 1078530011; or.b32 %r122, %r121, %r40; mov.b32 %f289, %r122; bra.uni BB0_41; BB0_37: setp.eq.f32 %p46, %f26, 0f7F800000; setp.eq.f32 %p47, %f27, 0f7F800000; and.pred %p48, %p46, %p47; @%p48 bra BB0_39; bra.uni BB0_38; BB0_39: shr.s32 %r116, %r39, 31; and.b32 %r117, %r116, 13483017; add.s32 %r118, %r117, 1061752795; or.b32 %r119, %r118, %r40; mov.b32 %f289, %r119; bra.uni BB0_41; BB0_38: max.f32 %f144, %f27, %f26; min.f32 %f145, %f27, %f26; div.rn.f32 %f146, %f145, %f144; mul.rn.f32 %f147, %f146, %f146; mov.f32 %f148, 0fC0B59883; mov.f32 %f149, 0fBF52C7EA; fma.rn.f32 %f150, %f147, %f149, %f148; mov.f32 %f151, 0fC0D21907; fma.rn.f32 %f152, %f150, %f147, %f151; mul.f32 %f153, %f147, %f152; mul.f32 %f154, %f146, %f153; add.f32 %f155, %f147, 0f41355DC0; mov.f32 %f156, 0f41E6BD60; fma.rn.f32 %f157, %f155, %f147, %f156; mov.f32 %f158, 0f419D92C8; fma.rn.f32 %f159, %f157, %f147, %f158; rcp.rn.f32 %f160, %f159; fma.rn.f32 %f161, %f154, %f160, %f146; mov.f32 %f162, 0f3FC90FDB; sub.f32 %f163, %f162, %f161; setp.gt.f32 %p49, %f27, %f26; selp.f32 %f164, %f163, %f161, %p49; mov.f32 %f165, 0f40490FDB; sub.f32 %f166, %f165, %f164; setp.lt.s32 %p50, %r39, 0; selp.f32 %f167, %f166, %f164, %p50; mov.b32 %r114, %f167; or.b32 %r115, %r114, %r40; mov.b32 %f168, %r115; add.f32 %f169, %f26, %f27; setp.gtu.f32 %p51, %f169, 0f7F800000; selp.f32 %f289, %f169, %f168, %p51; BB0_41: add.f32 %f170, %f289, %f289; setp.eq.f32 %p52, %f25, 0f00000000; selp.f32 %f171, 0f3F800000, 0f3F000000, %p52; fma.rn.f32 %f290, %f171, %f170, %f290; BB0_42: @!%p2 bra BB0_56; bra.uni BB0_43; BB0_43: setp.lt.s32 %p53, %r21, 0; and.pred %p55, %p53, %p10; @%p55 bra BB0_56; @%p10 bra BB0_46; rem.s32 %r123, %r21, %r59; add.s32 %r124, %r123, %r59; rem.s32 %r167, %r124, %r59; bra.uni BB0_47; BB0_46: mov.u32 %r125, 0; max.s32 %r167, %r21, 
%r125; BB0_47: @%p9 bra BB0_49; rem.s32 %r126, %r14, %r58; add.s32 %r127, %r126, %r58; rem.s32 %r168, %r127, %r58; bra.uni BB0_50; BB0_49: mov.u32 %r128, 0; max.s32 %r168, %r14, %r128; BB0_50: add.s32 %r129, %r167, %r4; mad.lo.s32 %r130, %r129, %r58, %r168; mul.wide.s32 %rd38, %r130, 4; add.s64 %rd39, %rd3, %rd38; add.s64 %rd40, %rd2, %rd38; add.s64 %rd41, %rd1, %rd38; ld.global.nc.f32 %f172, [%rd39]; ld.global.nc.f32 %f173, [%rd40]; mul.f32 %f174, %f173, %f173; fma.rn.f32 %f175, %f172, %f172, %f174; ld.global.nc.f32 %f176, [%rd41]; fma.rn.f32 %f34, %f176, %f176, %f175; mul.f32 %f177, %f12, %f14; mul.f32 %f178, %f11, %f15; sub.f32 %f179, %f178, %f177; mul.f32 %f180, %f10, %f15; mul.f32 %f181, %f12, %f13; sub.f32 %f182, %f181, %f180; mul.f32 %f183, %f11, %f13; mul.f32 %f184, %f10, %f14; sub.f32 %f185, %f184, %f183; mul.f32 %f186, %f2, %f182; fma.rn.f32 %f187, %f1, %f179, %f186; fma.rn.f32 %f188, %f3, %f185, %f187; mul.f32 %f189, %f2, %f11; fma.rn.f32 %f190, %f1, %f10, %f189; fma.rn.f32 %f191, %f3, %f12, %f190; add.f32 %f192, %f191, 0f3F800000; mul.f32 %f193, %f2, %f14; fma.rn.f32 %f194, %f1, %f13, %f193; fma.rn.f32 %f195, %f3, %f15, %f194; add.f32 %f196, %f192, %f195; mul.f32 %f197, %f11, %f14; fma.rn.f32 %f198, %f10, %f13, %f197; fma.rn.f32 %f199, %f12, %f15, %f198; add.f32 %f200, %f199, %f196; abs.f32 %f35, %f200; abs.f32 %f36, %f188; setp.eq.f32 %p58, %f35, 0f00000000; setp.eq.f32 %p59, %f36, 0f00000000; and.pred %p60, %p58, %p59; mov.b32 %r47, %f200; mov.b32 %r131, %f188; and.b32 %r48, %r131, -2147483648; @%p60 bra BB0_54; bra.uni BB0_51; BB0_54: shr.s32 %r138, %r47, 31; and.b32 %r139, %r138, 1078530011; or.b32 %r140, %r139, %r48; mov.b32 %f291, %r140; bra.uni BB0_55; BB0_51: setp.eq.f32 %p61, %f35, 0f7F800000; setp.eq.f32 %p62, %f36, 0f7F800000; and.pred %p63, %p61, %p62; @%p63 bra BB0_53; bra.uni BB0_52; BB0_53: shr.s32 %r134, %r47, 31; and.b32 %r135, %r134, 13483017; add.s32 %r136, %r135, 1061752795; or.b32 %r137, %r136, %r48; mov.b32 %f291, %r137; bra.uni 
BB0_55; BB0_52: max.f32 %f201, %f36, %f35; min.f32 %f202, %f36, %f35; div.rn.f32 %f203, %f202, %f201; mul.rn.f32 %f204, %f203, %f203; mov.f32 %f205, 0fC0B59883; mov.f32 %f206, 0fBF52C7EA; fma.rn.f32 %f207, %f204, %f206, %f205; mov.f32 %f208, 0fC0D21907; fma.rn.f32 %f209, %f207, %f204, %f208; mul.f32 %f210, %f204, %f209; mul.f32 %f211, %f203, %f210; add.f32 %f212, %f204, 0f41355DC0; mov.f32 %f213, 0f41E6BD60; fma.rn.f32 %f214, %f212, %f204, %f213; mov.f32 %f215, 0f419D92C8; fma.rn.f32 %f216, %f214, %f204, %f215; rcp.rn.f32 %f217, %f216; fma.rn.f32 %f218, %f211, %f217, %f203; mov.f32 %f219, 0f3FC90FDB; sub.f32 %f220, %f219, %f218; setp.gt.f32 %p64, %f36, %f35; selp.f32 %f221, %f220, %f218, %p64; mov.f32 %f222, 0f40490FDB; sub.f32 %f223, %f222, %f221; setp.lt.s32 %p65, %r47, 0; selp.f32 %f224, %f223, %f221, %p65; mov.b32 %r132, %f224; or.b32 %r133, %r132, %r48; mov.b32 %f225, %r133; add.f32 %f226, %f35, %f36; setp.gtu.f32 %p66, %f226, 0f7F800000; selp.f32 %f291, %f226, %f225, %p66; BB0_55: add.f32 %f227, %f291, %f291; setp.eq.f32 %p67, %f34, 0f00000000; selp.f32 %f228, 0f3F800000, 0f3F000000, %p67; fma.rn.f32 %f290, %f228, %f227, %f290; BB0_56: @!%p1 bra BB0_70; bra.uni BB0_57; BB0_57: setp.lt.s32 %p68, %r21, 0; and.pred %p70, %p68, %p10; @%p70 bra BB0_70; @%p10 bra BB0_60; rem.s32 %r141, %r21, %r59; add.s32 %r142, %r141, %r59; rem.s32 %r169, %r142, %r59; bra.uni BB0_61; BB0_60: mov.u32 %r143, 0; max.s32 %r169, %r21, %r143; BB0_61: add.s32 %r52, %r169, %r4; @%p9 bra BB0_63; rem.s32 %r144, %r6, %r58; add.s32 %r145, %r144, %r58; rem.s32 %r170, %r145, %r58; bra.uni BB0_64; BB0_63: add.s32 %r146, %r58, -1; min.s32 %r170, %r6, %r146; BB0_64: mad.lo.s32 %r147, %r52, %r58, %r170; mul.wide.s32 %rd42, %r147, 4; add.s64 %rd43, %rd3, %rd42; add.s64 %rd44, %rd2, %rd42; add.s64 %rd45, %rd1, %rd42; ld.global.nc.f32 %f229, [%rd43]; ld.global.nc.f32 %f230, [%rd44]; mul.f32 %f231, %f230, %f230; fma.rn.f32 %f232, %f229, %f229, %f231; ld.global.nc.f32 %f233, [%rd45]; fma.rn.f32 %f43, 
%f233, %f233, %f232; mul.f32 %f234, %f5, %f15; mul.f32 %f235, %f6, %f14; sub.f32 %f236, %f235, %f234; mul.f32 %f237, %f6, %f13; mul.f32 %f238, %f4, %f15; sub.f32 %f239, %f238, %f237; mul.f32 %f240, %f4, %f14; mul.f32 %f241, %f5, %f13; sub.f32 %f242, %f241, %f240; mul.f32 %f243, %f2, %f239; fma.rn.f32 %f244, %f1, %f236, %f243; fma.rn.f32 %f245, %f3, %f242, %f244; mul.f32 %f246, %f2, %f14; fma.rn.f32 %f247, %f1, %f13, %f246; fma.rn.f32 %f248, %f3, %f15, %f247; add.f32 %f249, %f248, 0f3F800000; mul.f32 %f250, %f2, %f5; fma.rn.f32 %f251, %f1, %f4, %f250; fma.rn.f32 %f252, %f3, %f6, %f251; add.f32 %f253, %f252, %f249; mul.f32 %f254, %f5, %f14; fma.rn.f32 %f255, %f4, %f13, %f254; fma.rn.f32 %f256, %f6, %f15, %f255; add.f32 %f257, %f256, %f253; abs.f32 %f44, %f257; abs.f32 %f45, %f245; setp.eq.f32 %p73, %f44, 0f00000000; setp.eq.f32 %p74, %f45, 0f00000000; and.pred %p75, %p73, %p74; mov.b32 %r56, %f257; mov.b32 %r148, %f245; and.b32 %r57, %r148, -2147483648; @%p75 bra BB0_68; bra.uni BB0_65; BB0_68: shr.s32 %r155, %r56, 31; and.b32 %r156, %r155, 1078530011; or.b32 %r157, %r156, %r57; mov.b32 %f293, %r157; bra.uni BB0_69; BB0_65: setp.eq.f32 %p76, %f44, 0f7F800000; setp.eq.f32 %p77, %f45, 0f7F800000; and.pred %p78, %p76, %p77; @%p78 bra BB0_67; bra.uni BB0_66; BB0_67: shr.s32 %r151, %r56, 31; and.b32 %r152, %r151, 13483017; add.s32 %r153, %r152, 1061752795; or.b32 %r154, %r153, %r57; mov.b32 %f293, %r154; bra.uni BB0_69; BB0_66: max.f32 %f258, %f45, %f44; min.f32 %f259, %f45, %f44; div.rn.f32 %f260, %f259, %f258; mul.rn.f32 %f261, %f260, %f260; mov.f32 %f262, 0fC0B59883; mov.f32 %f263, 0fBF52C7EA; fma.rn.f32 %f264, %f261, %f263, %f262; mov.f32 %f265, 0fC0D21907; fma.rn.f32 %f266, %f264, %f261, %f265; mul.f32 %f267, %f261, %f266; mul.f32 %f268, %f260, %f267; add.f32 %f269, %f261, 0f41355DC0; mov.f32 %f270, 0f41E6BD60; fma.rn.f32 %f271, %f269, %f261, %f270; mov.f32 %f272, 0f419D92C8; fma.rn.f32 %f273, %f271, %f261, %f272; rcp.rn.f32 %f274, %f273; fma.rn.f32 %f275, %f268, 
// Accumulate the uniaxial magnetocrystalline anisotropy field into B.
//
// For every cell with linear index below N the kernel adds
//
//     (2*K1/Ms * (m.u) + 4*K2/Ms * (m.u)^3) * u
//
// to (Bx, By, Bz), where u is the normalized anisotropy axis and m the
// magnetization read from (mx, my, mz).
//
// Each material parameter is passed as an (array, multiplier) pair and is
// resolved per cell through amul / vmul / inv_Msat from the included headers.
// NOTE(review): in the compiled PTX a null array pointer falls back to the
// multiplier alone, and a zero Ms or zero-length axis yields 0 instead of a
// division by zero -- confirm against amul.h / float3.h.
//
// Reference: http://www.southampton.ac.uk/~fangohr/software/oxs_uniaxial4.html
extern "C" __global__ void
adduniaxialanisotropy2(float* __restrict__ Bx, float* __restrict__ By, float* __restrict__ Bz,
                       float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz,
                       float* __restrict__ Ms_, float Ms_mul,
                       float* __restrict__ K1_, float K1_mul,
                       float* __restrict__ K2_, float K2_mul,
                       float* __restrict__ ux_, float ux_mul,
                       float* __restrict__ uy_, float uy_mul,
                       float* __restrict__ uz_, float uz_mul,
                       int N) {

    // One thread per cell: blocks are laid out on a 2D grid, threads on a 1D block.
    int cell = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    if (cell >= N) {
        return; // surplus thread past the end of the arrays
    }

    // Anisotropy constants, pre-divided by the saturation magnetization.
    float invMsat = inv_Msat(Ms_, Ms_mul, cell);
    float k1 = amul(K1_, K1_mul, cell) * invMsat;
    float k2 = amul(K2_, K2_mul, cell) * invMsat;

    // Unit anisotropy axis and the projection of m onto it.
    float3 axis = normalized(vmul(ux_, uy_, uz_, ux_mul, uy_mul, uz_mul, cell));
    float3 mag = make_float3(mx[cell], my[cell], mz[cell]);
    float proj = dot(mag, axis);

    // First- and second-order scalar coefficients; the multiplication order
    // is kept left-to-right so rounding matches the single-expression form.
    float firstOrder = 2.0f * k1 * proj;
    float secondOrder = 4.0f * k2 * pow3(proj);
    float3 field = firstOrder * axis + secondOrder * axis;

    Bx[cell] += field.x;
    By[cell] += field.y;
    Bz[cell] += field.z;
}
AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for adduniaxialanisotropy2 kernel var adduniaxialanisotropy2_code cu.Function // Stores the arguments for adduniaxialanisotropy2 kernel invocation type adduniaxialanisotropy2_args_t struct { arg_Bx unsafe.Pointer arg_By unsafe.Pointer arg_Bz unsafe.Pointer arg_mx unsafe.Pointer arg_my unsafe.Pointer arg_mz unsafe.Pointer arg_Ms_ unsafe.Pointer arg_Ms_mul float32 arg_K1_ unsafe.Pointer arg_K1_mul float32 arg_K2_ unsafe.Pointer arg_K2_mul float32 arg_ux_ unsafe.Pointer arg_ux_mul float32 arg_uy_ unsafe.Pointer arg_uy_mul float32 arg_uz_ unsafe.Pointer arg_uz_mul float32 arg_N int argptr [19]unsafe.Pointer sync.Mutex } // Stores the arguments for adduniaxialanisotropy2 kernel invocation var adduniaxialanisotropy2_args adduniaxialanisotropy2_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. adduniaxialanisotropy2_args.argptr[0] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_Bx) adduniaxialanisotropy2_args.argptr[1] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_By) adduniaxialanisotropy2_args.argptr[2] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_Bz) adduniaxialanisotropy2_args.argptr[3] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_mx) adduniaxialanisotropy2_args.argptr[4] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_my) adduniaxialanisotropy2_args.argptr[5] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_mz) adduniaxialanisotropy2_args.argptr[6] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_Ms_) adduniaxialanisotropy2_args.argptr[7] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_Ms_mul) adduniaxialanisotropy2_args.argptr[8] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_K1_) adduniaxialanisotropy2_args.argptr[9] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_K1_mul) adduniaxialanisotropy2_args.argptr[10] = 
unsafe.Pointer(&adduniaxialanisotropy2_args.arg_K2_) adduniaxialanisotropy2_args.argptr[11] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_K2_mul) adduniaxialanisotropy2_args.argptr[12] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_ux_) adduniaxialanisotropy2_args.argptr[13] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_ux_mul) adduniaxialanisotropy2_args.argptr[14] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_uy_) adduniaxialanisotropy2_args.argptr[15] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_uy_mul) adduniaxialanisotropy2_args.argptr[16] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_uz_) adduniaxialanisotropy2_args.argptr[17] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_uz_mul) adduniaxialanisotropy2_args.argptr[18] = unsafe.Pointer(&adduniaxialanisotropy2_args.arg_N) } // Wrapper for adduniaxialanisotropy2 CUDA kernel, asynchronous. func k_adduniaxialanisotropy2_async(Bx unsafe.Pointer, By unsafe.Pointer, Bz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, K1_ unsafe.Pointer, K1_mul float32, K2_ unsafe.Pointer, K2_mul float32, ux_ unsafe.Pointer, ux_mul float32, uy_ unsafe.Pointer, uy_mul float32, uz_ unsafe.Pointer, uz_mul float32, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("adduniaxialanisotropy2") } adduniaxialanisotropy2_args.Lock() defer adduniaxialanisotropy2_args.Unlock() if adduniaxialanisotropy2_code == 0 { adduniaxialanisotropy2_code = fatbinLoad(adduniaxialanisotropy2_map, "adduniaxialanisotropy2") } adduniaxialanisotropy2_args.arg_Bx = Bx adduniaxialanisotropy2_args.arg_By = By adduniaxialanisotropy2_args.arg_Bz = Bz adduniaxialanisotropy2_args.arg_mx = mx adduniaxialanisotropy2_args.arg_my = my adduniaxialanisotropy2_args.arg_mz = mz adduniaxialanisotropy2_args.arg_Ms_ = Ms_ adduniaxialanisotropy2_args.arg_Ms_mul = Ms_mul adduniaxialanisotropy2_args.arg_K1_ = K1_ adduniaxialanisotropy2_args.arg_K1_mul = K1_mul 
adduniaxialanisotropy2_args.arg_K2_ = K2_ adduniaxialanisotropy2_args.arg_K2_mul = K2_mul adduniaxialanisotropy2_args.arg_ux_ = ux_ adduniaxialanisotropy2_args.arg_ux_mul = ux_mul adduniaxialanisotropy2_args.arg_uy_ = uy_ adduniaxialanisotropy2_args.arg_uy_mul = uy_mul adduniaxialanisotropy2_args.arg_uz_ = uz_ adduniaxialanisotropy2_args.arg_uz_mul = uz_mul adduniaxialanisotropy2_args.arg_N = N args := adduniaxialanisotropy2_args.argptr[:] cu.LaunchKernel(adduniaxialanisotropy2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("adduniaxialanisotropy2") } } // maps compute capability on PTX code for adduniaxialanisotropy2 kernel. var adduniaxialanisotropy2_map = map[int]string{0: "", 30: adduniaxialanisotropy2_ptx_30, 32: adduniaxialanisotropy2_ptx_32, 35: adduniaxialanisotropy2_ptx_35, 37: adduniaxialanisotropy2_ptx_37, 50: adduniaxialanisotropy2_ptx_50, 52: adduniaxialanisotropy2_ptx_52, 53: adduniaxialanisotropy2_ptx_53, 60: adduniaxialanisotropy2_ptx_60, 61: adduniaxialanisotropy2_ptx_61, 62: adduniaxialanisotropy2_ptx_62, 70: adduniaxialanisotropy2_ptx_70, 72: adduniaxialanisotropy2_ptx_72, 75: adduniaxialanisotropy2_ptx_75} // adduniaxialanisotropy2 PTX code for various compute capabilities. 
const ( adduniaxialanisotropy2_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, .param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param .f32 adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; 
ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; ld.global.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.f32 %f29, [%rd21]; mul.f32 %f66, %f29, %f66; BB0_7: mul.f32 %f31, %f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; mov.f32 %f67, 0f00000000; setp.eq.f32 %p5, %f7, 0f00000000; @%p5 bra BB0_9; rcp.rn.f32 %f67, %f7; BB0_9: setp.eq.s64 %p6, %rd7, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; BB0_11: mul.f32 %f12, %f64, %f67; mul.f32 %f13, %f65, %f67; mul.f32 %f14, %f66, %f67; setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra BB0_13; rcp.rn.f32 %f69, %f68; BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; 
cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 %rd37, %rd36, %rd32; ld.global.f32 %f38, [%rd33]; ld.global.f32 %f39, [%rd35]; mul.f32 %f40, %f13, %f39; fma.rn.f32 %f41, %f12, %f38, %f40; ld.global.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f14, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f12, %f51; mul.f32 %f53, %f13, %f51; mul.f32 %f54, %f14, %f51; fma.rn.f32 %f55, %f12, %f46, %f52; fma.rn.f32 %f56, %f13, %f46, %f53; fma.rn.f32 %f57, %f14, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; BB0_18: ret; } ` adduniaxialanisotropy2_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, .param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 
adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param .f32 adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; ld.global.nc.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, 
%rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; mul.f32 %f66, %f29, %f66; BB0_7: mul.f32 %f31, %f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; mov.f32 %f67, 0f00000000; setp.eq.f32 %p5, %f7, 0f00000000; @%p5 bra BB0_9; rcp.rn.f32 %f67, %f7; BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 %f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; BB0_11: setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra BB0_13; rcp.rn.f32 %f69, %f68; BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 %rd37, %rd36, %rd32; ld.global.nc.f32 %f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; fma.rn.f32 %f55, %f10, %f46, %f52; fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, 
%f58, %f55; st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; BB0_18: ret; } ` adduniaxialanisotropy2_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, .param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param .f32 adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, 
[adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; ld.global.nc.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; mul.f32 %f66, %f29, %f66; BB0_7: mul.f32 %f31, %f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; mov.f32 %f67, 0f00000000; setp.eq.f32 %p5, %f7, 0f00000000; @%p5 bra BB0_9; rcp.rn.f32 %f67, %f7; BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 %f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; BB0_11: setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra BB0_13; rcp.rn.f32 %f69, %f68; BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; 
add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 %rd37, %rd36, %rd32; ld.global.nc.f32 %f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; fma.rn.f32 %f55, %f10, %f46, %f52; fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; BB0_18: ret; } ` adduniaxialanisotropy2_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 
adduniaxialanisotropy2_param_6, .param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param .f32 adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 
%rd15, %rd13, %rd14; ld.global.nc.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; mul.f32 %f66, %f29, %f66; BB0_7: mul.f32 %f31, %f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; mov.f32 %f67, 0f00000000; setp.eq.f32 %p5, %f7, 0f00000000; @%p5 bra BB0_9; rcp.rn.f32 %f67, %f7; BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 %f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; BB0_11: setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra BB0_13; rcp.rn.f32 %f69, %f68; BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 %rd37, %rd36, %rd32; ld.global.nc.f32 %f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, 
%f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; fma.rn.f32 %f55, %f10, %f46, %f52; fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; BB0_18: ret; } ` adduniaxialanisotropy2_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, .param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param .f32 adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 
%rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; ld.global.nc.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; mul.f32 %f66, %f29, %f66; BB0_7: mul.f32 %f31, %f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; mov.f32 %f67, 0f00000000; setp.eq.f32 %p5, %f7, 0f00000000; @%p5 bra BB0_9; rcp.rn.f32 %f67, %f7; BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 %f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra 
BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; BB0_11: setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra BB0_13; rcp.rn.f32 %f69, %f68; BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 %rd37, %rd36, %rd32; ld.global.nc.f32 %f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; fma.rn.f32 %f55, %f10, %f46, %f52; fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; BB0_18: ret; } ` adduniaxialanisotropy2_ptx_52 = ` .version 6.5 .target sm_52 
.address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, .param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param .f32 adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, 
[adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; ld.global.nc.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; mul.f32 %f66, %f29, %f66; BB0_7: mul.f32 %f31, %f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; mov.f32 %f67, 0f00000000; setp.eq.f32 %p5, %f7, 0f00000000; @%p5 bra BB0_9; rcp.rn.f32 %f67, %f7; BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 %f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; BB0_11: setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra BB0_13; rcp.rn.f32 %f69, %f68; BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, 
%rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 %rd37, %rd36, %rd32; ld.global.nc.f32 %f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; fma.rn.f32 %f55, %f10, %f46, %f52; fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; BB0_18: ret; } ` adduniaxialanisotropy2_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, .param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 
adduniaxialanisotropy2_param_16, .param .f32 adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; ld.global.nc.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; 
mul.f32 %f66, %f29, %f66; BB0_7: mul.f32 %f31, %f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; mov.f32 %f67, 0f00000000; setp.eq.f32 %p5, %f7, 0f00000000; @%p5 bra BB0_9; rcp.rn.f32 %f67, %f7; BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 %f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; BB0_11: setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra BB0_13; rcp.rn.f32 %f69, %f68; BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 %rd37, %rd36, %rd32; ld.global.nc.f32 %f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; fma.rn.f32 %f55, %f10, %f46, %f52; fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; st.global.f32 [%rd39], %f59; 
cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; BB0_18: ret; } ` adduniaxialanisotropy2_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, .param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param .f32 adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, 
[adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; ld.global.nc.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; mul.f32 %f66, %f29, %f66; BB0_7: mul.f32 %f31, %f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; mov.f32 %f67, 0f00000000; setp.eq.f32 %p5, %f7, 0f00000000; @%p5 bra BB0_9; rcp.rn.f32 %f67, %f7; BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 %f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; BB0_11: setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra BB0_13; rcp.rn.f32 %f69, %f68; BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f36, 
[%rd27]; mul.f32 %f70, %f36, %f70; BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 %rd37, %rd36, %rd32; ld.global.nc.f32 %f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; fma.rn.f32 %f55, %f10, %f46, %f52; fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; BB0_18: ret; } ` adduniaxialanisotropy2_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, .param .f32 
adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param .f32 adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; ld.global.nc.f32 %f27, 
[%rd15]; mul.f32 %f64, %f27, %f64; BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; mul.f32 %f66, %f29, %f66; BB0_7: mul.f32 %f31, %f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; mov.f32 %f67, 0f00000000; setp.eq.f32 %p5, %f7, 0f00000000; @%p5 bra BB0_9; rcp.rn.f32 %f67, %f7; BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 %f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; BB0_11: setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra BB0_13; rcp.rn.f32 %f69, %f68; BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 %rd37, %rd36, %rd32; ld.global.nc.f32 %f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 
%f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; fma.rn.f32 %f55, %f10, %f46, %f52; fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; BB0_18: ret; } ` adduniaxialanisotropy2_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, .param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param .f32 adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; 
ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; ld.global.nc.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; mul.f32 %f66, %f29, %f66; BB0_7: mul.f32 %f31, %f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; mov.f32 %f67, 0f00000000; setp.eq.f32 %p5, %f7, 0f00000000; @%p5 bra BB0_9; rcp.rn.f32 %f67, %f7; BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 %f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd22, %rd7; 
mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; BB0_11: setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra BB0_13; rcp.rn.f32 %f69, %f68; BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 %rd37, %rd36, %rd32; ld.global.nc.f32 %f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; fma.rn.f32 %f55, %f10, %f46, %f52; fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; BB0_18: ret; } ` adduniaxialanisotropy2_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl adduniaxialanisotropy2 
.visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, .param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param .f32 adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, 
%nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; ld.global.nc.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; mul.f32 %f66, %f29, %f66; BB0_7: mul.f32 %f31, %f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; mov.f32 %f67, 0f00000000; setp.eq.f32 %p5, %f7, 0f00000000; @%p5 bra BB0_9; rcp.rn.f32 %f67, %f7; BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 %f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; BB0_11: setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra BB0_13; rcp.rn.f32 %f69, %f68; BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 
%rd37, %rd36, %rd32; ld.global.nc.f32 %f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; fma.rn.f32 %f55, %f10, %f46, %f52; fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; BB0_18: ret; } ` adduniaxialanisotropy2_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, .param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param .f32 
adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, [adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; ld.global.nc.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; mul.f32 %f66, %f29, %f66; BB0_7: mul.f32 %f31, 
%f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; mov.f32 %f67, 0f00000000; setp.eq.f32 %p5, %f7, 0f00000000; @%p5 bra BB0_9; rcp.rn.f32 %f67, %f7; BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 %f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; BB0_11: setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra BB0_13; rcp.rn.f32 %f69, %f68; BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; BB0_15: setp.eq.s64 %p9, %rd9, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 %rd37, %rd36, %rd32; ld.global.nc.f32 %f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; fma.rn.f32 %f55, %f10, %f46, %f52; fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, 
%rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; BB0_18: ret; } ` adduniaxialanisotropy2_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl adduniaxialanisotropy2 .visible .entry adduniaxialanisotropy2( .param .u64 adduniaxialanisotropy2_param_0, .param .u64 adduniaxialanisotropy2_param_1, .param .u64 adduniaxialanisotropy2_param_2, .param .u64 adduniaxialanisotropy2_param_3, .param .u64 adduniaxialanisotropy2_param_4, .param .u64 adduniaxialanisotropy2_param_5, .param .u64 adduniaxialanisotropy2_param_6, .param .f32 adduniaxialanisotropy2_param_7, .param .u64 adduniaxialanisotropy2_param_8, .param .f32 adduniaxialanisotropy2_param_9, .param .u64 adduniaxialanisotropy2_param_10, .param .f32 adduniaxialanisotropy2_param_11, .param .u64 adduniaxialanisotropy2_param_12, .param .f32 adduniaxialanisotropy2_param_13, .param .u64 adduniaxialanisotropy2_param_14, .param .f32 adduniaxialanisotropy2_param_15, .param .u64 adduniaxialanisotropy2_param_16, .param .f32 adduniaxialanisotropy2_param_17, .param .u32 adduniaxialanisotropy2_param_18 ) { .reg .pred %p<10>; .reg .f32 %f<72>; .reg .b32 %r<9>; .reg .b64 %rd<44>; ld.param.u64 %rd1, [adduniaxialanisotropy2_param_0]; ld.param.u64 %rd2, [adduniaxialanisotropy2_param_1]; ld.param.u64 %rd3, [adduniaxialanisotropy2_param_2]; ld.param.u64 %rd4, [adduniaxialanisotropy2_param_3]; ld.param.u64 %rd5, [adduniaxialanisotropy2_param_4]; ld.param.u64 %rd6, [adduniaxialanisotropy2_param_5]; ld.param.u64 %rd7, [adduniaxialanisotropy2_param_6]; ld.param.f32 %f68, [adduniaxialanisotropy2_param_7]; ld.param.u64 %rd8, [adduniaxialanisotropy2_param_8]; ld.param.f32 %f70, [adduniaxialanisotropy2_param_9]; ld.param.u64 %rd9, [adduniaxialanisotropy2_param_10]; ld.param.f32 %f71, [adduniaxialanisotropy2_param_11]; ld.param.u64 %rd10, 
[adduniaxialanisotropy2_param_12]; ld.param.f32 %f64, [adduniaxialanisotropy2_param_13]; ld.param.u64 %rd11, [adduniaxialanisotropy2_param_14]; ld.param.f32 %f65, [adduniaxialanisotropy2_param_15]; ld.param.u64 %rd12, [adduniaxialanisotropy2_param_16]; ld.param.f32 %f66, [adduniaxialanisotropy2_param_17]; ld.param.u32 %r2, [adduniaxialanisotropy2_param_18]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_18; setp.eq.s64 %p2, %rd10, 0; @%p2 bra BB0_3; cvta.to.global.u64 %rd13, %rd10; mul.wide.s32 %rd14, %r1, 4; add.s64 %rd15, %rd13, %rd14; ld.global.nc.f32 %f27, [%rd15]; mul.f32 %f64, %f27, %f64; BB0_3: setp.eq.s64 %p3, %rd11, 0; @%p3 bra BB0_5; cvta.to.global.u64 %rd16, %rd11; mul.wide.s32 %rd17, %r1, 4; add.s64 %rd18, %rd16, %rd17; ld.global.nc.f32 %f28, [%rd18]; mul.f32 %f65, %f28, %f65; BB0_5: setp.eq.s64 %p4, %rd12, 0; @%p4 bra BB0_7; cvta.to.global.u64 %rd19, %rd12; mul.wide.s32 %rd20, %r1, 4; add.s64 %rd21, %rd19, %rd20; ld.global.nc.f32 %f29, [%rd21]; mul.f32 %f66, %f29, %f66; BB0_7: mul.f32 %f31, %f65, %f65; fma.rn.f32 %f32, %f64, %f64, %f31; fma.rn.f32 %f33, %f66, %f66, %f32; sqrt.rn.f32 %f7, %f33; mov.f32 %f67, 0f00000000; setp.eq.f32 %p5, %f7, 0f00000000; @%p5 bra BB0_9; rcp.rn.f32 %f67, %f7; BB0_9: mul.f32 %f10, %f64, %f67; mul.f32 %f11, %f65, %f67; mul.f32 %f12, %f66, %f67; setp.eq.s64 %p6, %rd7, 0; @%p6 bra BB0_11; cvta.to.global.u64 %rd22, %rd7; mul.wide.s32 %rd23, %r1, 4; add.s64 %rd24, %rd22, %rd23; ld.global.nc.f32 %f34, [%rd24]; mul.f32 %f68, %f34, %f68; BB0_11: setp.eq.f32 %p7, %f68, 0f00000000; mov.f32 %f69, 0f00000000; @%p7 bra BB0_13; rcp.rn.f32 %f69, %f68; BB0_13: setp.eq.s64 %p8, %rd8, 0; @%p8 bra BB0_15; cvta.to.global.u64 %rd25, %rd8; mul.wide.s32 %rd26, %r1, 4; add.s64 %rd27, %rd25, %rd26; ld.global.nc.f32 %f36, [%rd27]; mul.f32 %f70, %f36, %f70; BB0_15: setp.eq.s64 %p9, 
%rd9, 0; @%p9 bra BB0_17; cvta.to.global.u64 %rd28, %rd9; mul.wide.s32 %rd29, %r1, 4; add.s64 %rd30, %rd28, %rd29; ld.global.nc.f32 %f37, [%rd30]; mul.f32 %f71, %f37, %f71; BB0_17: cvta.to.global.u64 %rd31, %rd4; mul.wide.s32 %rd32, %r1, 4; add.s64 %rd33, %rd31, %rd32; cvta.to.global.u64 %rd34, %rd5; add.s64 %rd35, %rd34, %rd32; cvta.to.global.u64 %rd36, %rd6; add.s64 %rd37, %rd36, %rd32; ld.global.nc.f32 %f38, [%rd33]; ld.global.nc.f32 %f39, [%rd35]; mul.f32 %f40, %f11, %f39; fma.rn.f32 %f41, %f10, %f38, %f40; ld.global.nc.f32 %f42, [%rd37]; fma.rn.f32 %f43, %f12, %f42, %f41; mul.f32 %f44, %f69, %f70; fma.rn.f32 %f45, %f69, %f70, %f44; mul.f32 %f46, %f45, %f43; mul.f32 %f47, %f69, %f71; mul.f32 %f48, %f47, 0f40800000; mul.f32 %f49, %f43, %f43; mul.f32 %f50, %f43, %f49; mul.f32 %f51, %f48, %f50; mul.f32 %f52, %f10, %f51; mul.f32 %f53, %f11, %f51; mul.f32 %f54, %f12, %f51; fma.rn.f32 %f55, %f10, %f46, %f52; fma.rn.f32 %f56, %f11, %f46, %f53; fma.rn.f32 %f57, %f12, %f46, %f54; cvta.to.global.u64 %rd38, %rd1; add.s64 %rd39, %rd38, %rd32; ld.global.f32 %f58, [%rd39]; add.f32 %f59, %f58, %f55; st.global.f32 [%rd39], %f59; cvta.to.global.u64 %rd40, %rd2; add.s64 %rd41, %rd40, %rd32; ld.global.f32 %f60, [%rd41]; add.f32 %f61, %f60, %f56; st.global.f32 [%rd41], %f61; cvta.to.global.u64 %rd42, %rd3; add.s64 %rd43, %rd42, %rd32; ld.global.f32 %f62, [%rd43]; add.f32 %f63, %f62, %f57; st.global.f32 [%rd43], %f63; BB0_18: ret; } ` ) mumax3-3.10/cuda/util.go000066400000000000000000000024471371432437400151060ustar00rootroot00000000000000package cuda import ( "fmt" "github.com/mumax/3/cuda/cu" ) // CUDA Launch parameters. // there might be better choices for recent hardware, // but it barely makes a difference in the end. const ( BlockSize = 512 TileX, TileY = 32, 32 MaxGridSize = 65535 ) // cuda launch configuration type config struct { Grid, Block cu.Dim3 } // Make a 1D kernel launch configuration suited for N threads. 
func make1DConf(N int) *config { bl := cu.Dim3{X: BlockSize, Y: 1, Z: 1} n2 := divUp(N, BlockSize) // N2 blocks left nx := divUp(n2, MaxGridSize) ny := divUp(n2, nx) gr := cu.Dim3{X: nx, Y: ny, Z: 1} return &config{gr, bl} } // Make a 3D kernel launch configuration suited for N threads. func make3DConf(N [3]int) *config { bl := cu.Dim3{X: TileX, Y: TileY, Z: 1} nx := divUp(N[X], TileX) ny := divUp(N[Y], TileY) gr := cu.Dim3{X: nx, Y: ny, Z: N[Z]} return &config{gr, bl} } // integer minimum func iMin(a, b int) int { if a < b { return a } return b } // Integer division rounded up. func divUp(x, y int) int { return ((x - 1) / y) + 1 } const ( X = 0 Y = 1 Z = 2 ) func checkSize(a interface { Size() [3]int }, b ...interface { Size() [3]int }) { sa := a.Size() for _, b := range b { if b.Size() != sa { panic(fmt.Sprintf("size mismatch: %v != %v", sa, b.Size())) } } } mumax3-3.10/cuda/zeromask.cu000066400000000000000000000005611371432437400157610ustar00rootroot00000000000000#include #include "float3.h" // set dst to zero in cells where mask != 0 extern "C" __global__ void zeromask(float* __restrict__ dst, float* maskLUT, uint8_t* regions, int N) { int i = ( blockIdx.y*gridDim.x + blockIdx.x ) * blockDim.x + threadIdx.x; if (i < N) { if (maskLUT[regions[i]] != 0) { dst[i] = 0; } } } mumax3-3.10/cuda/zeromask.go000066400000000000000000000005071371432437400157570ustar00rootroot00000000000000package cuda import ( "unsafe" "github.com/mumax/3/data" ) // Sets vector dst to zero where mask != 0. func ZeroMask(dst *data.Slice, mask LUTPtr, regions *Bytes) { N := dst.Len() cfg := make1DConf(N) for c := 0; c < dst.NComp(); c++ { k_zeromask_async(dst.DevPtr(c), unsafe.Pointer(mask), regions.Ptr, N, cfg) } } mumax3-3.10/cuda/zeromask_wrapper.go000066400000000000000000000434621371432437400175260ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. 
*/ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for zeromask kernel var zeromask_code cu.Function // Stores the arguments for zeromask kernel invocation type zeromask_args_t struct { arg_dst unsafe.Pointer arg_maskLUT unsafe.Pointer arg_regions unsafe.Pointer arg_N int argptr [4]unsafe.Pointer sync.Mutex } // Stores the arguments for zeromask kernel invocation var zeromask_args zeromask_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. zeromask_args.argptr[0] = unsafe.Pointer(&zeromask_args.arg_dst) zeromask_args.argptr[1] = unsafe.Pointer(&zeromask_args.arg_maskLUT) zeromask_args.argptr[2] = unsafe.Pointer(&zeromask_args.arg_regions) zeromask_args.argptr[3] = unsafe.Pointer(&zeromask_args.arg_N) } // Wrapper for zeromask CUDA kernel, asynchronous. func k_zeromask_async(dst unsafe.Pointer, maskLUT unsafe.Pointer, regions unsafe.Pointer, N int, cfg *config) { if Synchronous { // debug Sync() timer.Start("zeromask") } zeromask_args.Lock() defer zeromask_args.Unlock() if zeromask_code == 0 { zeromask_code = fatbinLoad(zeromask_map, "zeromask") } zeromask_args.arg_dst = dst zeromask_args.arg_maskLUT = maskLUT zeromask_args.arg_regions = regions zeromask_args.arg_N = N args := zeromask_args.argptr[:] cu.LaunchKernel(zeromask_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("zeromask") } } // maps compute capability on PTX code for zeromask kernel. var zeromask_map = map[int]string{0: "", 30: zeromask_ptx_30, 32: zeromask_ptx_32, 35: zeromask_ptx_35, 37: zeromask_ptx_37, 50: zeromask_ptx_50, 52: zeromask_ptx_52, 53: zeromask_ptx_53, 60: zeromask_ptx_60, 61: zeromask_ptx_61, 62: zeromask_ptx_62, 70: zeromask_ptx_70, 72: zeromask_ptx_72, 75: zeromask_ptx_75} // zeromask PTX code for various compute capabilities. 
const ( zeromask_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<2>; .reg .b32 %r<11>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; cvta.to.global.u64 %rd7, %rd3; ld.global.u8 %r9, [%rd6]; mul.wide.u32 %rd8, %r9, 4; add.s64 %rd9, %rd7, %rd8; ld.global.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r10, 0; st.global.u32 [%rd12], %r10; BB0_3: ret; } ` zeromask_ptx_32 = ` .version 6.5 .target sm_32 .address_size 64 // .globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 
%rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; BB0_3: ret; } ` zeromask_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; BB0_3: ret; } ` zeromask_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, 
%r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; BB0_3: ret; } ` zeromask_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; BB0_3: ret; } ` zeromask_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg 
.pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; BB0_3: ret; } ` zeromask_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; 
add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; BB0_3: ret; } ` zeromask_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; BB0_3: ret; } ` zeromask_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, 
%r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; BB0_3: ret; } ` zeromask_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; BB0_3: ret; } ` zeromask_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 
%rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; BB0_3: ret; } ` zeromask_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl zeromask .visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; BB0_3: ret; } ` zeromask_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl zeromask 
.visible .entry zeromask( .param .u64 zeromask_param_0, .param .u64 zeromask_param_1, .param .u64 zeromask_param_2, .param .u32 zeromask_param_3 ) { .reg .pred %p<3>; .reg .b16 %rs<2>; .reg .f32 %f<2>; .reg .b32 %r<12>; .reg .b64 %rd<13>; ld.param.u64 %rd2, [zeromask_param_0]; ld.param.u64 %rd3, [zeromask_param_1]; ld.param.u64 %rd4, [zeromask_param_2]; ld.param.u32 %r2, [zeromask_param_3]; mov.u32 %r3, %nctaid.x; mov.u32 %r4, %ctaid.y; mov.u32 %r5, %ctaid.x; mad.lo.s32 %r6, %r3, %r4, %r5; mov.u32 %r7, %ntid.x; mov.u32 %r8, %tid.x; mad.lo.s32 %r1, %r6, %r7, %r8; setp.ge.s32 %p1, %r1, %r2; @%p1 bra BB0_3; cvta.to.global.u64 %rd5, %rd4; cvt.s64.s32 %rd1, %r1; add.s64 %rd6, %rd5, %rd1; ld.global.nc.u8 %rs1, [%rd6]; cvta.to.global.u64 %rd7, %rd3; cvt.u32.u16 %r9, %rs1; and.b32 %r10, %r9, 255; mul.wide.u32 %rd8, %r10, 4; add.s64 %rd9, %rd7, %rd8; ld.global.nc.f32 %f1, [%rd9]; setp.eq.f32 %p2, %f1, 0f00000000; @%p2 bra BB0_3; cvta.to.global.u64 %rd10, %rd2; shl.b64 %rd11, %rd1, 2; add.s64 %rd12, %rd10, %rd11; mov.u32 %r11, 0; st.global.u32 [%rd12], %r11; BB0_3: ret; } ` ) mumax3-3.10/cuda/zhangli.go000066400000000000000000000012561371432437400155620ustar00rootroot00000000000000package cuda import ( "github.com/mumax/3/data" ) // Add Zhang-Li ST torque (Tesla) to torque. 
// see zhangli.cu func AddZhangLiTorque(torque, m *data.Slice, Msat, J, alpha, xi, pol MSlice, mesh *data.Mesh) { c := mesh.CellSize() N := mesh.Size() cfg := make3DConf(N) k_addzhanglitorque2_async( torque.DevPtr(X), torque.DevPtr(Y), torque.DevPtr(Z), m.DevPtr(X), m.DevPtr(Y), m.DevPtr(Z), Msat.DevPtr(0), Msat.Mul(0), J.DevPtr(X), J.Mul(X), J.DevPtr(Y), J.Mul(Y), J.DevPtr(Z), J.Mul(Z), alpha.DevPtr(0), alpha.Mul(0), xi.DevPtr(0), xi.Mul(0), pol.DevPtr(0), pol.Mul(0), float32(c[X]), float32(c[Y]), float32(c[Z]), N[X], N[Y], N[Z], mesh.PBC_code(), cfg) } mumax3-3.10/cuda/zhangli2.cu000066400000000000000000000046401371432437400156460ustar00rootroot00000000000000#include "amul.h" #include "constants.h" #include "float3.h" #include "stencil.h" #include #define PREFACTOR ((MUB) / (2 * QE * GAMMA0)) // spatial derivatives without dividing by cell size #define deltax(in) (in[idx(hclampx(ix+1), iy, iz)] - in[idx(lclampx(ix-1), iy, iz)]) #define deltay(in) (in[idx(ix, hclampy(iy+1), iz)] - in[idx(ix, lclampy(iy-1), iz)]) #define deltaz(in) (in[idx(ix, iy, hclampz(iz+1))] - in[idx(ix, iy, lclampz(iz-1))]) extern "C" __global__ void addzhanglitorque2(float* __restrict__ tx, float* __restrict__ ty, float* __restrict__ tz, float* __restrict__ mx, float* __restrict__ my, float* __restrict__ mz, float* __restrict__ Ms_, float Ms_mul, float* __restrict__ jx_, float jx_mul, float* __restrict__ jy_, float jy_mul, float* __restrict__ jz_, float jz_mul, float* __restrict__ alpha_, float alpha_mul, float* __restrict__ xi_, float xi_mul, float* __restrict__ pol_, float pol_mul, float cx, float cy, float cz, int Nx, int Ny, int Nz, uint8_t PBC) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; int iz = blockIdx.z * blockDim.z + threadIdx.z; if (ix >= Nx || iy >= Ny || iz >= Nz) { return; } int i = idx(ix, iy, iz); float alpha = amul(alpha_, alpha_mul, i); float xi = amul(xi_, xi_mul, i); float pol = amul(pol_, pol_mul, i); float invMs = 
inv_Msat(Ms_, Ms_mul, i); float b = invMs * PREFACTOR / (1.0f + xi*xi); float3 J = pol*vmul(jx_, jy_, jz_, jx_mul, jy_mul, jz_mul, i); float3 hspin = make_float3(0.0f, 0.0f, 0.0f); // (u·∇)m if (J.x != 0.0f) { hspin += (b/cx)*J.x * make_float3(deltax(mx), deltax(my), deltax(mz)); } if (J.y != 0.0f) { hspin += (b/cy)*J.y * make_float3(deltay(mx), deltay(my), deltay(mz)); } if (J.z != 0.0f) { hspin += (b/cz)*J.z * make_float3(deltaz(mx), deltaz(my), deltaz(mz)); } float3 m = make_float3(mx[i], my[i], mz[i]); float3 torque = (-1.0f/(1.0f + alpha*alpha)) * ( (1.0f+xi*alpha) * cross(m, cross(m, hspin)) +( xi-alpha) * cross(m, hspin) ); // write back, adding to torque tx[i] += torque.x; ty[i] += torque.y; tz[i] += torque.z; } mumax3-3.10/cuda/zhangli2_wrapper.go000066400000000000000000006053251371432437400174130ustar00rootroot00000000000000package cuda /* THIS FILE IS AUTO-GENERATED BY CUDA2GO. EDITING IS FUTILE. */ import ( "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" "sync" "unsafe" ) // CUDA handle for addzhanglitorque2 kernel var addzhanglitorque2_code cu.Function // Stores the arguments for addzhanglitorque2 kernel invocation type addzhanglitorque2_args_t struct { arg_tx unsafe.Pointer arg_ty unsafe.Pointer arg_tz unsafe.Pointer arg_mx unsafe.Pointer arg_my unsafe.Pointer arg_mz unsafe.Pointer arg_Ms_ unsafe.Pointer arg_Ms_mul float32 arg_jx_ unsafe.Pointer arg_jx_mul float32 arg_jy_ unsafe.Pointer arg_jy_mul float32 arg_jz_ unsafe.Pointer arg_jz_mul float32 arg_alpha_ unsafe.Pointer arg_alpha_mul float32 arg_xi_ unsafe.Pointer arg_xi_mul float32 arg_pol_ unsafe.Pointer arg_pol_mul float32 arg_cx float32 arg_cy float32 arg_cz float32 arg_Nx int arg_Ny int arg_Nz int arg_PBC byte argptr [27]unsafe.Pointer sync.Mutex } // Stores the arguments for addzhanglitorque2 kernel invocation var addzhanglitorque2_args addzhanglitorque2_args_t func init() { // CUDA driver kernel call wants pointers to arguments, set them up once. 
addzhanglitorque2_args.argptr[0] = unsafe.Pointer(&addzhanglitorque2_args.arg_tx) addzhanglitorque2_args.argptr[1] = unsafe.Pointer(&addzhanglitorque2_args.arg_ty) addzhanglitorque2_args.argptr[2] = unsafe.Pointer(&addzhanglitorque2_args.arg_tz) addzhanglitorque2_args.argptr[3] = unsafe.Pointer(&addzhanglitorque2_args.arg_mx) addzhanglitorque2_args.argptr[4] = unsafe.Pointer(&addzhanglitorque2_args.arg_my) addzhanglitorque2_args.argptr[5] = unsafe.Pointer(&addzhanglitorque2_args.arg_mz) addzhanglitorque2_args.argptr[6] = unsafe.Pointer(&addzhanglitorque2_args.arg_Ms_) addzhanglitorque2_args.argptr[7] = unsafe.Pointer(&addzhanglitorque2_args.arg_Ms_mul) addzhanglitorque2_args.argptr[8] = unsafe.Pointer(&addzhanglitorque2_args.arg_jx_) addzhanglitorque2_args.argptr[9] = unsafe.Pointer(&addzhanglitorque2_args.arg_jx_mul) addzhanglitorque2_args.argptr[10] = unsafe.Pointer(&addzhanglitorque2_args.arg_jy_) addzhanglitorque2_args.argptr[11] = unsafe.Pointer(&addzhanglitorque2_args.arg_jy_mul) addzhanglitorque2_args.argptr[12] = unsafe.Pointer(&addzhanglitorque2_args.arg_jz_) addzhanglitorque2_args.argptr[13] = unsafe.Pointer(&addzhanglitorque2_args.arg_jz_mul) addzhanglitorque2_args.argptr[14] = unsafe.Pointer(&addzhanglitorque2_args.arg_alpha_) addzhanglitorque2_args.argptr[15] = unsafe.Pointer(&addzhanglitorque2_args.arg_alpha_mul) addzhanglitorque2_args.argptr[16] = unsafe.Pointer(&addzhanglitorque2_args.arg_xi_) addzhanglitorque2_args.argptr[17] = unsafe.Pointer(&addzhanglitorque2_args.arg_xi_mul) addzhanglitorque2_args.argptr[18] = unsafe.Pointer(&addzhanglitorque2_args.arg_pol_) addzhanglitorque2_args.argptr[19] = unsafe.Pointer(&addzhanglitorque2_args.arg_pol_mul) addzhanglitorque2_args.argptr[20] = unsafe.Pointer(&addzhanglitorque2_args.arg_cx) addzhanglitorque2_args.argptr[21] = unsafe.Pointer(&addzhanglitorque2_args.arg_cy) addzhanglitorque2_args.argptr[22] = unsafe.Pointer(&addzhanglitorque2_args.arg_cz) addzhanglitorque2_args.argptr[23] = 
unsafe.Pointer(&addzhanglitorque2_args.arg_Nx) addzhanglitorque2_args.argptr[24] = unsafe.Pointer(&addzhanglitorque2_args.arg_Ny) addzhanglitorque2_args.argptr[25] = unsafe.Pointer(&addzhanglitorque2_args.arg_Nz) addzhanglitorque2_args.argptr[26] = unsafe.Pointer(&addzhanglitorque2_args.arg_PBC) } // Wrapper for addzhanglitorque2 CUDA kernel, asynchronous. func k_addzhanglitorque2_async(tx unsafe.Pointer, ty unsafe.Pointer, tz unsafe.Pointer, mx unsafe.Pointer, my unsafe.Pointer, mz unsafe.Pointer, Ms_ unsafe.Pointer, Ms_mul float32, jx_ unsafe.Pointer, jx_mul float32, jy_ unsafe.Pointer, jy_mul float32, jz_ unsafe.Pointer, jz_mul float32, alpha_ unsafe.Pointer, alpha_mul float32, xi_ unsafe.Pointer, xi_mul float32, pol_ unsafe.Pointer, pol_mul float32, cx float32, cy float32, cz float32, Nx int, Ny int, Nz int, PBC byte, cfg *config) { if Synchronous { // debug Sync() timer.Start("addzhanglitorque2") } addzhanglitorque2_args.Lock() defer addzhanglitorque2_args.Unlock() if addzhanglitorque2_code == 0 { addzhanglitorque2_code = fatbinLoad(addzhanglitorque2_map, "addzhanglitorque2") } addzhanglitorque2_args.arg_tx = tx addzhanglitorque2_args.arg_ty = ty addzhanglitorque2_args.arg_tz = tz addzhanglitorque2_args.arg_mx = mx addzhanglitorque2_args.arg_my = my addzhanglitorque2_args.arg_mz = mz addzhanglitorque2_args.arg_Ms_ = Ms_ addzhanglitorque2_args.arg_Ms_mul = Ms_mul addzhanglitorque2_args.arg_jx_ = jx_ addzhanglitorque2_args.arg_jx_mul = jx_mul addzhanglitorque2_args.arg_jy_ = jy_ addzhanglitorque2_args.arg_jy_mul = jy_mul addzhanglitorque2_args.arg_jz_ = jz_ addzhanglitorque2_args.arg_jz_mul = jz_mul addzhanglitorque2_args.arg_alpha_ = alpha_ addzhanglitorque2_args.arg_alpha_mul = alpha_mul addzhanglitorque2_args.arg_xi_ = xi_ addzhanglitorque2_args.arg_xi_mul = xi_mul addzhanglitorque2_args.arg_pol_ = pol_ addzhanglitorque2_args.arg_pol_mul = pol_mul addzhanglitorque2_args.arg_cx = cx addzhanglitorque2_args.arg_cy = cy addzhanglitorque2_args.arg_cz = cz 
addzhanglitorque2_args.arg_Nx = Nx addzhanglitorque2_args.arg_Ny = Ny addzhanglitorque2_args.arg_Nz = Nz addzhanglitorque2_args.arg_PBC = PBC args := addzhanglitorque2_args.argptr[:] cu.LaunchKernel(addzhanglitorque2_code, cfg.Grid.X, cfg.Grid.Y, cfg.Grid.Z, cfg.Block.X, cfg.Block.Y, cfg.Block.Z, 0, stream0, args) if Synchronous { // debug Sync() timer.Stop("addzhanglitorque2") } } // maps compute capability on PTX code for addzhanglitorque2 kernel. var addzhanglitorque2_map = map[int]string{0: "", 30: addzhanglitorque2_ptx_30, 32: addzhanglitorque2_ptx_32, 35: addzhanglitorque2_ptx_35, 37: addzhanglitorque2_ptx_37, 50: addzhanglitorque2_ptx_50, 52: addzhanglitorque2_ptx_52, 53: addzhanglitorque2_ptx_53, 60: addzhanglitorque2_ptx_60, 61: addzhanglitorque2_ptx_61, 62: addzhanglitorque2_ptx_62, 70: addzhanglitorque2_ptx_70, 72: addzhanglitorque2_ptx_72, 75: addzhanglitorque2_ptx_75} // addzhanglitorque2 PTX code for various compute capabilities. const ( addzhanglitorque2_ptx_30 = ` .version 6.5 .target sm_30 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 addzhanglitorque2_param_21, .param .f32 
addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<20>; .reg .f32 %f<149>; .reg .b32 %r<491>; .reg .f64 %fd<5>; .reg .b64 %rd<102>; ld.param.u64 %rd1, [addzhanglitorque2_param_0]; ld.param.u64 %rd2, [addzhanglitorque2_param_1]; ld.param.u64 %rd3, [addzhanglitorque2_param_2]; ld.param.u64 %rd4, [addzhanglitorque2_param_3]; ld.param.u64 %rd5, [addzhanglitorque2_param_4]; ld.param.u64 %rd6, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, [addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, [addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r60, [addzhanglitorque2_param_23]; ld.param.u32 %r61, [addzhanglitorque2_param_24]; ld.param.u32 %r62, [addzhanglitorque2_param_25]; ld.param.u8 %rs3, [addzhanglitorque2_param_26]; mov.u32 %r63, %ntid.x; mov.u32 %r64, %ctaid.x; mov.u32 %r65, %tid.x; mad.lo.s32 %r66, %r63, %r64, %r65; mov.u32 %r67, %ntid.y; mov.u32 %r68, %ctaid.y; mov.u32 %r69, %tid.y; mad.lo.s32 %r70, %r67, %r68, %r69; mov.u32 %r71, %ntid.z; mov.u32 %r72, %ctaid.z; mov.u32 %r73, %tid.z; mad.lo.s32 %r74, %r71, %r72, %r73; setp.ge.s32 %p1, %r70, %r61; setp.ge.s32 %p2, %r66, %r60; 
or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r74, %r62; or.pred %p5, %p3, %p4; @%p5 bra BB0_78; mad.lo.s32 %r83, %r74, %r61, %r70; mul.lo.s32 %r1, %r83, %r60; setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_3; cvta.to.global.u64 %rd14, %rd11; add.s32 %r88, %r1, %r66; mul.wide.s32 %rd15, %r88, 4; add.s64 %rd16, %rd14, %rd15; ld.global.f32 %f67, [%rd16]; mul.f32 %f132, %f67, %f132; BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_5; cvta.to.global.u64 %rd17, %rd12; add.s32 %r93, %r1, %r66; mul.wide.s32 %rd18, %r93, 4; add.s64 %rd19, %rd17, %rd18; ld.global.f32 %f68, [%rd19]; mul.f32 %f133, %f68, %f133; BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra BB0_7; cvta.to.global.u64 %rd20, %rd13; add.s32 %r98, %r1, %r66; mul.wide.s32 %rd21, %r98, 4; add.s64 %rd22, %rd20, %rd21; ld.global.f32 %f69, [%rd22]; mul.f32 %f134, %f69, %f134; BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra BB0_9; cvta.to.global.u64 %rd23, %rd7; add.s32 %r103, %r1, %r66; mul.wide.s32 %rd24, %r103, 4; add.s64 %rd25, %rd23, %rd24; ld.global.f32 %f70, [%rd25]; mul.f32 %f135, %f70, %f135; BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra BB0_11; rcp.rn.f32 %f136, %f135; BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra BB0_13; cvta.to.global.u64 %rd26, %rd8; add.s32 %r108, %r1, %r66; mul.wide.s32 %rd27, %r108, 4; add.s64 %rd28, %rd26, %rd27; ld.global.f32 %f73, [%rd28]; mul.f32 %f137, %f73, %f137; BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra BB0_15; cvta.to.global.u64 %rd29, %rd9; add.s32 %r113, %r1, %r66; mul.wide.s32 %rd30, %r113, 4; add.s64 %rd31, %rd29, %rd30; ld.global.f32 %f74, [%rd31]; mul.f32 %f138, %f74, %f138; BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra BB0_17; add.s32 %r118, %r1, %r66; cvta.to.global.u64 %rd32, %rd10; mul.wide.s32 %rd33, %r118, 4; add.s64 %rd34, %rd32, %rd33; ld.global.f32 %f75, [%rd34]; 
mul.f32 %f139, %f75, %f139; BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; mov.f32 %f143, 0f00000000; setp.eq.f32 %p14, %f20, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra BB0_37; and.b16 %rs4, %rs3, 1; setp.eq.b16 %p15, %rs4, 1; @!%p15 bra BB0_20; bra.uni BB0_19; BB0_19: add.s32 %r123, %r66, 1; rem.s32 %r124, %r123, %r60; add.s32 %r125, %r124, %r60; rem.s32 %r473, %r125, %r60; bra.uni BB0_21; BB0_20: add.s32 %r126, %r60, -1; add.s32 %r131, %r66, 1; min.s32 %r473, %r131, %r126; BB0_21: setp.eq.b16 %p16, %rs4, 1; mad.lo.s32 %r141, %r83, %r60, %r473; cvta.to.global.u64 %rd35, %rd4; mul.wide.s32 %rd36, %r141, 4; add.s64 %rd37, %rd35, %rd36; ld.global.f32 %f21, [%rd37]; @!%p16 bra BB0_23; bra.uni BB0_22; BB0_22: add.s32 %r146, %r66, -1; rem.s32 %r147, %r146, %r60; add.s32 %r148, %r147, %r60; rem.s32 %r474, %r148, %r60; bra.uni BB0_24; BB0_23: add.s32 %r153, %r66, -1; mov.u32 %r154, 0; max.s32 %r474, %r153, %r154; BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 %f22, %f20, %f79; setp.eq.b16 %p17, %rs4, 1; mad.lo.s32 %r164, %r83, %r60, %r474; mul.wide.s32 %rd39, %r164, 4; add.s64 %rd40, %rd35, %rd39; ld.global.f32 %f80, [%rd40]; sub.f32 %f23, %f21, %f80; @!%p17 bra BB0_26; bra.uni BB0_25; BB0_25: add.s32 %r169, %r66, 1; rem.s32 %r170, %r169, %r60; add.s32 %r171, %r170, %r60; rem.s32 %r475, %r171, %r60; bra.uni BB0_27; BB0_26: add.s32 %r172, %r60, -1; add.s32 %r177, %r66, 1; min.s32 %r475, %r177, %r172; BB0_27: setp.eq.b16 %p18, %rs4, 1; mad.lo.s32 %r187, %r83, %r60, %r475; cvta.to.global.u64 %rd41, %rd5; mul.wide.s32 %rd42, %r187, 4; add.s64 %rd43, %rd41, %rd42; ld.global.f32 %f24, [%rd43]; @!%p18 bra BB0_29; bra.uni BB0_28; BB0_28: add.s32 %r192, %r66, -1; rem.s32 %r193, %r192, %r60; add.s32 %r194, %r193, %r60; rem.s32 %r476, %r194, %r60; bra.uni BB0_30; BB0_29: add.s32 %r199, %r66, -1; mov.u32 %r200, 0; max.s32 %r476, %r199, %r200; BB0_30: setp.eq.b16 %p19, %rs4, 1; mad.lo.s32 %r210, %r83, %r60, %r476; 
mul.wide.s32 %rd45, %r210, 4; add.s64 %rd46, %rd41, %rd45; ld.global.f32 %f81, [%rd46]; sub.f32 %f25, %f24, %f81; @!%p19 bra BB0_32; bra.uni BB0_31; BB0_31: add.s32 %r215, %r66, 1; rem.s32 %r216, %r215, %r60; add.s32 %r217, %r216, %r60; rem.s32 %r477, %r217, %r60; bra.uni BB0_33; BB0_32: add.s32 %r218, %r60, -1; add.s32 %r223, %r66, 1; min.s32 %r477, %r223, %r218; BB0_33: setp.eq.b16 %p20, %rs4, 1; mad.lo.s32 %r233, %r83, %r60, %r477; cvta.to.global.u64 %rd47, %rd6; mul.wide.s32 %rd48, %r233, 4; add.s64 %rd49, %rd47, %rd48; ld.global.f32 %f26, [%rd49]; @!%p20 bra BB0_35; bra.uni BB0_34; BB0_34: add.s32 %r238, %r66, -1; rem.s32 %r239, %r238, %r60; add.s32 %r240, %r239, %r60; rem.s32 %r478, %r240, %r60; bra.uni BB0_36; BB0_35: add.s32 %r245, %r66, -1; mov.u32 %r246, 0; max.s32 %r478, %r245, %r246; BB0_36: mad.lo.s32 %r256, %r83, %r60, %r478; mul.wide.s32 %rd51, %r256, 4; add.s64 %rd52, %rd47, %rd51; ld.global.f32 %f82, [%rd52]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, %f22, %f83, 0f00000000; BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra BB0_57; and.b16 %rs1, %rs3, 2; setp.eq.s16 %p22, %rs1, 0; add.s32 %r20, %r70, 1; @%p22 bra BB0_40; rem.s32 %r261, %r20, %r61; add.s32 %r262, %r261, %r61; rem.s32 %r479, %r262, %r61; bra.uni BB0_41; BB0_40: add.s32 %r263, %r61, -1; min.s32 %r479, %r20, %r263; BB0_41: mad.lo.s32 %r268, %r74, %r61, %r479; mad.lo.s32 %r273, %r268, %r60, %r66; cvta.to.global.u64 %rd53, %rd4; mul.wide.s32 %rd54, %r273, 4; add.s64 %rd55, %rd53, %rd54; ld.global.f32 %f33, [%rd55]; add.s32 %r24, %r70, -1; @%p22 bra BB0_43; rem.s32 %r278, %r24, %r61; add.s32 %r279, %r278, %r61; rem.s32 %r480, %r279, %r61; bra.uni BB0_44; BB0_43: mov.u32 %r280, 0; max.s32 %r480, %r24, %r280; BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f18, %f84; mad.lo.s32 %r285, %r74, %r61, %r480; mad.lo.s32 %r290, %r285, %r60, %r66; mul.wide.s32 %rd57, %r290, 4; add.s64 %rd58, %rd53, %rd57; 
ld.global.f32 %f85, [%rd58]; sub.f32 %f35, %f33, %f85; @%p22 bra BB0_46; rem.s32 %r291, %r20, %r61; add.s32 %r292, %r291, %r61; rem.s32 %r481, %r292, %r61; bra.uni BB0_47; BB0_46: add.s32 %r293, %r61, -1; min.s32 %r481, %r20, %r293; BB0_47: mad.lo.s32 %r298, %r74, %r61, %r481; mad.lo.s32 %r303, %r298, %r60, %r66; cvta.to.global.u64 %rd59, %rd5; mul.wide.s32 %rd60, %r303, 4; add.s64 %rd61, %rd59, %rd60; ld.global.f32 %f36, [%rd61]; @%p22 bra BB0_49; rem.s32 %r309, %r24, %r61; add.s32 %r310, %r309, %r61; rem.s32 %r482, %r310, %r61; bra.uni BB0_50; BB0_49: mov.u32 %r316, 0; max.s32 %r482, %r24, %r316; BB0_50: mad.lo.s32 %r321, %r74, %r61, %r482; mad.lo.s32 %r326, %r321, %r60, %r66; mul.wide.s32 %rd63, %r326, 4; add.s64 %rd64, %rd59, %rd63; ld.global.f32 %f86, [%rd64]; sub.f32 %f37, %f36, %f86; @%p22 bra BB0_52; rem.s32 %r332, %r20, %r61; add.s32 %r333, %r332, %r61; rem.s32 %r483, %r333, %r61; bra.uni BB0_53; BB0_52: add.s32 %r334, %r61, -1; min.s32 %r483, %r20, %r334; BB0_53: mad.lo.s32 %r344, %r74, %r61, %r483; mad.lo.s32 %r349, %r344, %r60, %r66; cvta.to.global.u64 %rd65, %rd6; mul.wide.s32 %rd66, %r349, 4; add.s64 %rd67, %rd65, %rd66; ld.global.f32 %f38, [%rd67]; @%p22 bra BB0_55; rem.s32 %r355, %r24, %r61; add.s32 %r356, %r355, %r61; rem.s32 %r484, %r356, %r61; bra.uni BB0_56; BB0_55: mov.u32 %r362, 0; max.s32 %r484, %r24, %r362; BB0_56: mad.lo.s32 %r367, %r74, %r61, %r484; mad.lo.s32 %r372, %r367, %r60, %r66; mul.wide.s32 %rd69, %r372, 4; add.s64 %rd70, %rd65, %rd69; ld.global.f32 %f87, [%rd70]; sub.f32 %f88, %f38, %f87; fma.rn.f32 %f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra BB0_77; div.rn.f32 %f89, %f11, %f66; mul.f32 %f45, %f19, %f89; and.b16 %rs2, %rs3, 4; setp.eq.s16 %p29, %rs2, 0; add.s32 %r40, %r74, 1; @%p29 bra BB0_60; rem.s32 %r377, %r40, %r62; add.s32 %r378, %r377, %r62; rem.s32 %r485, %r378, %r62; bra.uni BB0_61; BB0_60: add.s32 %r379, %r62, -1; 
min.s32 %r485, %r40, %r379; BB0_61: mad.lo.s32 %r384, %r485, %r61, %r70; mad.lo.s32 %r389, %r384, %r60, %r66; cvta.to.global.u64 %rd71, %rd4; mul.wide.s32 %rd72, %r389, 4; add.s64 %rd73, %rd71, %rd72; ld.global.f32 %f46, [%rd73]; add.s32 %r44, %r74, -1; @%p29 bra BB0_63; rem.s32 %r394, %r44, %r62; add.s32 %r395, %r394, %r62; rem.s32 %r486, %r395, %r62; bra.uni BB0_64; BB0_63: mov.u32 %r396, 0; max.s32 %r486, %r44, %r396; BB0_64: mad.lo.s32 %r401, %r486, %r61, %r70; mad.lo.s32 %r406, %r401, %r60, %r66; mul.wide.s32 %rd75, %r406, 4; add.s64 %rd76, %rd71, %rd75; ld.global.f32 %f90, [%rd76]; sub.f32 %f47, %f46, %f90; @%p29 bra BB0_66; rem.s32 %r407, %r40, %r62; add.s32 %r408, %r407, %r62; rem.s32 %r487, %r408, %r62; bra.uni BB0_67; BB0_66: add.s32 %r409, %r62, -1; min.s32 %r487, %r40, %r409; BB0_67: mad.lo.s32 %r414, %r487, %r61, %r70; mad.lo.s32 %r419, %r414, %r60, %r66; cvta.to.global.u64 %rd77, %rd5; mul.wide.s32 %rd78, %r419, 4; add.s64 %rd79, %rd77, %rd78; ld.global.f32 %f48, [%rd79]; @%p29 bra BB0_69; rem.s32 %r420, %r44, %r62; add.s32 %r421, %r420, %r62; rem.s32 %r488, %r421, %r62; bra.uni BB0_70; BB0_69: mov.u32 %r422, 0; max.s32 %r488, %r44, %r422; BB0_70: mad.lo.s32 %r427, %r488, %r61, %r70; mad.lo.s32 %r432, %r427, %r60, %r66; mul.wide.s32 %rd81, %r432, 4; add.s64 %rd82, %rd77, %rd81; ld.global.f32 %f91, [%rd82]; sub.f32 %f49, %f48, %f91; @%p29 bra BB0_72; rem.s32 %r433, %r40, %r62; add.s32 %r434, %r433, %r62; rem.s32 %r489, %r434, %r62; bra.uni BB0_73; BB0_72: add.s32 %r435, %r62, -1; min.s32 %r489, %r40, %r435; BB0_73: mad.lo.s32 %r440, %r489, %r61, %r70; mad.lo.s32 %r445, %r440, %r60, %r66; cvta.to.global.u64 %rd83, %rd6; mul.wide.s32 %rd84, %r445, 4; add.s64 %rd85, %rd83, %rd84; ld.global.f32 %f50, [%rd85]; @%p29 bra BB0_75; rem.s32 %r446, %r44, %r62; add.s32 %r447, %r446, %r62; rem.s32 %r490, %r447, %r62; bra.uni BB0_76; BB0_75: mov.u32 %r448, 0; max.s32 %r490, %r44, %r448; BB0_76: mad.lo.s32 %r453, %r490, %r61, %r70; mad.lo.s32 %r458, %r453, %r60, 
%r66; mul.wide.s32 %rd87, %r458, 4; add.s64 %rd88, %rd83, %rd87; ld.global.f32 %f92, [%rd88]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; BB0_77: mad.lo.s32 %r472, %r83, %r60, %r66; cvta.to.global.u64 %rd89, %rd4; mul.wide.s32 %rd90, %r472, 4; add.s64 %rd91, %rd89, %rd90; cvta.to.global.u64 %rd92, %rd5; add.s64 %rd93, %rd92, %rd90; cvta.to.global.u64 %rd94, %rd6; add.s64 %rd95, %rd94, %rd90; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 %f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.f32 %f98, [%rd93]; mul.f32 %f99, %f145, %f98; ld.global.f32 %f100, [%rd95]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.f32 %f104, [%rd91]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; cvta.to.global.u64 %rd96, %rd1; add.s64 %rd97, %rd96, %rd90; ld.global.f32 %f126, [%rd97]; fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd97], %f127; cvta.to.global.u64 %rd98, %rd2; add.s64 %rd99, %rd98, %rd90; ld.global.f32 %f128, [%rd99]; fma.rn.f32 %f129, %f96, %f124, %f128; st.global.f32 [%rd99], %f129; cvta.to.global.u64 %rd100, %rd3; add.s64 %rd101, %rd100, %rd90; ld.global.f32 %f130, [%rd101]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd101], %f131; BB0_78: ret; } ` addzhanglitorque2_ptx_32 = ` 
.version 6.5 .target sm_32 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<15>; .reg .f32 %f<149>; .reg .b32 %r<182>; .reg .f64 %fd<5>; .reg .b64 %rd<84>; ld.param.u64 %rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, [addzhanglitorque2_param_3]; ld.param.u64 %rd15, [addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, [addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, 
[addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, [addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, [addzhanglitorque2_param_23]; ld.param.u32 %r68, [addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; ld.param.u8 %rs4, [addzhanglitorque2_param_26]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 %r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r70, %r71, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r73, %r74, %r75; mov.u32 %r76, %ntid.z; mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r76, %r77, %r78; setp.ge.s32 %p1, %r2, %r68; setp.ge.s32 %p2, %r1, %r67; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, %p4; @%p5 bra BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_3; cvta.to.global.u64 %rd17, %rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, %f67, %f132; BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_5; cvta.to.global.u64 %rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, [%rd22]; mul.f32 %f133, %f68, %f133; BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra BB0_7; cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, 
%rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra BB0_11; rcp.rn.f32 %f136, %f135; BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra BB0_13; cvta.to.global.u64 %rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; mul.f32 %f138, %f74, %f138; BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, %r6, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; mov.f32 %f143, 0f00000000; setp.eq.f32 %p14, %f20, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra BB0_20; rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r164, %r81, %r67; bra.uni BB0_21; BB0_20: add.s32 %r82, %r67, -1; min.s32 %r164, %r7, %r82; BB0_21: add.s32 %r83, %r164, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra BB0_23; rem.s32 %r84, %r11, %r67; add.s32 %r85, %r84, %r67; rem.s32 %r165, %r85, %r67; bra.uni BB0_24; BB0_23: mov.u32 %r86, 0; max.s32 %r165, %r11, %r86; BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 %f22, %f20, %f79; add.s32 %r87, %r165, %r5; mul.wide.s32 %rd40, %r87, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra BB0_26; rem.s32 %r88, %r7, %r67; 
add.s32 %r89, %r88, %r67; rem.s32 %r166, %r89, %r67; bra.uni BB0_27; BB0_26: add.s32 %r90, %r67, -1; min.s32 %r166, %r7, %r90; BB0_27: add.s32 %r91, %r166, %r5; mul.wide.s32 %rd42, %r91, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra BB0_29; rem.s32 %r92, %r11, %r67; add.s32 %r93, %r92, %r67; rem.s32 %r167, %r93, %r67; bra.uni BB0_30; BB0_29: mov.u32 %r94, 0; max.s32 %r167, %r11, %r94; BB0_30: add.s32 %r95, %r167, %r5; mul.wide.s32 %rd44, %r95, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, [%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra BB0_32; rem.s32 %r96, %r7, %r67; add.s32 %r97, %r96, %r67; rem.s32 %r168, %r97, %r67; bra.uni BB0_33; BB0_32: add.s32 %r98, %r67, -1; min.s32 %r168, %r7, %r98; BB0_33: add.s32 %r99, %r168, %r5; mul.wide.s32 %rd46, %r99, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra BB0_35; rem.s32 %r100, %r11, %r67; add.s32 %r101, %r100, %r67; rem.s32 %r169, %r101, %r67; bra.uni BB0_36; BB0_35: mov.u32 %r102, 0; max.s32 %r169, %r11, %r102; BB0_36: add.s32 %r103, %r169, %r5; mul.wide.s32 %rd48, %r103, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, %f22, %f83, 0f00000000; BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 bra BB0_40; rem.s32 %r104, %r27, %r68; add.s32 %r105, %r104, %r68; rem.s32 %r170, %r105, %r68; bra.uni BB0_41; BB0_40: add.s32 %r106, %r68, -1; min.s32 %r170, %r27, %r106; BB0_41: add.s32 %r107, %r170, %r4; mad.lo.s32 %r108, %r107, %r67, %r1; mul.wide.s32 %rd50, %r108, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, -1; @%p22 bra BB0_43; rem.s32 %r109, %r31, %r68; add.s32 %r110, %r109, %r68; rem.s32 %r171, %r110, %r68; bra.uni BB0_44; BB0_43: mov.u32 %r111, 0; max.s32 %r171, %r31, %r111; BB0_44: div.rn.f32 %f84, 
%f11, %f65; mul.f32 %f34, %f18, %f84; add.s32 %r112, %r171, %r4; mad.lo.s32 %r113, %r112, %r67, %r1; mul.wide.s32 %rd52, %r113, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra BB0_46; rem.s32 %r114, %r27, %r68; add.s32 %r115, %r114, %r68; rem.s32 %r172, %r115, %r68; bra.uni BB0_47; BB0_46: add.s32 %r116, %r68, -1; min.s32 %r172, %r27, %r116; BB0_47: add.s32 %r117, %r172, %r4; mad.lo.s32 %r118, %r117, %r67, %r1; mul.wide.s32 %rd54, %r118, 4; add.s64 %rd55, %rd2, %rd54; ld.global.nc.f32 %f36, [%rd55]; @%p22 bra BB0_49; rem.s32 %r119, %r31, %r68; add.s32 %r120, %r119, %r68; rem.s32 %r173, %r120, %r68; bra.uni BB0_50; BB0_49: mov.u32 %r121, 0; max.s32 %r173, %r31, %r121; BB0_50: add.s32 %r122, %r173, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd56, %r123, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, %f36, %f86; @%p22 bra BB0_52; rem.s32 %r124, %r27, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r174, %r125, %r68; bra.uni BB0_53; BB0_52: add.s32 %r126, %r68, -1; min.s32 %r174, %r27, %r126; BB0_53: add.s32 %r127, %r174, %r4; mad.lo.s32 %r128, %r127, %r67, %r1; mul.wide.s32 %rd58, %r128, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, [%rd59]; @%p22 bra BB0_55; rem.s32 %r129, %r31, %r68; add.s32 %r130, %r129, %r68; rem.s32 %r175, %r130, %r68; bra.uni BB0_56; BB0_55: mov.u32 %r131, 0; max.s32 %r175, %r31, %r131; BB0_56: add.s32 %r132, %r175, %r4; mad.lo.s32 %r133, %r132, %r67, %r1; mul.wide.s32 %rd60, %r133, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 %f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra BB0_77; div.rn.f32 %f89, %f11, %f66; mul.f32 %f45, %f19, %f89; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra BB0_60; rem.s32 %r134, %r47, %r69; add.s32 %r135, %r134, %r69; rem.s32 %r176, 
%r135, %r69; bra.uni BB0_61; BB0_60: add.s32 %r136, %r69, -1; min.s32 %r176, %r47, %r136; BB0_61: mad.lo.s32 %r137, %r176, %r68, %r2; mad.lo.s32 %r138, %r137, %r67, %r1; mul.wide.s32 %rd62, %r138, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra BB0_63; rem.s32 %r139, %r51, %r69; add.s32 %r140, %r139, %r69; rem.s32 %r177, %r140, %r69; bra.uni BB0_64; BB0_63: mov.u32 %r141, 0; max.s32 %r177, %r51, %r141; BB0_64: mad.lo.s32 %r142, %r177, %r68, %r2; mad.lo.s32 %r143, %r142, %r67, %r1; mul.wide.s32 %rd64, %r143, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra BB0_66; rem.s32 %r144, %r47, %r69; add.s32 %r145, %r144, %r69; rem.s32 %r178, %r145, %r69; bra.uni BB0_67; BB0_66: add.s32 %r146, %r69, -1; min.s32 %r178, %r47, %r146; BB0_67: mad.lo.s32 %r147, %r178, %r68, %r2; mad.lo.s32 %r148, %r147, %r67, %r1; mul.wide.s32 %rd66, %r148, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra BB0_69; rem.s32 %r149, %r51, %r69; add.s32 %r150, %r149, %r69; rem.s32 %r179, %r150, %r69; bra.uni BB0_70; BB0_69: mov.u32 %r151, 0; max.s32 %r179, %r51, %r151; BB0_70: mad.lo.s32 %r152, %r179, %r68, %r2; mad.lo.s32 %r153, %r152, %r67, %r1; mul.wide.s32 %rd68, %r153, 4; add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra BB0_72; rem.s32 %r154, %r47, %r69; add.s32 %r155, %r154, %r69; rem.s32 %r180, %r155, %r69; bra.uni BB0_73; BB0_72: add.s32 %r156, %r69, -1; min.s32 %r180, %r47, %r156; BB0_73: mad.lo.s32 %r157, %r180, %r68, %r2; mad.lo.s32 %r158, %r157, %r67, %r1; mul.wide.s32 %rd70, %r158, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra BB0_75; rem.s32 %r159, %r51, %r69; add.s32 %r160, %r159, %r69; rem.s32 %r181, %r160, %r69; bra.uni BB0_76; BB0_75: mov.u32 %r161, 0; max.s32 %r181, %r51, %r161; BB0_76: mad.lo.s32 %r162, %r181, %r68, %r2; mad.lo.s32 %r163, %r162, %r67, %r1; mul.wide.s32 %rd72, %r163, 4; 
add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; BB0_77: cvta.to.global.u64 %rd74, %rd6; cvta.to.global.u64 %rd75, %rd5; cvta.to.global.u64 %rd76, %rd4; mul.wide.s32 %rd77, %r6, 4; add.s64 %rd78, %rd3, %rd77; add.s64 %rd79, %rd2, %rd77; add.s64 %rd80, %rd1, %rd77; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 %f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd79]; mul.f32 %f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd80]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, [%rd78]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; add.s64 %rd81, %rd76, %rd77; ld.global.f32 %f126, [%rd81]; fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd81], %f127; add.s64 %rd82, %rd75, %rd77; ld.global.f32 %f128, [%rd82]; fma.rn.f32 %f129, %f96, %f124, %f128; st.global.f32 [%rd82], %f129; add.s64 %rd83, %rd74, %rd77; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], %f131; BB0_78: ret; } ` addzhanglitorque2_ptx_35 = ` .version 6.5 .target sm_35 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 
addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<15>; .reg .f32 %f<149>; .reg .b32 %r<182>; .reg .f64 %fd<5>; .reg .b64 %rd<84>; ld.param.u64 %rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, [addzhanglitorque2_param_3]; ld.param.u64 %rd15, [addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, [addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, 
[addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, [addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, [addzhanglitorque2_param_23]; ld.param.u32 %r68, [addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; ld.param.u8 %rs4, [addzhanglitorque2_param_26]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 %r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r70, %r71, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r73, %r74, %r75; mov.u32 %r76, %ntid.z; mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r76, %r77, %r78; setp.ge.s32 %p1, %r2, %r68; setp.ge.s32 %p2, %r1, %r67; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, %p4; @%p5 bra BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_3; cvta.to.global.u64 %rd17, %rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, %f67, %f132; BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_5; cvta.to.global.u64 %rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, [%rd22]; mul.f32 %f133, %f68, %f133; BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra BB0_7; cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra 
BB0_11; rcp.rn.f32 %f136, %f135; BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra BB0_13; cvta.to.global.u64 %rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; mul.f32 %f138, %f74, %f138; BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, %r6, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; mov.f32 %f143, 0f00000000; setp.eq.f32 %p14, %f20, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra BB0_20; rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r164, %r81, %r67; bra.uni BB0_21; BB0_20: add.s32 %r82, %r67, -1; min.s32 %r164, %r7, %r82; BB0_21: add.s32 %r83, %r164, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra BB0_23; rem.s32 %r84, %r11, %r67; add.s32 %r85, %r84, %r67; rem.s32 %r165, %r85, %r67; bra.uni BB0_24; BB0_23: mov.u32 %r86, 0; max.s32 %r165, %r11, %r86; BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 %f22, %f20, %f79; add.s32 %r87, %r165, %r5; mul.wide.s32 %rd40, %r87, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra BB0_26; rem.s32 %r88, %r7, %r67; add.s32 %r89, %r88, %r67; rem.s32 %r166, %r89, %r67; bra.uni BB0_27; BB0_26: add.s32 %r90, %r67, -1; min.s32 %r166, %r7, %r90; BB0_27: add.s32 %r91, 
%r166, %r5; mul.wide.s32 %rd42, %r91, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra BB0_29; rem.s32 %r92, %r11, %r67; add.s32 %r93, %r92, %r67; rem.s32 %r167, %r93, %r67; bra.uni BB0_30; BB0_29: mov.u32 %r94, 0; max.s32 %r167, %r11, %r94; BB0_30: add.s32 %r95, %r167, %r5; mul.wide.s32 %rd44, %r95, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, [%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra BB0_32; rem.s32 %r96, %r7, %r67; add.s32 %r97, %r96, %r67; rem.s32 %r168, %r97, %r67; bra.uni BB0_33; BB0_32: add.s32 %r98, %r67, -1; min.s32 %r168, %r7, %r98; BB0_33: add.s32 %r99, %r168, %r5; mul.wide.s32 %rd46, %r99, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra BB0_35; rem.s32 %r100, %r11, %r67; add.s32 %r101, %r100, %r67; rem.s32 %r169, %r101, %r67; bra.uni BB0_36; BB0_35: mov.u32 %r102, 0; max.s32 %r169, %r11, %r102; BB0_36: add.s32 %r103, %r169, %r5; mul.wide.s32 %rd48, %r103, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, %f22, %f83, 0f00000000; BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 bra BB0_40; rem.s32 %r104, %r27, %r68; add.s32 %r105, %r104, %r68; rem.s32 %r170, %r105, %r68; bra.uni BB0_41; BB0_40: add.s32 %r106, %r68, -1; min.s32 %r170, %r27, %r106; BB0_41: add.s32 %r107, %r170, %r4; mad.lo.s32 %r108, %r107, %r67, %r1; mul.wide.s32 %rd50, %r108, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, -1; @%p22 bra BB0_43; rem.s32 %r109, %r31, %r68; add.s32 %r110, %r109, %r68; rem.s32 %r171, %r110, %r68; bra.uni BB0_44; BB0_43: mov.u32 %r111, 0; max.s32 %r171, %r31, %r111; BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f18, %f84; add.s32 %r112, %r171, %r4; mad.lo.s32 %r113, %r112, %r67, %r1; mul.wide.s32 %rd52, %r113, 4; add.s64 %rd53, %rd3, 
%rd52; ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra BB0_46; rem.s32 %r114, %r27, %r68; add.s32 %r115, %r114, %r68; rem.s32 %r172, %r115, %r68; bra.uni BB0_47; BB0_46: add.s32 %r116, %r68, -1; min.s32 %r172, %r27, %r116; BB0_47: add.s32 %r117, %r172, %r4; mad.lo.s32 %r118, %r117, %r67, %r1; mul.wide.s32 %rd54, %r118, 4; add.s64 %rd55, %rd2, %rd54; ld.global.nc.f32 %f36, [%rd55]; @%p22 bra BB0_49; rem.s32 %r119, %r31, %r68; add.s32 %r120, %r119, %r68; rem.s32 %r173, %r120, %r68; bra.uni BB0_50; BB0_49: mov.u32 %r121, 0; max.s32 %r173, %r31, %r121; BB0_50: add.s32 %r122, %r173, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd56, %r123, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, %f36, %f86; @%p22 bra BB0_52; rem.s32 %r124, %r27, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r174, %r125, %r68; bra.uni BB0_53; BB0_52: add.s32 %r126, %r68, -1; min.s32 %r174, %r27, %r126; BB0_53: add.s32 %r127, %r174, %r4; mad.lo.s32 %r128, %r127, %r67, %r1; mul.wide.s32 %rd58, %r128, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, [%rd59]; @%p22 bra BB0_55; rem.s32 %r129, %r31, %r68; add.s32 %r130, %r129, %r68; rem.s32 %r175, %r130, %r68; bra.uni BB0_56; BB0_55: mov.u32 %r131, 0; max.s32 %r175, %r31, %r131; BB0_56: add.s32 %r132, %r175, %r4; mad.lo.s32 %r133, %r132, %r67, %r1; mul.wide.s32 %rd60, %r133, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 %f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra BB0_77; div.rn.f32 %f89, %f11, %f66; mul.f32 %f45, %f19, %f89; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra BB0_60; rem.s32 %r134, %r47, %r69; add.s32 %r135, %r134, %r69; rem.s32 %r176, %r135, %r69; bra.uni BB0_61; BB0_60: add.s32 %r136, %r69, -1; min.s32 %r176, %r47, %r136; BB0_61: mad.lo.s32 %r137, %r176, %r68, %r2; mad.lo.s32 %r138, 
%r137, %r67, %r1; mul.wide.s32 %rd62, %r138, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra BB0_63; rem.s32 %r139, %r51, %r69; add.s32 %r140, %r139, %r69; rem.s32 %r177, %r140, %r69; bra.uni BB0_64; BB0_63: mov.u32 %r141, 0; max.s32 %r177, %r51, %r141; BB0_64: mad.lo.s32 %r142, %r177, %r68, %r2; mad.lo.s32 %r143, %r142, %r67, %r1; mul.wide.s32 %rd64, %r143, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra BB0_66; rem.s32 %r144, %r47, %r69; add.s32 %r145, %r144, %r69; rem.s32 %r178, %r145, %r69; bra.uni BB0_67; BB0_66: add.s32 %r146, %r69, -1; min.s32 %r178, %r47, %r146; BB0_67: mad.lo.s32 %r147, %r178, %r68, %r2; mad.lo.s32 %r148, %r147, %r67, %r1; mul.wide.s32 %rd66, %r148, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra BB0_69; rem.s32 %r149, %r51, %r69; add.s32 %r150, %r149, %r69; rem.s32 %r179, %r150, %r69; bra.uni BB0_70; BB0_69: mov.u32 %r151, 0; max.s32 %r179, %r51, %r151; BB0_70: mad.lo.s32 %r152, %r179, %r68, %r2; mad.lo.s32 %r153, %r152, %r67, %r1; mul.wide.s32 %rd68, %r153, 4; add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra BB0_72; rem.s32 %r154, %r47, %r69; add.s32 %r155, %r154, %r69; rem.s32 %r180, %r155, %r69; bra.uni BB0_73; BB0_72: add.s32 %r156, %r69, -1; min.s32 %r180, %r47, %r156; BB0_73: mad.lo.s32 %r157, %r180, %r68, %r2; mad.lo.s32 %r158, %r157, %r67, %r1; mul.wide.s32 %rd70, %r158, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra BB0_75; rem.s32 %r159, %r51, %r69; add.s32 %r160, %r159, %r69; rem.s32 %r181, %r160, %r69; bra.uni BB0_76; BB0_75: mov.u32 %r161, 0; max.s32 %r181, %r51, %r161; BB0_76: mad.lo.s32 %r162, %r181, %r68, %r2; mad.lo.s32 %r163, %r162, %r67, %r1; mul.wide.s32 %rd72, %r163, 4; add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, 
%f144; fma.rn.f32 %f145, %f45, %f93, %f145; BB0_77: cvta.to.global.u64 %rd74, %rd6; cvta.to.global.u64 %rd75, %rd5; cvta.to.global.u64 %rd76, %rd4; mul.wide.s32 %rd77, %r6, 4; add.s64 %rd78, %rd3, %rd77; add.s64 %rd79, %rd2, %rd77; add.s64 %rd80, %rd1, %rd77; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 %f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd79]; mul.f32 %f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd80]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, [%rd78]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; add.s64 %rd81, %rd76, %rd77; ld.global.f32 %f126, [%rd81]; fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd81], %f127; add.s64 %rd82, %rd75, %rd77; ld.global.f32 %f128, [%rd82]; fma.rn.f32 %f129, %f96, %f124, %f128; st.global.f32 [%rd82], %f129; add.s64 %rd83, %rd74, %rd77; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], %f131; BB0_78: ret; } ` addzhanglitorque2_ptx_37 = ` .version 6.5 .target sm_37 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 
addzhanglitorque2_param_5, .param .u64 addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<15>; .reg .f32 %f<149>; .reg .b32 %r<182>; .reg .f64 %fd<5>; .reg .b64 %rd<84>; ld.param.u64 %rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, [addzhanglitorque2_param_3]; ld.param.u64 %rd15, [addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, [addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, 
[addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, [addzhanglitorque2_param_23]; ld.param.u32 %r68, [addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; ld.param.u8 %rs4, [addzhanglitorque2_param_26]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 %r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r70, %r71, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r73, %r74, %r75; mov.u32 %r76, %ntid.z; mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r76, %r77, %r78; setp.ge.s32 %p1, %r2, %r68; setp.ge.s32 %p2, %r1, %r67; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, %p4; @%p5 bra BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_3; cvta.to.global.u64 %rd17, %rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, %f67, %f132; BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_5; cvta.to.global.u64 %rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, [%rd22]; mul.f32 %f133, %f68, %f133; BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra BB0_7; cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra BB0_11; rcp.rn.f32 %f136, %f135; BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; 
cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra BB0_13; cvta.to.global.u64 %rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; mul.f32 %f138, %f74, %f138; BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, %r6, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; mov.f32 %f143, 0f00000000; setp.eq.f32 %p14, %f20, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra BB0_20; rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r164, %r81, %r67; bra.uni BB0_21; BB0_20: add.s32 %r82, %r67, -1; min.s32 %r164, %r7, %r82; BB0_21: add.s32 %r83, %r164, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra BB0_23; rem.s32 %r84, %r11, %r67; add.s32 %r85, %r84, %r67; rem.s32 %r165, %r85, %r67; bra.uni BB0_24; BB0_23: mov.u32 %r86, 0; max.s32 %r165, %r11, %r86; BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 %f22, %f20, %f79; add.s32 %r87, %r165, %r5; mul.wide.s32 %rd40, %r87, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra BB0_26; rem.s32 %r88, %r7, %r67; add.s32 %r89, %r88, %r67; rem.s32 %r166, %r89, %r67; bra.uni BB0_27; BB0_26: add.s32 %r90, %r67, -1; min.s32 %r166, %r7, %r90; BB0_27: add.s32 %r91, %r166, %r5; mul.wide.s32 %rd42, %r91, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra BB0_29; rem.s32 %r92, %r11, %r67; add.s32 
%r93, %r92, %r67; rem.s32 %r167, %r93, %r67; bra.uni BB0_30; BB0_29: mov.u32 %r94, 0; max.s32 %r167, %r11, %r94; BB0_30: add.s32 %r95, %r167, %r5; mul.wide.s32 %rd44, %r95, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, [%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra BB0_32; rem.s32 %r96, %r7, %r67; add.s32 %r97, %r96, %r67; rem.s32 %r168, %r97, %r67; bra.uni BB0_33; BB0_32: add.s32 %r98, %r67, -1; min.s32 %r168, %r7, %r98; BB0_33: add.s32 %r99, %r168, %r5; mul.wide.s32 %rd46, %r99, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra BB0_35; rem.s32 %r100, %r11, %r67; add.s32 %r101, %r100, %r67; rem.s32 %r169, %r101, %r67; bra.uni BB0_36; BB0_35: mov.u32 %r102, 0; max.s32 %r169, %r11, %r102; BB0_36: add.s32 %r103, %r169, %r5; mul.wide.s32 %rd48, %r103, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, %f22, %f83, 0f00000000; BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 bra BB0_40; rem.s32 %r104, %r27, %r68; add.s32 %r105, %r104, %r68; rem.s32 %r170, %r105, %r68; bra.uni BB0_41; BB0_40: add.s32 %r106, %r68, -1; min.s32 %r170, %r27, %r106; BB0_41: add.s32 %r107, %r170, %r4; mad.lo.s32 %r108, %r107, %r67, %r1; mul.wide.s32 %rd50, %r108, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, -1; @%p22 bra BB0_43; rem.s32 %r109, %r31, %r68; add.s32 %r110, %r109, %r68; rem.s32 %r171, %r110, %r68; bra.uni BB0_44; BB0_43: mov.u32 %r111, 0; max.s32 %r171, %r31, %r111; BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f18, %f84; add.s32 %r112, %r171, %r4; mad.lo.s32 %r113, %r112, %r67, %r1; mul.wide.s32 %rd52, %r113, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra BB0_46; rem.s32 %r114, %r27, %r68; add.s32 %r115, %r114, %r68; rem.s32 %r172, 
%r115, %r68; bra.uni BB0_47; BB0_46: add.s32 %r116, %r68, -1; min.s32 %r172, %r27, %r116; BB0_47: add.s32 %r117, %r172, %r4; mad.lo.s32 %r118, %r117, %r67, %r1; mul.wide.s32 %rd54, %r118, 4; add.s64 %rd55, %rd2, %rd54; ld.global.nc.f32 %f36, [%rd55]; @%p22 bra BB0_49; rem.s32 %r119, %r31, %r68; add.s32 %r120, %r119, %r68; rem.s32 %r173, %r120, %r68; bra.uni BB0_50; BB0_49: mov.u32 %r121, 0; max.s32 %r173, %r31, %r121; BB0_50: add.s32 %r122, %r173, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd56, %r123, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, %f36, %f86; @%p22 bra BB0_52; rem.s32 %r124, %r27, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r174, %r125, %r68; bra.uni BB0_53; BB0_52: add.s32 %r126, %r68, -1; min.s32 %r174, %r27, %r126; BB0_53: add.s32 %r127, %r174, %r4; mad.lo.s32 %r128, %r127, %r67, %r1; mul.wide.s32 %rd58, %r128, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, [%rd59]; @%p22 bra BB0_55; rem.s32 %r129, %r31, %r68; add.s32 %r130, %r129, %r68; rem.s32 %r175, %r130, %r68; bra.uni BB0_56; BB0_55: mov.u32 %r131, 0; max.s32 %r175, %r31, %r131; BB0_56: add.s32 %r132, %r175, %r4; mad.lo.s32 %r133, %r132, %r67, %r1; mul.wide.s32 %rd60, %r133, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 %f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra BB0_77; div.rn.f32 %f89, %f11, %f66; mul.f32 %f45, %f19, %f89; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra BB0_60; rem.s32 %r134, %r47, %r69; add.s32 %r135, %r134, %r69; rem.s32 %r176, %r135, %r69; bra.uni BB0_61; BB0_60: add.s32 %r136, %r69, -1; min.s32 %r176, %r47, %r136; BB0_61: mad.lo.s32 %r137, %r176, %r68, %r2; mad.lo.s32 %r138, %r137, %r67, %r1; mul.wide.s32 %rd62, %r138, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra BB0_63; 
rem.s32 %r139, %r51, %r69; add.s32 %r140, %r139, %r69; rem.s32 %r177, %r140, %r69; bra.uni BB0_64; BB0_63: mov.u32 %r141, 0; max.s32 %r177, %r51, %r141; BB0_64: mad.lo.s32 %r142, %r177, %r68, %r2; mad.lo.s32 %r143, %r142, %r67, %r1; mul.wide.s32 %rd64, %r143, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra BB0_66; rem.s32 %r144, %r47, %r69; add.s32 %r145, %r144, %r69; rem.s32 %r178, %r145, %r69; bra.uni BB0_67; BB0_66: add.s32 %r146, %r69, -1; min.s32 %r178, %r47, %r146; BB0_67: mad.lo.s32 %r147, %r178, %r68, %r2; mad.lo.s32 %r148, %r147, %r67, %r1; mul.wide.s32 %rd66, %r148, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra BB0_69; rem.s32 %r149, %r51, %r69; add.s32 %r150, %r149, %r69; rem.s32 %r179, %r150, %r69; bra.uni BB0_70; BB0_69: mov.u32 %r151, 0; max.s32 %r179, %r51, %r151; BB0_70: mad.lo.s32 %r152, %r179, %r68, %r2; mad.lo.s32 %r153, %r152, %r67, %r1; mul.wide.s32 %rd68, %r153, 4; add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra BB0_72; rem.s32 %r154, %r47, %r69; add.s32 %r155, %r154, %r69; rem.s32 %r180, %r155, %r69; bra.uni BB0_73; BB0_72: add.s32 %r156, %r69, -1; min.s32 %r180, %r47, %r156; BB0_73: mad.lo.s32 %r157, %r180, %r68, %r2; mad.lo.s32 %r158, %r157, %r67, %r1; mul.wide.s32 %rd70, %r158, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra BB0_75; rem.s32 %r159, %r51, %r69; add.s32 %r160, %r159, %r69; rem.s32 %r181, %r160, %r69; bra.uni BB0_76; BB0_75: mov.u32 %r161, 0; max.s32 %r181, %r51, %r161; BB0_76: mad.lo.s32 %r162, %r181, %r68, %r2; mad.lo.s32 %r163, %r162, %r67, %r1; mul.wide.s32 %rd72, %r163, 4; add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; BB0_77: cvta.to.global.u64 %rd74, %rd6; cvta.to.global.u64 %rd75, %rd5; cvta.to.global.u64 %rd76, %rd4; 
mul.wide.s32 %rd77, %r6, 4; add.s64 %rd78, %rd3, %rd77; add.s64 %rd79, %rd2, %rd77; add.s64 %rd80, %rd1, %rd77; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 %f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd79]; mul.f32 %f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd80]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, [%rd78]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; add.s64 %rd81, %rd76, %rd77; ld.global.f32 %f126, [%rd81]; fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd81], %f127; add.s64 %rd82, %rd75, %rd77; ld.global.f32 %f128, [%rd82]; fma.rn.f32 %f129, %f96, %f124, %f128; st.global.f32 [%rd82], %f129; add.s64 %rd83, %rd74, %rd77; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], %f131; BB0_78: ret; } ` addzhanglitorque2_ptx_50 = ` .version 6.5 .target sm_50 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 
addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<15>; .reg .f32 %f<149>; .reg .b32 %r<182>; .reg .f64 %fd<5>; .reg .b64 %rd<84>; ld.param.u64 %rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, [addzhanglitorque2_param_3]; ld.param.u64 %rd15, [addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, [addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, [addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, 
[addzhanglitorque2_param_23]; ld.param.u32 %r68, [addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; ld.param.u8 %rs4, [addzhanglitorque2_param_26]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 %r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r70, %r71, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r73, %r74, %r75; mov.u32 %r76, %ntid.z; mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r76, %r77, %r78; setp.ge.s32 %p1, %r2, %r68; setp.ge.s32 %p2, %r1, %r67; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, %p4; @%p5 bra BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_3; cvta.to.global.u64 %rd17, %rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, %f67, %f132; BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_5; cvta.to.global.u64 %rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, [%rd22]; mul.f32 %f133, %f68, %f133; BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra BB0_7; cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra BB0_11; rcp.rn.f32 %f136, %f135; BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra BB0_13; cvta.to.global.u64 
%rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; mul.f32 %f138, %f74, %f138; BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, %r6, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; mov.f32 %f143, 0f00000000; setp.eq.f32 %p14, %f20, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra BB0_20; rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r164, %r81, %r67; bra.uni BB0_21; BB0_20: add.s32 %r82, %r67, -1; min.s32 %r164, %r7, %r82; BB0_21: add.s32 %r83, %r164, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra BB0_23; rem.s32 %r84, %r11, %r67; add.s32 %r85, %r84, %r67; rem.s32 %r165, %r85, %r67; bra.uni BB0_24; BB0_23: mov.u32 %r86, 0; max.s32 %r165, %r11, %r86; BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 %f22, %f20, %f79; add.s32 %r87, %r165, %r5; mul.wide.s32 %rd40, %r87, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra BB0_26; rem.s32 %r88, %r7, %r67; add.s32 %r89, %r88, %r67; rem.s32 %r166, %r89, %r67; bra.uni BB0_27; BB0_26: add.s32 %r90, %r67, -1; min.s32 %r166, %r7, %r90; BB0_27: add.s32 %r91, %r166, %r5; mul.wide.s32 %rd42, %r91, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra BB0_29; rem.s32 %r92, %r11, %r67; add.s32 %r93, %r92, %r67; rem.s32 %r167, %r93, %r67; bra.uni BB0_30; BB0_29: mov.u32 %r94, 0; max.s32 %r167, %r11, %r94; BB0_30: add.s32 %r95, %r167, 
%r5; mul.wide.s32 %rd44, %r95, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, [%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra BB0_32; rem.s32 %r96, %r7, %r67; add.s32 %r97, %r96, %r67; rem.s32 %r168, %r97, %r67; bra.uni BB0_33; BB0_32: add.s32 %r98, %r67, -1; min.s32 %r168, %r7, %r98; BB0_33: add.s32 %r99, %r168, %r5; mul.wide.s32 %rd46, %r99, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra BB0_35; rem.s32 %r100, %r11, %r67; add.s32 %r101, %r100, %r67; rem.s32 %r169, %r101, %r67; bra.uni BB0_36; BB0_35: mov.u32 %r102, 0; max.s32 %r169, %r11, %r102; BB0_36: add.s32 %r103, %r169, %r5; mul.wide.s32 %rd48, %r103, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, %f22, %f83, 0f00000000; BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 bra BB0_40; rem.s32 %r104, %r27, %r68; add.s32 %r105, %r104, %r68; rem.s32 %r170, %r105, %r68; bra.uni BB0_41; BB0_40: add.s32 %r106, %r68, -1; min.s32 %r170, %r27, %r106; BB0_41: add.s32 %r107, %r170, %r4; mad.lo.s32 %r108, %r107, %r67, %r1; mul.wide.s32 %rd50, %r108, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, -1; @%p22 bra BB0_43; rem.s32 %r109, %r31, %r68; add.s32 %r110, %r109, %r68; rem.s32 %r171, %r110, %r68; bra.uni BB0_44; BB0_43: mov.u32 %r111, 0; max.s32 %r171, %r31, %r111; BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f18, %f84; add.s32 %r112, %r171, %r4; mad.lo.s32 %r113, %r112, %r67, %r1; mul.wide.s32 %rd52, %r113, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra BB0_46; rem.s32 %r114, %r27, %r68; add.s32 %r115, %r114, %r68; rem.s32 %r172, %r115, %r68; bra.uni BB0_47; BB0_46: add.s32 %r116, %r68, -1; min.s32 %r172, %r27, %r116; BB0_47: add.s32 %r117, %r172, %r4; mad.lo.s32 
%r118, %r117, %r67, %r1; mul.wide.s32 %rd54, %r118, 4; add.s64 %rd55, %rd2, %rd54; ld.global.nc.f32 %f36, [%rd55]; @%p22 bra BB0_49; rem.s32 %r119, %r31, %r68; add.s32 %r120, %r119, %r68; rem.s32 %r173, %r120, %r68; bra.uni BB0_50; BB0_49: mov.u32 %r121, 0; max.s32 %r173, %r31, %r121; BB0_50: add.s32 %r122, %r173, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd56, %r123, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, %f36, %f86; @%p22 bra BB0_52; rem.s32 %r124, %r27, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r174, %r125, %r68; bra.uni BB0_53; BB0_52: add.s32 %r126, %r68, -1; min.s32 %r174, %r27, %r126; BB0_53: add.s32 %r127, %r174, %r4; mad.lo.s32 %r128, %r127, %r67, %r1; mul.wide.s32 %rd58, %r128, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, [%rd59]; @%p22 bra BB0_55; rem.s32 %r129, %r31, %r68; add.s32 %r130, %r129, %r68; rem.s32 %r175, %r130, %r68; bra.uni BB0_56; BB0_55: mov.u32 %r131, 0; max.s32 %r175, %r31, %r131; BB0_56: add.s32 %r132, %r175, %r4; mad.lo.s32 %r133, %r132, %r67, %r1; mul.wide.s32 %rd60, %r133, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 %f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra BB0_77; div.rn.f32 %f89, %f11, %f66; mul.f32 %f45, %f19, %f89; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra BB0_60; rem.s32 %r134, %r47, %r69; add.s32 %r135, %r134, %r69; rem.s32 %r176, %r135, %r69; bra.uni BB0_61; BB0_60: add.s32 %r136, %r69, -1; min.s32 %r176, %r47, %r136; BB0_61: mad.lo.s32 %r137, %r176, %r68, %r2; mad.lo.s32 %r138, %r137, %r67, %r1; mul.wide.s32 %rd62, %r138, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra BB0_63; rem.s32 %r139, %r51, %r69; add.s32 %r140, %r139, %r69; rem.s32 %r177, %r140, %r69; bra.uni BB0_64; BB0_63: mov.u32 %r141, 0; max.s32 %r177, 
%r51, %r141; BB0_64: mad.lo.s32 %r142, %r177, %r68, %r2; mad.lo.s32 %r143, %r142, %r67, %r1; mul.wide.s32 %rd64, %r143, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra BB0_66; rem.s32 %r144, %r47, %r69; add.s32 %r145, %r144, %r69; rem.s32 %r178, %r145, %r69; bra.uni BB0_67; BB0_66: add.s32 %r146, %r69, -1; min.s32 %r178, %r47, %r146; BB0_67: mad.lo.s32 %r147, %r178, %r68, %r2; mad.lo.s32 %r148, %r147, %r67, %r1; mul.wide.s32 %rd66, %r148, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra BB0_69; rem.s32 %r149, %r51, %r69; add.s32 %r150, %r149, %r69; rem.s32 %r179, %r150, %r69; bra.uni BB0_70; BB0_69: mov.u32 %r151, 0; max.s32 %r179, %r51, %r151; BB0_70: mad.lo.s32 %r152, %r179, %r68, %r2; mad.lo.s32 %r153, %r152, %r67, %r1; mul.wide.s32 %rd68, %r153, 4; add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra BB0_72; rem.s32 %r154, %r47, %r69; add.s32 %r155, %r154, %r69; rem.s32 %r180, %r155, %r69; bra.uni BB0_73; BB0_72: add.s32 %r156, %r69, -1; min.s32 %r180, %r47, %r156; BB0_73: mad.lo.s32 %r157, %r180, %r68, %r2; mad.lo.s32 %r158, %r157, %r67, %r1; mul.wide.s32 %rd70, %r158, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra BB0_75; rem.s32 %r159, %r51, %r69; add.s32 %r160, %r159, %r69; rem.s32 %r181, %r160, %r69; bra.uni BB0_76; BB0_75: mov.u32 %r161, 0; max.s32 %r181, %r51, %r161; BB0_76: mad.lo.s32 %r162, %r181, %r68, %r2; mad.lo.s32 %r163, %r162, %r67, %r1; mul.wide.s32 %rd72, %r163, 4; add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; BB0_77: cvta.to.global.u64 %rd74, %rd6; cvta.to.global.u64 %rd75, %rd5; cvta.to.global.u64 %rd76, %rd4; mul.wide.s32 %rd77, %r6, 4; add.s64 %rd78, %rd3, %rd77; add.s64 %rd79, %rd2, %rd77; add.s64 %rd80, %rd1, %rd77; fma.rn.f32 %f94, %f132, %f132, 
0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 %f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd79]; mul.f32 %f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd80]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, [%rd78]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; add.s64 %rd81, %rd76, %rd77; ld.global.f32 %f126, [%rd81]; fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd81], %f127; add.s64 %rd82, %rd75, %rd77; ld.global.f32 %f128, [%rd82]; fma.rn.f32 %f129, %f96, %f124, %f128; st.global.f32 [%rd82], %f129; add.s64 %rd83, %rd74, %rd77; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], %f131; BB0_78: ret; } ` addzhanglitorque2_ptx_52 = ` .version 6.5 .target sm_52 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, 
.param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<15>; .reg .f32 %f<149>; .reg .b32 %r<182>; .reg .f64 %fd<5>; .reg .b64 %rd<84>; ld.param.u64 %rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, [addzhanglitorque2_param_3]; ld.param.u64 %rd15, [addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, [addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, [addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, [addzhanglitorque2_param_23]; ld.param.u32 %r68, [addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; ld.param.u8 %rs4, 
[addzhanglitorque2_param_26]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 %r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r70, %r71, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r73, %r74, %r75; mov.u32 %r76, %ntid.z; mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r76, %r77, %r78; setp.ge.s32 %p1, %r2, %r68; setp.ge.s32 %p2, %r1, %r67; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, %p4; @%p5 bra BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_3; cvta.to.global.u64 %rd17, %rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, %f67, %f132; BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_5; cvta.to.global.u64 %rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, [%rd22]; mul.f32 %f133, %f68, %f133; BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra BB0_7; cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra BB0_11; rcp.rn.f32 %f136, %f135; BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra BB0_13; cvta.to.global.u64 %rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; BB0_13: setp.eq.s64 
%p12, %rd9, 0; @%p12 bra BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; mul.f32 %f138, %f74, %f138; BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, %r6, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; mov.f32 %f143, 0f00000000; setp.eq.f32 %p14, %f20, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra BB0_20; rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r164, %r81, %r67; bra.uni BB0_21; BB0_20: add.s32 %r82, %r67, -1; min.s32 %r164, %r7, %r82; BB0_21: add.s32 %r83, %r164, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra BB0_23; rem.s32 %r84, %r11, %r67; add.s32 %r85, %r84, %r67; rem.s32 %r165, %r85, %r67; bra.uni BB0_24; BB0_23: mov.u32 %r86, 0; max.s32 %r165, %r11, %r86; BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 %f22, %f20, %f79; add.s32 %r87, %r165, %r5; mul.wide.s32 %rd40, %r87, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra BB0_26; rem.s32 %r88, %r7, %r67; add.s32 %r89, %r88, %r67; rem.s32 %r166, %r89, %r67; bra.uni BB0_27; BB0_26: add.s32 %r90, %r67, -1; min.s32 %r166, %r7, %r90; BB0_27: add.s32 %r91, %r166, %r5; mul.wide.s32 %rd42, %r91, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra BB0_29; rem.s32 %r92, %r11, %r67; add.s32 %r93, %r92, %r67; rem.s32 %r167, %r93, %r67; bra.uni BB0_30; BB0_29: mov.u32 %r94, 0; max.s32 %r167, %r11, %r94; BB0_30: add.s32 %r95, %r167, %r5; mul.wide.s32 %rd44, %r95, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, [%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra BB0_32; rem.s32 %r96, 
%r7, %r67; add.s32 %r97, %r96, %r67; rem.s32 %r168, %r97, %r67; bra.uni BB0_33; BB0_32: add.s32 %r98, %r67, -1; min.s32 %r168, %r7, %r98; BB0_33: add.s32 %r99, %r168, %r5; mul.wide.s32 %rd46, %r99, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra BB0_35; rem.s32 %r100, %r11, %r67; add.s32 %r101, %r100, %r67; rem.s32 %r169, %r101, %r67; bra.uni BB0_36; BB0_35: mov.u32 %r102, 0; max.s32 %r169, %r11, %r102; BB0_36: add.s32 %r103, %r169, %r5; mul.wide.s32 %rd48, %r103, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, %f22, %f83, 0f00000000; BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 bra BB0_40; rem.s32 %r104, %r27, %r68; add.s32 %r105, %r104, %r68; rem.s32 %r170, %r105, %r68; bra.uni BB0_41; BB0_40: add.s32 %r106, %r68, -1; min.s32 %r170, %r27, %r106; BB0_41: add.s32 %r107, %r170, %r4; mad.lo.s32 %r108, %r107, %r67, %r1; mul.wide.s32 %rd50, %r108, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, -1; @%p22 bra BB0_43; rem.s32 %r109, %r31, %r68; add.s32 %r110, %r109, %r68; rem.s32 %r171, %r110, %r68; bra.uni BB0_44; BB0_43: mov.u32 %r111, 0; max.s32 %r171, %r31, %r111; BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f18, %f84; add.s32 %r112, %r171, %r4; mad.lo.s32 %r113, %r112, %r67, %r1; mul.wide.s32 %rd52, %r113, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra BB0_46; rem.s32 %r114, %r27, %r68; add.s32 %r115, %r114, %r68; rem.s32 %r172, %r115, %r68; bra.uni BB0_47; BB0_46: add.s32 %r116, %r68, -1; min.s32 %r172, %r27, %r116; BB0_47: add.s32 %r117, %r172, %r4; mad.lo.s32 %r118, %r117, %r67, %r1; mul.wide.s32 %rd54, %r118, 4; add.s64 %rd55, %rd2, %rd54; ld.global.nc.f32 %f36, [%rd55]; @%p22 bra BB0_49; rem.s32 %r119, %r31, 
%r68; add.s32 %r120, %r119, %r68; rem.s32 %r173, %r120, %r68; bra.uni BB0_50; BB0_49: mov.u32 %r121, 0; max.s32 %r173, %r31, %r121; BB0_50: add.s32 %r122, %r173, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd56, %r123, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, %f36, %f86; @%p22 bra BB0_52; rem.s32 %r124, %r27, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r174, %r125, %r68; bra.uni BB0_53; BB0_52: add.s32 %r126, %r68, -1; min.s32 %r174, %r27, %r126; BB0_53: add.s32 %r127, %r174, %r4; mad.lo.s32 %r128, %r127, %r67, %r1; mul.wide.s32 %rd58, %r128, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, [%rd59]; @%p22 bra BB0_55; rem.s32 %r129, %r31, %r68; add.s32 %r130, %r129, %r68; rem.s32 %r175, %r130, %r68; bra.uni BB0_56; BB0_55: mov.u32 %r131, 0; max.s32 %r175, %r31, %r131; BB0_56: add.s32 %r132, %r175, %r4; mad.lo.s32 %r133, %r132, %r67, %r1; mul.wide.s32 %rd60, %r133, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 %f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra BB0_77; div.rn.f32 %f89, %f11, %f66; mul.f32 %f45, %f19, %f89; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra BB0_60; rem.s32 %r134, %r47, %r69; add.s32 %r135, %r134, %r69; rem.s32 %r176, %r135, %r69; bra.uni BB0_61; BB0_60: add.s32 %r136, %r69, -1; min.s32 %r176, %r47, %r136; BB0_61: mad.lo.s32 %r137, %r176, %r68, %r2; mad.lo.s32 %r138, %r137, %r67, %r1; mul.wide.s32 %rd62, %r138, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra BB0_63; rem.s32 %r139, %r51, %r69; add.s32 %r140, %r139, %r69; rem.s32 %r177, %r140, %r69; bra.uni BB0_64; BB0_63: mov.u32 %r141, 0; max.s32 %r177, %r51, %r141; BB0_64: mad.lo.s32 %r142, %r177, %r68, %r2; mad.lo.s32 %r143, %r142, %r67, %r1; mul.wide.s32 %rd64, %r143, 4; add.s64 %rd65, %rd3, %rd64; 
ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra BB0_66; rem.s32 %r144, %r47, %r69; add.s32 %r145, %r144, %r69; rem.s32 %r178, %r145, %r69; bra.uni BB0_67; BB0_66: add.s32 %r146, %r69, -1; min.s32 %r178, %r47, %r146; BB0_67: mad.lo.s32 %r147, %r178, %r68, %r2; mad.lo.s32 %r148, %r147, %r67, %r1; mul.wide.s32 %rd66, %r148, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra BB0_69; rem.s32 %r149, %r51, %r69; add.s32 %r150, %r149, %r69; rem.s32 %r179, %r150, %r69; bra.uni BB0_70; BB0_69: mov.u32 %r151, 0; max.s32 %r179, %r51, %r151; BB0_70: mad.lo.s32 %r152, %r179, %r68, %r2; mad.lo.s32 %r153, %r152, %r67, %r1; mul.wide.s32 %rd68, %r153, 4; add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra BB0_72; rem.s32 %r154, %r47, %r69; add.s32 %r155, %r154, %r69; rem.s32 %r180, %r155, %r69; bra.uni BB0_73; BB0_72: add.s32 %r156, %r69, -1; min.s32 %r180, %r47, %r156; BB0_73: mad.lo.s32 %r157, %r180, %r68, %r2; mad.lo.s32 %r158, %r157, %r67, %r1; mul.wide.s32 %rd70, %r158, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra BB0_75; rem.s32 %r159, %r51, %r69; add.s32 %r160, %r159, %r69; rem.s32 %r181, %r160, %r69; bra.uni BB0_76; BB0_75: mov.u32 %r161, 0; max.s32 %r181, %r51, %r161; BB0_76: mad.lo.s32 %r162, %r181, %r68, %r2; mad.lo.s32 %r163, %r162, %r67, %r1; mul.wide.s32 %rd72, %r163, 4; add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; BB0_77: cvta.to.global.u64 %rd74, %rd6; cvta.to.global.u64 %rd75, %rd5; cvta.to.global.u64 %rd76, %rd4; mul.wide.s32 %rd77, %r6, 4; add.s64 %rd78, %rd3, %rd77; add.s64 %rd79, %rd2, %rd77; add.s64 %rd80, %rd1, %rd77; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 %f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd79]; mul.f32 
%f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd80]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, [%rd78]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; add.s64 %rd81, %rd76, %rd77; ld.global.f32 %f126, [%rd81]; fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd81], %f127; add.s64 %rd82, %rd75, %rd77; ld.global.f32 %f128, [%rd82]; fma.rn.f32 %f129, %f96, %f124, %f128; st.global.f32 [%rd82], %f129; add.s64 %rd83, %rd74, %rd77; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], %f131; BB0_78: ret; } ` addzhanglitorque2_ptx_53 = ` .version 6.5 .target sm_53 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 
addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<15>; .reg .f32 %f<149>; .reg .b32 %r<182>; .reg .f64 %fd<5>; .reg .b64 %rd<84>; ld.param.u64 %rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, [addzhanglitorque2_param_3]; ld.param.u64 %rd15, [addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, [addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, [addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, [addzhanglitorque2_param_23]; ld.param.u32 %r68, [addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; ld.param.u8 %rs4, [addzhanglitorque2_param_26]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 
%r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r70, %r71, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r73, %r74, %r75; mov.u32 %r76, %ntid.z; mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r76, %r77, %r78; setp.ge.s32 %p1, %r2, %r68; setp.ge.s32 %p2, %r1, %r67; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, %p4; @%p5 bra BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_3; cvta.to.global.u64 %rd17, %rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, %f67, %f132; BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_5; cvta.to.global.u64 %rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, [%rd22]; mul.f32 %f133, %f68, %f133; BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra BB0_7; cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra BB0_11; rcp.rn.f32 %f136, %f135; BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra BB0_13; cvta.to.global.u64 %rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; 
mul.f32 %f138, %f74, %f138; BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, %r6, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; mov.f32 %f143, 0f00000000; setp.eq.f32 %p14, %f20, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra BB0_20; rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r164, %r81, %r67; bra.uni BB0_21; BB0_20: add.s32 %r82, %r67, -1; min.s32 %r164, %r7, %r82; BB0_21: add.s32 %r83, %r164, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra BB0_23; rem.s32 %r84, %r11, %r67; add.s32 %r85, %r84, %r67; rem.s32 %r165, %r85, %r67; bra.uni BB0_24; BB0_23: mov.u32 %r86, 0; max.s32 %r165, %r11, %r86; BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 %f22, %f20, %f79; add.s32 %r87, %r165, %r5; mul.wide.s32 %rd40, %r87, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra BB0_26; rem.s32 %r88, %r7, %r67; add.s32 %r89, %r88, %r67; rem.s32 %r166, %r89, %r67; bra.uni BB0_27; BB0_26: add.s32 %r90, %r67, -1; min.s32 %r166, %r7, %r90; BB0_27: add.s32 %r91, %r166, %r5; mul.wide.s32 %rd42, %r91, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra BB0_29; rem.s32 %r92, %r11, %r67; add.s32 %r93, %r92, %r67; rem.s32 %r167, %r93, %r67; bra.uni BB0_30; BB0_29: mov.u32 %r94, 0; max.s32 %r167, %r11, %r94; BB0_30: add.s32 %r95, %r167, %r5; mul.wide.s32 %rd44, %r95, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, [%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra BB0_32; rem.s32 %r96, %r7, %r67; add.s32 %r97, %r96, %r67; rem.s32 %r168, %r97, %r67; bra.uni BB0_33; BB0_32: add.s32 %r98, %r67, -1; min.s32 %r168, %r7, %r98; BB0_33: add.s32 
%r99, %r168, %r5; mul.wide.s32 %rd46, %r99, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra BB0_35; rem.s32 %r100, %r11, %r67; add.s32 %r101, %r100, %r67; rem.s32 %r169, %r101, %r67; bra.uni BB0_36; BB0_35: mov.u32 %r102, 0; max.s32 %r169, %r11, %r102; BB0_36: add.s32 %r103, %r169, %r5; mul.wide.s32 %rd48, %r103, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, %f22, %f83, 0f00000000; BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 bra BB0_40; rem.s32 %r104, %r27, %r68; add.s32 %r105, %r104, %r68; rem.s32 %r170, %r105, %r68; bra.uni BB0_41; BB0_40: add.s32 %r106, %r68, -1; min.s32 %r170, %r27, %r106; BB0_41: add.s32 %r107, %r170, %r4; mad.lo.s32 %r108, %r107, %r67, %r1; mul.wide.s32 %rd50, %r108, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, -1; @%p22 bra BB0_43; rem.s32 %r109, %r31, %r68; add.s32 %r110, %r109, %r68; rem.s32 %r171, %r110, %r68; bra.uni BB0_44; BB0_43: mov.u32 %r111, 0; max.s32 %r171, %r31, %r111; BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f18, %f84; add.s32 %r112, %r171, %r4; mad.lo.s32 %r113, %r112, %r67, %r1; mul.wide.s32 %rd52, %r113, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra BB0_46; rem.s32 %r114, %r27, %r68; add.s32 %r115, %r114, %r68; rem.s32 %r172, %r115, %r68; bra.uni BB0_47; BB0_46: add.s32 %r116, %r68, -1; min.s32 %r172, %r27, %r116; BB0_47: add.s32 %r117, %r172, %r4; mad.lo.s32 %r118, %r117, %r67, %r1; mul.wide.s32 %rd54, %r118, 4; add.s64 %rd55, %rd2, %rd54; ld.global.nc.f32 %f36, [%rd55]; @%p22 bra BB0_49; rem.s32 %r119, %r31, %r68; add.s32 %r120, %r119, %r68; rem.s32 %r173, %r120, %r68; bra.uni BB0_50; BB0_49: mov.u32 %r121, 0; max.s32 %r173, %r31, %r121; BB0_50: add.s32 %r122, 
%r173, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd56, %r123, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, %f36, %f86; @%p22 bra BB0_52; rem.s32 %r124, %r27, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r174, %r125, %r68; bra.uni BB0_53; BB0_52: add.s32 %r126, %r68, -1; min.s32 %r174, %r27, %r126; BB0_53: add.s32 %r127, %r174, %r4; mad.lo.s32 %r128, %r127, %r67, %r1; mul.wide.s32 %rd58, %r128, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, [%rd59]; @%p22 bra BB0_55; rem.s32 %r129, %r31, %r68; add.s32 %r130, %r129, %r68; rem.s32 %r175, %r130, %r68; bra.uni BB0_56; BB0_55: mov.u32 %r131, 0; max.s32 %r175, %r31, %r131; BB0_56: add.s32 %r132, %r175, %r4; mad.lo.s32 %r133, %r132, %r67, %r1; mul.wide.s32 %rd60, %r133, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 %f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra BB0_77; div.rn.f32 %f89, %f11, %f66; mul.f32 %f45, %f19, %f89; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra BB0_60; rem.s32 %r134, %r47, %r69; add.s32 %r135, %r134, %r69; rem.s32 %r176, %r135, %r69; bra.uni BB0_61; BB0_60: add.s32 %r136, %r69, -1; min.s32 %r176, %r47, %r136; BB0_61: mad.lo.s32 %r137, %r176, %r68, %r2; mad.lo.s32 %r138, %r137, %r67, %r1; mul.wide.s32 %rd62, %r138, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra BB0_63; rem.s32 %r139, %r51, %r69; add.s32 %r140, %r139, %r69; rem.s32 %r177, %r140, %r69; bra.uni BB0_64; BB0_63: mov.u32 %r141, 0; max.s32 %r177, %r51, %r141; BB0_64: mad.lo.s32 %r142, %r177, %r68, %r2; mad.lo.s32 %r143, %r142, %r67, %r1; mul.wide.s32 %rd64, %r143, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra BB0_66; rem.s32 %r144, %r47, %r69; add.s32 %r145, %r144, %r69; rem.s32 %r178, %r145, %r69; 
bra.uni BB0_67; BB0_66: add.s32 %r146, %r69, -1; min.s32 %r178, %r47, %r146; BB0_67: mad.lo.s32 %r147, %r178, %r68, %r2; mad.lo.s32 %r148, %r147, %r67, %r1; mul.wide.s32 %rd66, %r148, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra BB0_69; rem.s32 %r149, %r51, %r69; add.s32 %r150, %r149, %r69; rem.s32 %r179, %r150, %r69; bra.uni BB0_70; BB0_69: mov.u32 %r151, 0; max.s32 %r179, %r51, %r151; BB0_70: mad.lo.s32 %r152, %r179, %r68, %r2; mad.lo.s32 %r153, %r152, %r67, %r1; mul.wide.s32 %rd68, %r153, 4; add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra BB0_72; rem.s32 %r154, %r47, %r69; add.s32 %r155, %r154, %r69; rem.s32 %r180, %r155, %r69; bra.uni BB0_73; BB0_72: add.s32 %r156, %r69, -1; min.s32 %r180, %r47, %r156; BB0_73: mad.lo.s32 %r157, %r180, %r68, %r2; mad.lo.s32 %r158, %r157, %r67, %r1; mul.wide.s32 %rd70, %r158, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra BB0_75; rem.s32 %r159, %r51, %r69; add.s32 %r160, %r159, %r69; rem.s32 %r181, %r160, %r69; bra.uni BB0_76; BB0_75: mov.u32 %r161, 0; max.s32 %r181, %r51, %r161; BB0_76: mad.lo.s32 %r162, %r181, %r68, %r2; mad.lo.s32 %r163, %r162, %r67, %r1; mul.wide.s32 %rd72, %r163, 4; add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; BB0_77: cvta.to.global.u64 %rd74, %rd6; cvta.to.global.u64 %rd75, %rd5; cvta.to.global.u64 %rd76, %rd4; mul.wide.s32 %rd77, %r6, 4; add.s64 %rd78, %rd3, %rd77; add.s64 %rd79, %rd2, %rd77; add.s64 %rd80, %rd1, %rd77; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 %f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd79]; mul.f32 %f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd80]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, 
[%rd78]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; add.s64 %rd81, %rd76, %rd77; ld.global.f32 %f126, [%rd81]; fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd81], %f127; add.s64 %rd82, %rd75, %rd77; ld.global.f32 %f128, [%rd82]; fma.rn.f32 %f129, %f96, %f124, %f128; st.global.f32 [%rd82], %f129; add.s64 %rd83, %rd74, %rd77; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], %f131; BB0_78: ret; } ` addzhanglitorque2_ptx_60 = ` .version 6.5 .target sm_60 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 
addzhanglitorque2_param_20, .param .f32 addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<15>; .reg .f32 %f<149>; .reg .b32 %r<182>; .reg .f64 %fd<5>; .reg .b64 %rd<84>; ld.param.u64 %rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, [addzhanglitorque2_param_3]; ld.param.u64 %rd15, [addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, [addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, [addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, [addzhanglitorque2_param_23]; ld.param.u32 %r68, [addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; ld.param.u8 %rs4, [addzhanglitorque2_param_26]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 %r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r70, %r71, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r73, 
%r74, %r75; mov.u32 %r76, %ntid.z; mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r76, %r77, %r78; setp.ge.s32 %p1, %r2, %r68; setp.ge.s32 %p2, %r1, %r67; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, %p4; @%p5 bra BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_3; cvta.to.global.u64 %rd17, %rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, %f67, %f132; BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_5; cvta.to.global.u64 %rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, [%rd22]; mul.f32 %f133, %f68, %f133; BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra BB0_7; cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra BB0_11; rcp.rn.f32 %f136, %f135; BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra BB0_13; cvta.to.global.u64 %rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; mul.f32 %f138, %f74, %f138; BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, %r6, 4; add.s64 %rd37, %rd35, 
%rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; mov.f32 %f143, 0f00000000; setp.eq.f32 %p14, %f20, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra BB0_20; rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r164, %r81, %r67; bra.uni BB0_21; BB0_20: add.s32 %r82, %r67, -1; min.s32 %r164, %r7, %r82; BB0_21: add.s32 %r83, %r164, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra BB0_23; rem.s32 %r84, %r11, %r67; add.s32 %r85, %r84, %r67; rem.s32 %r165, %r85, %r67; bra.uni BB0_24; BB0_23: mov.u32 %r86, 0; max.s32 %r165, %r11, %r86; BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 %f22, %f20, %f79; add.s32 %r87, %r165, %r5; mul.wide.s32 %rd40, %r87, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra BB0_26; rem.s32 %r88, %r7, %r67; add.s32 %r89, %r88, %r67; rem.s32 %r166, %r89, %r67; bra.uni BB0_27; BB0_26: add.s32 %r90, %r67, -1; min.s32 %r166, %r7, %r90; BB0_27: add.s32 %r91, %r166, %r5; mul.wide.s32 %rd42, %r91, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra BB0_29; rem.s32 %r92, %r11, %r67; add.s32 %r93, %r92, %r67; rem.s32 %r167, %r93, %r67; bra.uni BB0_30; BB0_29: mov.u32 %r94, 0; max.s32 %r167, %r11, %r94; BB0_30: add.s32 %r95, %r167, %r5; mul.wide.s32 %rd44, %r95, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, [%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra BB0_32; rem.s32 %r96, %r7, %r67; add.s32 %r97, %r96, %r67; rem.s32 %r168, %r97, %r67; bra.uni BB0_33; BB0_32: add.s32 %r98, %r67, -1; min.s32 %r168, %r7, %r98; BB0_33: add.s32 %r99, %r168, %r5; mul.wide.s32 %rd46, %r99, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra BB0_35; rem.s32 %r100, %r11, %r67; add.s32 %r101, 
%r100, %r67; rem.s32 %r169, %r101, %r67; bra.uni BB0_36; BB0_35: mov.u32 %r102, 0; max.s32 %r169, %r11, %r102; BB0_36: add.s32 %r103, %r169, %r5; mul.wide.s32 %rd48, %r103, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, %f22, %f83, 0f00000000; BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 bra BB0_40; rem.s32 %r104, %r27, %r68; add.s32 %r105, %r104, %r68; rem.s32 %r170, %r105, %r68; bra.uni BB0_41; BB0_40: add.s32 %r106, %r68, -1; min.s32 %r170, %r27, %r106; BB0_41: add.s32 %r107, %r170, %r4; mad.lo.s32 %r108, %r107, %r67, %r1; mul.wide.s32 %rd50, %r108, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, -1; @%p22 bra BB0_43; rem.s32 %r109, %r31, %r68; add.s32 %r110, %r109, %r68; rem.s32 %r171, %r110, %r68; bra.uni BB0_44; BB0_43: mov.u32 %r111, 0; max.s32 %r171, %r31, %r111; BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f18, %f84; add.s32 %r112, %r171, %r4; mad.lo.s32 %r113, %r112, %r67, %r1; mul.wide.s32 %rd52, %r113, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra BB0_46; rem.s32 %r114, %r27, %r68; add.s32 %r115, %r114, %r68; rem.s32 %r172, %r115, %r68; bra.uni BB0_47; BB0_46: add.s32 %r116, %r68, -1; min.s32 %r172, %r27, %r116; BB0_47: add.s32 %r117, %r172, %r4; mad.lo.s32 %r118, %r117, %r67, %r1; mul.wide.s32 %rd54, %r118, 4; add.s64 %rd55, %rd2, %rd54; ld.global.nc.f32 %f36, [%rd55]; @%p22 bra BB0_49; rem.s32 %r119, %r31, %r68; add.s32 %r120, %r119, %r68; rem.s32 %r173, %r120, %r68; bra.uni BB0_50; BB0_49: mov.u32 %r121, 0; max.s32 %r173, %r31, %r121; BB0_50: add.s32 %r122, %r173, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd56, %r123, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, %f36, %f86; @%p22 
bra BB0_52; rem.s32 %r124, %r27, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r174, %r125, %r68; bra.uni BB0_53; BB0_52: add.s32 %r126, %r68, -1; min.s32 %r174, %r27, %r126; BB0_53: add.s32 %r127, %r174, %r4; mad.lo.s32 %r128, %r127, %r67, %r1; mul.wide.s32 %rd58, %r128, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, [%rd59]; @%p22 bra BB0_55; rem.s32 %r129, %r31, %r68; add.s32 %r130, %r129, %r68; rem.s32 %r175, %r130, %r68; bra.uni BB0_56; BB0_55: mov.u32 %r131, 0; max.s32 %r175, %r31, %r131; BB0_56: add.s32 %r132, %r175, %r4; mad.lo.s32 %r133, %r132, %r67, %r1; mul.wide.s32 %rd60, %r133, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 %f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra BB0_77; div.rn.f32 %f89, %f11, %f66; mul.f32 %f45, %f19, %f89; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra BB0_60; rem.s32 %r134, %r47, %r69; add.s32 %r135, %r134, %r69; rem.s32 %r176, %r135, %r69; bra.uni BB0_61; BB0_60: add.s32 %r136, %r69, -1; min.s32 %r176, %r47, %r136; BB0_61: mad.lo.s32 %r137, %r176, %r68, %r2; mad.lo.s32 %r138, %r137, %r67, %r1; mul.wide.s32 %rd62, %r138, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra BB0_63; rem.s32 %r139, %r51, %r69; add.s32 %r140, %r139, %r69; rem.s32 %r177, %r140, %r69; bra.uni BB0_64; BB0_63: mov.u32 %r141, 0; max.s32 %r177, %r51, %r141; BB0_64: mad.lo.s32 %r142, %r177, %r68, %r2; mad.lo.s32 %r143, %r142, %r67, %r1; mul.wide.s32 %rd64, %r143, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra BB0_66; rem.s32 %r144, %r47, %r69; add.s32 %r145, %r144, %r69; rem.s32 %r178, %r145, %r69; bra.uni BB0_67; BB0_66: add.s32 %r146, %r69, -1; min.s32 %r178, %r47, %r146; BB0_67: mad.lo.s32 %r147, %r178, %r68, %r2; mad.lo.s32 %r148, %r147, %r67, %r1; mul.wide.s32 
%rd66, %r148, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra BB0_69; rem.s32 %r149, %r51, %r69; add.s32 %r150, %r149, %r69; rem.s32 %r179, %r150, %r69; bra.uni BB0_70; BB0_69: mov.u32 %r151, 0; max.s32 %r179, %r51, %r151; BB0_70: mad.lo.s32 %r152, %r179, %r68, %r2; mad.lo.s32 %r153, %r152, %r67, %r1; mul.wide.s32 %rd68, %r153, 4; add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra BB0_72; rem.s32 %r154, %r47, %r69; add.s32 %r155, %r154, %r69; rem.s32 %r180, %r155, %r69; bra.uni BB0_73; BB0_72: add.s32 %r156, %r69, -1; min.s32 %r180, %r47, %r156; BB0_73: mad.lo.s32 %r157, %r180, %r68, %r2; mad.lo.s32 %r158, %r157, %r67, %r1; mul.wide.s32 %rd70, %r158, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra BB0_75; rem.s32 %r159, %r51, %r69; add.s32 %r160, %r159, %r69; rem.s32 %r181, %r160, %r69; bra.uni BB0_76; BB0_75: mov.u32 %r161, 0; max.s32 %r181, %r51, %r161; BB0_76: mad.lo.s32 %r162, %r181, %r68, %r2; mad.lo.s32 %r163, %r162, %r67, %r1; mul.wide.s32 %rd72, %r163, 4; add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; BB0_77: cvta.to.global.u64 %rd74, %rd6; cvta.to.global.u64 %rd75, %rd5; cvta.to.global.u64 %rd76, %rd4; mul.wide.s32 %rd77, %r6, 4; add.s64 %rd78, %rd3, %rd77; add.s64 %rd79, %rd2, %rd77; add.s64 %rd80, %rd1, %rd77; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 %f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd79]; mul.f32 %f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd80]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, [%rd78]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, 
%f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; add.s64 %rd81, %rd76, %rd77; ld.global.f32 %f126, [%rd81]; fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd81], %f127; add.s64 %rd82, %rd75, %rd77; ld.global.f32 %f128, [%rd82]; fma.rn.f32 %f129, %f96, %f124, %f128; st.global.f32 [%rd82], %f129; add.s64 %rd83, %rd74, %rd77; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], %f131; BB0_78: ret; } ` addzhanglitorque2_ptx_61 = ` .version 6.5 .target sm_61 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, 
.param .u32 addzhanglitorque2_param_25, .param .u8 addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<15>; .reg .f32 %f<149>; .reg .b32 %r<182>; .reg .f64 %fd<5>; .reg .b64 %rd<84>; ld.param.u64 %rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, [addzhanglitorque2_param_3]; ld.param.u64 %rd15, [addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, [addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, [addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, [addzhanglitorque2_param_23]; ld.param.u32 %r68, [addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; ld.param.u8 %rs4, [addzhanglitorque2_param_26]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 %r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r70, %r71, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r73, %r74, %r75; mov.u32 %r76, %ntid.z; mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r76, %r77, %r78; setp.ge.s32 %p1, %r2, %r68; setp.ge.s32 %p2, %r1, %r67; or.pred %p3, %p1, 
%p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, %p4; @%p5 bra BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_3; cvta.to.global.u64 %rd17, %rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, %f67, %f132; BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_5; cvta.to.global.u64 %rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, [%rd22]; mul.f32 %f133, %f68, %f133; BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra BB0_7; cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra BB0_11; rcp.rn.f32 %f136, %f135; BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra BB0_13; cvta.to.global.u64 %rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; mul.f32 %f138, %f74, %f138; BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, %r6, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; mov.f32 %f143, 0f00000000; 
setp.eq.f32 %p14, %f20, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra BB0_20; rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r164, %r81, %r67; bra.uni BB0_21; BB0_20: add.s32 %r82, %r67, -1; min.s32 %r164, %r7, %r82; BB0_21: add.s32 %r83, %r164, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra BB0_23; rem.s32 %r84, %r11, %r67; add.s32 %r85, %r84, %r67; rem.s32 %r165, %r85, %r67; bra.uni BB0_24; BB0_23: mov.u32 %r86, 0; max.s32 %r165, %r11, %r86; BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 %f22, %f20, %f79; add.s32 %r87, %r165, %r5; mul.wide.s32 %rd40, %r87, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra BB0_26; rem.s32 %r88, %r7, %r67; add.s32 %r89, %r88, %r67; rem.s32 %r166, %r89, %r67; bra.uni BB0_27; BB0_26: add.s32 %r90, %r67, -1; min.s32 %r166, %r7, %r90; BB0_27: add.s32 %r91, %r166, %r5; mul.wide.s32 %rd42, %r91, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra BB0_29; rem.s32 %r92, %r11, %r67; add.s32 %r93, %r92, %r67; rem.s32 %r167, %r93, %r67; bra.uni BB0_30; BB0_29: mov.u32 %r94, 0; max.s32 %r167, %r11, %r94; BB0_30: add.s32 %r95, %r167, %r5; mul.wide.s32 %rd44, %r95, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, [%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra BB0_32; rem.s32 %r96, %r7, %r67; add.s32 %r97, %r96, %r67; rem.s32 %r168, %r97, %r67; bra.uni BB0_33; BB0_32: add.s32 %r98, %r67, -1; min.s32 %r168, %r7, %r98; BB0_33: add.s32 %r99, %r168, %r5; mul.wide.s32 %rd46, %r99, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra BB0_35; rem.s32 %r100, %r11, %r67; add.s32 %r101, %r100, %r67; rem.s32 %r169, %r101, %r67; bra.uni BB0_36; BB0_35: mov.u32 %r102, 0; max.s32 %r169, %r11, %r102; BB0_36: add.s32 %r103, %r169, %r5; mul.wide.s32 %rd48, %r103, 4; add.s64 
%rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, %f22, %f83, 0f00000000; BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 bra BB0_40; rem.s32 %r104, %r27, %r68; add.s32 %r105, %r104, %r68; rem.s32 %r170, %r105, %r68; bra.uni BB0_41; BB0_40: add.s32 %r106, %r68, -1; min.s32 %r170, %r27, %r106; BB0_41: add.s32 %r107, %r170, %r4; mad.lo.s32 %r108, %r107, %r67, %r1; mul.wide.s32 %rd50, %r108, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, -1; @%p22 bra BB0_43; rem.s32 %r109, %r31, %r68; add.s32 %r110, %r109, %r68; rem.s32 %r171, %r110, %r68; bra.uni BB0_44; BB0_43: mov.u32 %r111, 0; max.s32 %r171, %r31, %r111; BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f18, %f84; add.s32 %r112, %r171, %r4; mad.lo.s32 %r113, %r112, %r67, %r1; mul.wide.s32 %rd52, %r113, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra BB0_46; rem.s32 %r114, %r27, %r68; add.s32 %r115, %r114, %r68; rem.s32 %r172, %r115, %r68; bra.uni BB0_47; BB0_46: add.s32 %r116, %r68, -1; min.s32 %r172, %r27, %r116; BB0_47: add.s32 %r117, %r172, %r4; mad.lo.s32 %r118, %r117, %r67, %r1; mul.wide.s32 %rd54, %r118, 4; add.s64 %rd55, %rd2, %rd54; ld.global.nc.f32 %f36, [%rd55]; @%p22 bra BB0_49; rem.s32 %r119, %r31, %r68; add.s32 %r120, %r119, %r68; rem.s32 %r173, %r120, %r68; bra.uni BB0_50; BB0_49: mov.u32 %r121, 0; max.s32 %r173, %r31, %r121; BB0_50: add.s32 %r122, %r173, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd56, %r123, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, %f36, %f86; @%p22 bra BB0_52; rem.s32 %r124, %r27, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r174, %r125, %r68; bra.uni BB0_53; BB0_52: add.s32 %r126, %r68, -1; min.s32 %r174, %r27, %r126; BB0_53: 
add.s32 %r127, %r174, %r4; mad.lo.s32 %r128, %r127, %r67, %r1; mul.wide.s32 %rd58, %r128, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, [%rd59]; @%p22 bra BB0_55; rem.s32 %r129, %r31, %r68; add.s32 %r130, %r129, %r68; rem.s32 %r175, %r130, %r68; bra.uni BB0_56; BB0_55: mov.u32 %r131, 0; max.s32 %r175, %r31, %r131; BB0_56: add.s32 %r132, %r175, %r4; mad.lo.s32 %r133, %r132, %r67, %r1; mul.wide.s32 %rd60, %r133, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 %f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra BB0_77; div.rn.f32 %f89, %f11, %f66; mul.f32 %f45, %f19, %f89; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra BB0_60; rem.s32 %r134, %r47, %r69; add.s32 %r135, %r134, %r69; rem.s32 %r176, %r135, %r69; bra.uni BB0_61; BB0_60: add.s32 %r136, %r69, -1; min.s32 %r176, %r47, %r136; BB0_61: mad.lo.s32 %r137, %r176, %r68, %r2; mad.lo.s32 %r138, %r137, %r67, %r1; mul.wide.s32 %rd62, %r138, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra BB0_63; rem.s32 %r139, %r51, %r69; add.s32 %r140, %r139, %r69; rem.s32 %r177, %r140, %r69; bra.uni BB0_64; BB0_63: mov.u32 %r141, 0; max.s32 %r177, %r51, %r141; BB0_64: mad.lo.s32 %r142, %r177, %r68, %r2; mad.lo.s32 %r143, %r142, %r67, %r1; mul.wide.s32 %rd64, %r143, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra BB0_66; rem.s32 %r144, %r47, %r69; add.s32 %r145, %r144, %r69; rem.s32 %r178, %r145, %r69; bra.uni BB0_67; BB0_66: add.s32 %r146, %r69, -1; min.s32 %r178, %r47, %r146; BB0_67: mad.lo.s32 %r147, %r178, %r68, %r2; mad.lo.s32 %r148, %r147, %r67, %r1; mul.wide.s32 %rd66, %r148, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra BB0_69; rem.s32 %r149, %r51, %r69; add.s32 %r150, %r149, %r69; rem.s32 %r179, %r150, %r69; 
bra.uni BB0_70; BB0_69: mov.u32 %r151, 0; max.s32 %r179, %r51, %r151; BB0_70: mad.lo.s32 %r152, %r179, %r68, %r2; mad.lo.s32 %r153, %r152, %r67, %r1; mul.wide.s32 %rd68, %r153, 4; add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra BB0_72; rem.s32 %r154, %r47, %r69; add.s32 %r155, %r154, %r69; rem.s32 %r180, %r155, %r69; bra.uni BB0_73; BB0_72: add.s32 %r156, %r69, -1; min.s32 %r180, %r47, %r156; BB0_73: mad.lo.s32 %r157, %r180, %r68, %r2; mad.lo.s32 %r158, %r157, %r67, %r1; mul.wide.s32 %rd70, %r158, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra BB0_75; rem.s32 %r159, %r51, %r69; add.s32 %r160, %r159, %r69; rem.s32 %r181, %r160, %r69; bra.uni BB0_76; BB0_75: mov.u32 %r161, 0; max.s32 %r181, %r51, %r161; BB0_76: mad.lo.s32 %r162, %r181, %r68, %r2; mad.lo.s32 %r163, %r162, %r67, %r1; mul.wide.s32 %rd72, %r163, 4; add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; BB0_77: cvta.to.global.u64 %rd74, %rd6; cvta.to.global.u64 %rd75, %rd5; cvta.to.global.u64 %rd76, %rd4; mul.wide.s32 %rd77, %r6, 4; add.s64 %rd78, %rd3, %rd77; add.s64 %rd79, %rd2, %rd77; add.s64 %rd80, %rd1, %rd77; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 %f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd79]; mul.f32 %f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd80]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, [%rd78]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; mul.f32 %f116, %f104, 
%f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; add.s64 %rd81, %rd76, %rd77; ld.global.f32 %f126, [%rd81]; fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd81], %f127; add.s64 %rd82, %rd75, %rd77; ld.global.f32 %f128, [%rd82]; fma.rn.f32 %f129, %f96, %f124, %f128; st.global.f32 [%rd82], %f129; add.s64 %rd83, %rd74, %rd77; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], %f131; BB0_78: ret; } ` addzhanglitorque2_ptx_62 = ` .version 6.5 .target sm_62 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<15>; .reg .f32 %f<149>; .reg .b32 %r<182>; .reg .f64 %fd<5>; .reg 
.b64 %rd<84>; ld.param.u64 %rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, [addzhanglitorque2_param_3]; ld.param.u64 %rd15, [addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, [addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, [addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, [addzhanglitorque2_param_23]; ld.param.u32 %r68, [addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; ld.param.u8 %rs4, [addzhanglitorque2_param_26]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 %r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r70, %r71, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r73, %r74, %r75; mov.u32 %r76, %ntid.z; mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r76, %r77, %r78; setp.ge.s32 %p1, %r2, %r68; setp.ge.s32 %p2, %r1, %r67; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, %p4; @%p5 bra BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; 
setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_3; cvta.to.global.u64 %rd17, %rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, %f67, %f132; BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_5; cvta.to.global.u64 %rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, [%rd22]; mul.f32 %f133, %f68, %f133; BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra BB0_7; cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra BB0_11; rcp.rn.f32 %f136, %f135; BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra BB0_13; cvta.to.global.u64 %rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; mul.f32 %f138, %f74, %f138; BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, %r6, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; mov.f32 %f143, 0f00000000; setp.eq.f32 %p14, %f20, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra 
BB0_20; rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r164, %r81, %r67; bra.uni BB0_21; BB0_20: add.s32 %r82, %r67, -1; min.s32 %r164, %r7, %r82; BB0_21: add.s32 %r83, %r164, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra BB0_23; rem.s32 %r84, %r11, %r67; add.s32 %r85, %r84, %r67; rem.s32 %r165, %r85, %r67; bra.uni BB0_24; BB0_23: mov.u32 %r86, 0; max.s32 %r165, %r11, %r86; BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 %f22, %f20, %f79; add.s32 %r87, %r165, %r5; mul.wide.s32 %rd40, %r87, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra BB0_26; rem.s32 %r88, %r7, %r67; add.s32 %r89, %r88, %r67; rem.s32 %r166, %r89, %r67; bra.uni BB0_27; BB0_26: add.s32 %r90, %r67, -1; min.s32 %r166, %r7, %r90; BB0_27: add.s32 %r91, %r166, %r5; mul.wide.s32 %rd42, %r91, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra BB0_29; rem.s32 %r92, %r11, %r67; add.s32 %r93, %r92, %r67; rem.s32 %r167, %r93, %r67; bra.uni BB0_30; BB0_29: mov.u32 %r94, 0; max.s32 %r167, %r11, %r94; BB0_30: add.s32 %r95, %r167, %r5; mul.wide.s32 %rd44, %r95, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, [%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra BB0_32; rem.s32 %r96, %r7, %r67; add.s32 %r97, %r96, %r67; rem.s32 %r168, %r97, %r67; bra.uni BB0_33; BB0_32: add.s32 %r98, %r67, -1; min.s32 %r168, %r7, %r98; BB0_33: add.s32 %r99, %r168, %r5; mul.wide.s32 %rd46, %r99, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra BB0_35; rem.s32 %r100, %r11, %r67; add.s32 %r101, %r100, %r67; rem.s32 %r169, %r101, %r67; bra.uni BB0_36; BB0_35: mov.u32 %r102, 0; max.s32 %r169, %r11, %r102; BB0_36: add.s32 %r103, %r169, %r5; mul.wide.s32 %rd48, %r103, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, 
%f22, %f83, 0f00000000; BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 bra BB0_40; rem.s32 %r104, %r27, %r68; add.s32 %r105, %r104, %r68; rem.s32 %r170, %r105, %r68; bra.uni BB0_41; BB0_40: add.s32 %r106, %r68, -1; min.s32 %r170, %r27, %r106; BB0_41: add.s32 %r107, %r170, %r4; mad.lo.s32 %r108, %r107, %r67, %r1; mul.wide.s32 %rd50, %r108, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, -1; @%p22 bra BB0_43; rem.s32 %r109, %r31, %r68; add.s32 %r110, %r109, %r68; rem.s32 %r171, %r110, %r68; bra.uni BB0_44; BB0_43: mov.u32 %r111, 0; max.s32 %r171, %r31, %r111; BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f18, %f84; add.s32 %r112, %r171, %r4; mad.lo.s32 %r113, %r112, %r67, %r1; mul.wide.s32 %rd52, %r113, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra BB0_46; rem.s32 %r114, %r27, %r68; add.s32 %r115, %r114, %r68; rem.s32 %r172, %r115, %r68; bra.uni BB0_47; BB0_46: add.s32 %r116, %r68, -1; min.s32 %r172, %r27, %r116; BB0_47: add.s32 %r117, %r172, %r4; mad.lo.s32 %r118, %r117, %r67, %r1; mul.wide.s32 %rd54, %r118, 4; add.s64 %rd55, %rd2, %rd54; ld.global.nc.f32 %f36, [%rd55]; @%p22 bra BB0_49; rem.s32 %r119, %r31, %r68; add.s32 %r120, %r119, %r68; rem.s32 %r173, %r120, %r68; bra.uni BB0_50; BB0_49: mov.u32 %r121, 0; max.s32 %r173, %r31, %r121; BB0_50: add.s32 %r122, %r173, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd56, %r123, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, %f36, %f86; @%p22 bra BB0_52; rem.s32 %r124, %r27, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r174, %r125, %r68; bra.uni BB0_53; BB0_52: add.s32 %r126, %r68, -1; min.s32 %r174, %r27, %r126; BB0_53: add.s32 %r127, %r174, %r4; mad.lo.s32 %r128, %r127, %r67, %r1; mul.wide.s32 %rd58, %r128, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, [%rd59]; @%p22 bra BB0_55; rem.s32 
%r129, %r31, %r68; add.s32 %r130, %r129, %r68; rem.s32 %r175, %r130, %r68; bra.uni BB0_56; BB0_55: mov.u32 %r131, 0; max.s32 %r175, %r31, %r131; BB0_56: add.s32 %r132, %r175, %r4; mad.lo.s32 %r133, %r132, %r67, %r1; mul.wide.s32 %rd60, %r133, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 %f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra BB0_77; div.rn.f32 %f89, %f11, %f66; mul.f32 %f45, %f19, %f89; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra BB0_60; rem.s32 %r134, %r47, %r69; add.s32 %r135, %r134, %r69; rem.s32 %r176, %r135, %r69; bra.uni BB0_61; BB0_60: add.s32 %r136, %r69, -1; min.s32 %r176, %r47, %r136; BB0_61: mad.lo.s32 %r137, %r176, %r68, %r2; mad.lo.s32 %r138, %r137, %r67, %r1; mul.wide.s32 %rd62, %r138, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra BB0_63; rem.s32 %r139, %r51, %r69; add.s32 %r140, %r139, %r69; rem.s32 %r177, %r140, %r69; bra.uni BB0_64; BB0_63: mov.u32 %r141, 0; max.s32 %r177, %r51, %r141; BB0_64: mad.lo.s32 %r142, %r177, %r68, %r2; mad.lo.s32 %r143, %r142, %r67, %r1; mul.wide.s32 %rd64, %r143, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra BB0_66; rem.s32 %r144, %r47, %r69; add.s32 %r145, %r144, %r69; rem.s32 %r178, %r145, %r69; bra.uni BB0_67; BB0_66: add.s32 %r146, %r69, -1; min.s32 %r178, %r47, %r146; BB0_67: mad.lo.s32 %r147, %r178, %r68, %r2; mad.lo.s32 %r148, %r147, %r67, %r1; mul.wide.s32 %rd66, %r148, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra BB0_69; rem.s32 %r149, %r51, %r69; add.s32 %r150, %r149, %r69; rem.s32 %r179, %r150, %r69; bra.uni BB0_70; BB0_69: mov.u32 %r151, 0; max.s32 %r179, %r51, %r151; BB0_70: mad.lo.s32 %r152, %r179, %r68, %r2; mad.lo.s32 %r153, %r152, %r67, %r1; mul.wide.s32 %rd68, %r153, 4; 
add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra BB0_72; rem.s32 %r154, %r47, %r69; add.s32 %r155, %r154, %r69; rem.s32 %r180, %r155, %r69; bra.uni BB0_73; BB0_72: add.s32 %r156, %r69, -1; min.s32 %r180, %r47, %r156; BB0_73: mad.lo.s32 %r157, %r180, %r68, %r2; mad.lo.s32 %r158, %r157, %r67, %r1; mul.wide.s32 %rd70, %r158, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra BB0_75; rem.s32 %r159, %r51, %r69; add.s32 %r160, %r159, %r69; rem.s32 %r181, %r160, %r69; bra.uni BB0_76; BB0_75: mov.u32 %r161, 0; max.s32 %r181, %r51, %r161; BB0_76: mad.lo.s32 %r162, %r181, %r68, %r2; mad.lo.s32 %r163, %r162, %r67, %r1; mul.wide.s32 %rd72, %r163, 4; add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; BB0_77: cvta.to.global.u64 %rd74, %rd6; cvta.to.global.u64 %rd75, %rd5; cvta.to.global.u64 %rd76, %rd4; mul.wide.s32 %rd77, %r6, 4; add.s64 %rd78, %rd3, %rd77; add.s64 %rd79, %rd2, %rd77; add.s64 %rd80, %rd1, %rd77; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 %f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd79]; mul.f32 %f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd80]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, [%rd78]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; 
fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; add.s64 %rd81, %rd76, %rd77; ld.global.f32 %f126, [%rd81]; fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd81], %f127; add.s64 %rd82, %rd75, %rd77; ld.global.f32 %f128, [%rd82]; fma.rn.f32 %f129, %f96, %f124, %f128; st.global.f32 [%rd82], %f129; add.s64 %rd83, %rd74, %rd77; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], %f131; BB0_78: ret; } ` addzhanglitorque2_ptx_70 = ` .version 6.5 .target sm_70 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<15>; .reg .f32 %f<149>; .reg .b32 %r<182>; .reg .f64 %fd<5>; .reg .b64 %rd<84>; ld.param.u64 %rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, 
[addzhanglitorque2_param_3]; ld.param.u64 %rd15, [addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, [addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, [addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, [addzhanglitorque2_param_23]; ld.param.u32 %r68, [addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; ld.param.u8 %rs4, [addzhanglitorque2_param_26]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 %r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r70, %r71, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r73, %r74, %r75; mov.u32 %r76, %ntid.z; mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r76, %r77, %r78; setp.ge.s32 %p1, %r2, %r68; setp.ge.s32 %p2, %r1, %r67; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, %p4; @%p5 bra BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_3; cvta.to.global.u64 %rd17, %rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, 
%f67, %f132; BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_5; cvta.to.global.u64 %rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, [%rd22]; mul.f32 %f133, %f68, %f133; BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra BB0_7; cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra BB0_11; rcp.rn.f32 %f136, %f135; BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra BB0_13; cvta.to.global.u64 %rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; mul.f32 %f138, %f74, %f138; BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, %r6, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; mov.f32 %f143, 0f00000000; setp.eq.f32 %p14, %f20, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra BB0_20; rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r164, %r81, %r67; bra.uni BB0_21; BB0_20: add.s32 %r82, %r67, -1; min.s32 %r164, %r7, %r82; BB0_21: add.s32 
%r83, %r164, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra BB0_23; rem.s32 %r84, %r11, %r67; add.s32 %r85, %r84, %r67; rem.s32 %r165, %r85, %r67; bra.uni BB0_24; BB0_23: mov.u32 %r86, 0; max.s32 %r165, %r11, %r86; BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 %f22, %f20, %f79; add.s32 %r87, %r165, %r5; mul.wide.s32 %rd40, %r87, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra BB0_26; rem.s32 %r88, %r7, %r67; add.s32 %r89, %r88, %r67; rem.s32 %r166, %r89, %r67; bra.uni BB0_27; BB0_26: add.s32 %r90, %r67, -1; min.s32 %r166, %r7, %r90; BB0_27: add.s32 %r91, %r166, %r5; mul.wide.s32 %rd42, %r91, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra BB0_29; rem.s32 %r92, %r11, %r67; add.s32 %r93, %r92, %r67; rem.s32 %r167, %r93, %r67; bra.uni BB0_30; BB0_29: mov.u32 %r94, 0; max.s32 %r167, %r11, %r94; BB0_30: add.s32 %r95, %r167, %r5; mul.wide.s32 %rd44, %r95, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, [%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra BB0_32; rem.s32 %r96, %r7, %r67; add.s32 %r97, %r96, %r67; rem.s32 %r168, %r97, %r67; bra.uni BB0_33; BB0_32: add.s32 %r98, %r67, -1; min.s32 %r168, %r7, %r98; BB0_33: add.s32 %r99, %r168, %r5; mul.wide.s32 %rd46, %r99, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra BB0_35; rem.s32 %r100, %r11, %r67; add.s32 %r101, %r100, %r67; rem.s32 %r169, %r101, %r67; bra.uni BB0_36; BB0_35: mov.u32 %r102, 0; max.s32 %r169, %r11, %r102; BB0_36: add.s32 %r103, %r169, %r5; mul.wide.s32 %rd48, %r103, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, %f22, %f83, 0f00000000; BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 bra BB0_40; 
rem.s32 %r104, %r27, %r68; add.s32 %r105, %r104, %r68; rem.s32 %r170, %r105, %r68; bra.uni BB0_41; BB0_40: add.s32 %r106, %r68, -1; min.s32 %r170, %r27, %r106; BB0_41: add.s32 %r107, %r170, %r4; mad.lo.s32 %r108, %r107, %r67, %r1; mul.wide.s32 %rd50, %r108, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, -1; @%p22 bra BB0_43; rem.s32 %r109, %r31, %r68; add.s32 %r110, %r109, %r68; rem.s32 %r171, %r110, %r68; bra.uni BB0_44; BB0_43: mov.u32 %r111, 0; max.s32 %r171, %r31, %r111; BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f18, %f84; add.s32 %r112, %r171, %r4; mad.lo.s32 %r113, %r112, %r67, %r1; mul.wide.s32 %rd52, %r113, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra BB0_46; rem.s32 %r114, %r27, %r68; add.s32 %r115, %r114, %r68; rem.s32 %r172, %r115, %r68; bra.uni BB0_47; BB0_46: add.s32 %r116, %r68, -1; min.s32 %r172, %r27, %r116; BB0_47: add.s32 %r117, %r172, %r4; mad.lo.s32 %r118, %r117, %r67, %r1; mul.wide.s32 %rd54, %r118, 4; add.s64 %rd55, %rd2, %rd54; ld.global.nc.f32 %f36, [%rd55]; @%p22 bra BB0_49; rem.s32 %r119, %r31, %r68; add.s32 %r120, %r119, %r68; rem.s32 %r173, %r120, %r68; bra.uni BB0_50; BB0_49: mov.u32 %r121, 0; max.s32 %r173, %r31, %r121; BB0_50: add.s32 %r122, %r173, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd56, %r123, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, %f36, %f86; @%p22 bra BB0_52; rem.s32 %r124, %r27, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r174, %r125, %r68; bra.uni BB0_53; BB0_52: add.s32 %r126, %r68, -1; min.s32 %r174, %r27, %r126; BB0_53: add.s32 %r127, %r174, %r4; mad.lo.s32 %r128, %r127, %r67, %r1; mul.wide.s32 %rd58, %r128, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, [%rd59]; @%p22 bra BB0_55; rem.s32 %r129, %r31, %r68; add.s32 %r130, %r129, %r68; rem.s32 %r175, %r130, %r68; bra.uni BB0_56; BB0_55: mov.u32 %r131, 0; max.s32 %r175, %r31, %r131; BB0_56: add.s32 %r132, %r175, %r4; 
mad.lo.s32 %r133, %r132, %r67, %r1; mul.wide.s32 %rd60, %r133, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 %f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra BB0_77; div.rn.f32 %f89, %f11, %f66; mul.f32 %f45, %f19, %f89; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra BB0_60; rem.s32 %r134, %r47, %r69; add.s32 %r135, %r134, %r69; rem.s32 %r176, %r135, %r69; bra.uni BB0_61; BB0_60: add.s32 %r136, %r69, -1; min.s32 %r176, %r47, %r136; BB0_61: mad.lo.s32 %r137, %r176, %r68, %r2; mad.lo.s32 %r138, %r137, %r67, %r1; mul.wide.s32 %rd62, %r138, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra BB0_63; rem.s32 %r139, %r51, %r69; add.s32 %r140, %r139, %r69; rem.s32 %r177, %r140, %r69; bra.uni BB0_64; BB0_63: mov.u32 %r141, 0; max.s32 %r177, %r51, %r141; BB0_64: mad.lo.s32 %r142, %r177, %r68, %r2; mad.lo.s32 %r143, %r142, %r67, %r1; mul.wide.s32 %rd64, %r143, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra BB0_66; rem.s32 %r144, %r47, %r69; add.s32 %r145, %r144, %r69; rem.s32 %r178, %r145, %r69; bra.uni BB0_67; BB0_66: add.s32 %r146, %r69, -1; min.s32 %r178, %r47, %r146; BB0_67: mad.lo.s32 %r147, %r178, %r68, %r2; mad.lo.s32 %r148, %r147, %r67, %r1; mul.wide.s32 %rd66, %r148, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra BB0_69; rem.s32 %r149, %r51, %r69; add.s32 %r150, %r149, %r69; rem.s32 %r179, %r150, %r69; bra.uni BB0_70; BB0_69: mov.u32 %r151, 0; max.s32 %r179, %r51, %r151; BB0_70: mad.lo.s32 %r152, %r179, %r68, %r2; mad.lo.s32 %r153, %r152, %r67, %r1; mul.wide.s32 %rd68, %r153, 4; add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra BB0_72; rem.s32 %r154, %r47, %r69; add.s32 %r155, %r154, %r69; rem.s32 %r180, %r155, 
%r69; bra.uni BB0_73; BB0_72: add.s32 %r156, %r69, -1; min.s32 %r180, %r47, %r156; BB0_73: mad.lo.s32 %r157, %r180, %r68, %r2; mad.lo.s32 %r158, %r157, %r67, %r1; mul.wide.s32 %rd70, %r158, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra BB0_75; rem.s32 %r159, %r51, %r69; add.s32 %r160, %r159, %r69; rem.s32 %r181, %r160, %r69; bra.uni BB0_76; BB0_75: mov.u32 %r161, 0; max.s32 %r181, %r51, %r161; BB0_76: mad.lo.s32 %r162, %r181, %r68, %r2; mad.lo.s32 %r163, %r162, %r67, %r1; mul.wide.s32 %rd72, %r163, 4; add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; BB0_77: cvta.to.global.u64 %rd74, %rd6; cvta.to.global.u64 %rd75, %rd5; cvta.to.global.u64 %rd76, %rd4; mul.wide.s32 %rd77, %r6, 4; add.s64 %rd78, %rd3, %rd77; add.s64 %rd79, %rd2, %rd77; add.s64 %rd80, %rd1, %rd77; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 %f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd79]; mul.f32 %f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd80]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, [%rd78]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; add.s64 %rd81, %rd76, %rd77; ld.global.f32 %f126, [%rd81]; 
fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd81], %f127; add.s64 %rd82, %rd75, %rd77; ld.global.f32 %f128, [%rd82]; fma.rn.f32 %f129, %f96, %f124, %f128; st.global.f32 [%rd82], %f129; add.s64 %rd83, %rd74, %rd77; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], %f131; BB0_78: ret; } ` addzhanglitorque2_ptx_72 = ` .version 6.5 .target sm_72 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<15>; .reg .f32 %f<149>; .reg .b32 %r<182>; .reg .f64 %fd<5>; .reg .b64 %rd<84>; ld.param.u64 %rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, [addzhanglitorque2_param_3]; ld.param.u64 %rd15, [addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; 
ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, [addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, [addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, [addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, [addzhanglitorque2_param_23]; ld.param.u32 %r68, [addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; ld.param.u8 %rs4, [addzhanglitorque2_param_26]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 %r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r70, %r71, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r73, %r74, %r75; mov.u32 %r76, %ntid.z; mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r76, %r77, %r78; setp.ge.s32 %p1, %r2, %r68; setp.ge.s32 %p2, %r1, %r67; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, %p4; @%p5 bra BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_3; cvta.to.global.u64 %rd17, %rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, %f67, %f132; BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_5; cvta.to.global.u64 %rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, 
[%rd22]; mul.f32 %f133, %f68, %f133; BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra BB0_7; cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, %rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra BB0_11; rcp.rn.f32 %f136, %f135; BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra BB0_13; cvta.to.global.u64 %rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; mul.f32 %f138, %f74, %f138; BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, %r6, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; mov.f32 %f143, 0f00000000; setp.eq.f32 %p14, %f20, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra BB0_20; rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r164, %r81, %r67; bra.uni BB0_21; BB0_20: add.s32 %r82, %r67, -1; min.s32 %r164, %r7, %r82; BB0_21: add.s32 %r83, %r164, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra BB0_23; rem.s32 %r84, %r11, %r67; 
add.s32 %r85, %r84, %r67; rem.s32 %r165, %r85, %r67; bra.uni BB0_24; BB0_23: mov.u32 %r86, 0; max.s32 %r165, %r11, %r86; BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 %f22, %f20, %f79; add.s32 %r87, %r165, %r5; mul.wide.s32 %rd40, %r87, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra BB0_26; rem.s32 %r88, %r7, %r67; add.s32 %r89, %r88, %r67; rem.s32 %r166, %r89, %r67; bra.uni BB0_27; BB0_26: add.s32 %r90, %r67, -1; min.s32 %r166, %r7, %r90; BB0_27: add.s32 %r91, %r166, %r5; mul.wide.s32 %rd42, %r91, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra BB0_29; rem.s32 %r92, %r11, %r67; add.s32 %r93, %r92, %r67; rem.s32 %r167, %r93, %r67; bra.uni BB0_30; BB0_29: mov.u32 %r94, 0; max.s32 %r167, %r11, %r94; BB0_30: add.s32 %r95, %r167, %r5; mul.wide.s32 %rd44, %r95, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, [%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra BB0_32; rem.s32 %r96, %r7, %r67; add.s32 %r97, %r96, %r67; rem.s32 %r168, %r97, %r67; bra.uni BB0_33; BB0_32: add.s32 %r98, %r67, -1; min.s32 %r168, %r7, %r98; BB0_33: add.s32 %r99, %r168, %r5; mul.wide.s32 %rd46, %r99, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra BB0_35; rem.s32 %r100, %r11, %r67; add.s32 %r101, %r100, %r67; rem.s32 %r169, %r101, %r67; bra.uni BB0_36; BB0_35: mov.u32 %r102, 0; max.s32 %r169, %r11, %r102; BB0_36: add.s32 %r103, %r169, %r5; mul.wide.s32 %rd48, %r103, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, %f22, %f83, 0f00000000; BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 bra BB0_40; rem.s32 %r104, %r27, %r68; add.s32 %r105, %r104, %r68; rem.s32 %r170, %r105, %r68; bra.uni BB0_41; BB0_40: add.s32 %r106, %r68, -1; min.s32 %r170, %r27, %r106; BB0_41: 
add.s32 %r107, %r170, %r4; mad.lo.s32 %r108, %r107, %r67, %r1; mul.wide.s32 %rd50, %r108, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, -1; @%p22 bra BB0_43; rem.s32 %r109, %r31, %r68; add.s32 %r110, %r109, %r68; rem.s32 %r171, %r110, %r68; bra.uni BB0_44; BB0_43: mov.u32 %r111, 0; max.s32 %r171, %r31, %r111; BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f18, %f84; add.s32 %r112, %r171, %r4; mad.lo.s32 %r113, %r112, %r67, %r1; mul.wide.s32 %rd52, %r113, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra BB0_46; rem.s32 %r114, %r27, %r68; add.s32 %r115, %r114, %r68; rem.s32 %r172, %r115, %r68; bra.uni BB0_47; BB0_46: add.s32 %r116, %r68, -1; min.s32 %r172, %r27, %r116; BB0_47: add.s32 %r117, %r172, %r4; mad.lo.s32 %r118, %r117, %r67, %r1; mul.wide.s32 %rd54, %r118, 4; add.s64 %rd55, %rd2, %rd54; ld.global.nc.f32 %f36, [%rd55]; @%p22 bra BB0_49; rem.s32 %r119, %r31, %r68; add.s32 %r120, %r119, %r68; rem.s32 %r173, %r120, %r68; bra.uni BB0_50; BB0_49: mov.u32 %r121, 0; max.s32 %r173, %r31, %r121; BB0_50: add.s32 %r122, %r173, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd56, %r123, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, %f36, %f86; @%p22 bra BB0_52; rem.s32 %r124, %r27, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r174, %r125, %r68; bra.uni BB0_53; BB0_52: add.s32 %r126, %r68, -1; min.s32 %r174, %r27, %r126; BB0_53: add.s32 %r127, %r174, %r4; mad.lo.s32 %r128, %r127, %r67, %r1; mul.wide.s32 %rd58, %r128, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, [%rd59]; @%p22 bra BB0_55; rem.s32 %r129, %r31, %r68; add.s32 %r130, %r129, %r68; rem.s32 %r175, %r130, %r68; bra.uni BB0_56; BB0_55: mov.u32 %r131, 0; max.s32 %r175, %r31, %r131; BB0_56: add.s32 %r132, %r175, %r4; mad.lo.s32 %r133, %r132, %r67, %r1; mul.wide.s32 %rd60, %r133, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 
%f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra BB0_77; div.rn.f32 %f89, %f11, %f66; mul.f32 %f45, %f19, %f89; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra BB0_60; rem.s32 %r134, %r47, %r69; add.s32 %r135, %r134, %r69; rem.s32 %r176, %r135, %r69; bra.uni BB0_61; BB0_60: add.s32 %r136, %r69, -1; min.s32 %r176, %r47, %r136; BB0_61: mad.lo.s32 %r137, %r176, %r68, %r2; mad.lo.s32 %r138, %r137, %r67, %r1; mul.wide.s32 %rd62, %r138, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra BB0_63; rem.s32 %r139, %r51, %r69; add.s32 %r140, %r139, %r69; rem.s32 %r177, %r140, %r69; bra.uni BB0_64; BB0_63: mov.u32 %r141, 0; max.s32 %r177, %r51, %r141; BB0_64: mad.lo.s32 %r142, %r177, %r68, %r2; mad.lo.s32 %r143, %r142, %r67, %r1; mul.wide.s32 %rd64, %r143, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra BB0_66; rem.s32 %r144, %r47, %r69; add.s32 %r145, %r144, %r69; rem.s32 %r178, %r145, %r69; bra.uni BB0_67; BB0_66: add.s32 %r146, %r69, -1; min.s32 %r178, %r47, %r146; BB0_67: mad.lo.s32 %r147, %r178, %r68, %r2; mad.lo.s32 %r148, %r147, %r67, %r1; mul.wide.s32 %rd66, %r148, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra BB0_69; rem.s32 %r149, %r51, %r69; add.s32 %r150, %r149, %r69; rem.s32 %r179, %r150, %r69; bra.uni BB0_70; BB0_69: mov.u32 %r151, 0; max.s32 %r179, %r51, %r151; BB0_70: mad.lo.s32 %r152, %r179, %r68, %r2; mad.lo.s32 %r153, %r152, %r67, %r1; mul.wide.s32 %rd68, %r153, 4; add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra BB0_72; rem.s32 %r154, %r47, %r69; add.s32 %r155, %r154, %r69; rem.s32 %r180, %r155, %r69; bra.uni BB0_73; BB0_72: add.s32 %r156, %r69, -1; min.s32 %r180, %r47, %r156; BB0_73: mad.lo.s32 %r157, %r180, %r68, %r2; mad.lo.s32 %r158, %r157, %r67, %r1; 
mul.wide.s32 %rd70, %r158, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra BB0_75; rem.s32 %r159, %r51, %r69; add.s32 %r160, %r159, %r69; rem.s32 %r181, %r160, %r69; bra.uni BB0_76; BB0_75: mov.u32 %r161, 0; max.s32 %r181, %r51, %r161; BB0_76: mad.lo.s32 %r162, %r181, %r68, %r2; mad.lo.s32 %r163, %r162, %r67, %r1; mul.wide.s32 %rd72, %r163, 4; add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; BB0_77: cvta.to.global.u64 %rd74, %rd6; cvta.to.global.u64 %rd75, %rd5; cvta.to.global.u64 %rd76, %rd4; mul.wide.s32 %rd77, %r6, 4; add.s64 %rd78, %rd3, %rd77; add.s64 %rd79, %rd2, %rd77; add.s64 %rd80, %rd1, %rd77; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 %f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd79]; mul.f32 %f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd80]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, [%rd78]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; add.s64 %rd81, %rd76, %rd77; ld.global.f32 %f126, [%rd81]; fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd81], %f127; add.s64 %rd82, %rd75, %rd77; ld.global.f32 %f128, [%rd82]; fma.rn.f32 %f129, %f96, %f124, %f128; 
st.global.f32 [%rd82], %f129; add.s64 %rd83, %rd74, %rd77; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], %f131; BB0_78: ret; } ` addzhanglitorque2_ptx_75 = ` .version 6.5 .target sm_75 .address_size 64 // .globl addzhanglitorque2 .visible .entry addzhanglitorque2( .param .u64 addzhanglitorque2_param_0, .param .u64 addzhanglitorque2_param_1, .param .u64 addzhanglitorque2_param_2, .param .u64 addzhanglitorque2_param_3, .param .u64 addzhanglitorque2_param_4, .param .u64 addzhanglitorque2_param_5, .param .u64 addzhanglitorque2_param_6, .param .f32 addzhanglitorque2_param_7, .param .u64 addzhanglitorque2_param_8, .param .f32 addzhanglitorque2_param_9, .param .u64 addzhanglitorque2_param_10, .param .f32 addzhanglitorque2_param_11, .param .u64 addzhanglitorque2_param_12, .param .f32 addzhanglitorque2_param_13, .param .u64 addzhanglitorque2_param_14, .param .f32 addzhanglitorque2_param_15, .param .u64 addzhanglitorque2_param_16, .param .f32 addzhanglitorque2_param_17, .param .u64 addzhanglitorque2_param_18, .param .f32 addzhanglitorque2_param_19, .param .f32 addzhanglitorque2_param_20, .param .f32 addzhanglitorque2_param_21, .param .f32 addzhanglitorque2_param_22, .param .u32 addzhanglitorque2_param_23, .param .u32 addzhanglitorque2_param_24, .param .u32 addzhanglitorque2_param_25, .param .u8 addzhanglitorque2_param_26 ) { .reg .pred %p<35>; .reg .b16 %rs<15>; .reg .f32 %f<149>; .reg .b32 %r<182>; .reg .f64 %fd<5>; .reg .b64 %rd<84>; ld.param.u64 %rd4, [addzhanglitorque2_param_0]; ld.param.u64 %rd5, [addzhanglitorque2_param_1]; ld.param.u64 %rd6, [addzhanglitorque2_param_2]; ld.param.u64 %rd14, [addzhanglitorque2_param_3]; ld.param.u64 %rd15, [addzhanglitorque2_param_4]; ld.param.u64 %rd16, [addzhanglitorque2_param_5]; ld.param.u64 %rd7, [addzhanglitorque2_param_6]; ld.param.f32 %f135, [addzhanglitorque2_param_7]; ld.param.u64 %rd8, [addzhanglitorque2_param_8]; ld.param.f32 %f137, [addzhanglitorque2_param_9]; ld.param.u64 %rd9, 
[addzhanglitorque2_param_10]; ld.param.f32 %f138, [addzhanglitorque2_param_11]; ld.param.u64 %rd10, [addzhanglitorque2_param_12]; ld.param.f32 %f139, [addzhanglitorque2_param_13]; ld.param.u64 %rd11, [addzhanglitorque2_param_14]; ld.param.f32 %f132, [addzhanglitorque2_param_15]; ld.param.u64 %rd12, [addzhanglitorque2_param_16]; ld.param.f32 %f133, [addzhanglitorque2_param_17]; ld.param.u64 %rd13, [addzhanglitorque2_param_18]; ld.param.f32 %f134, [addzhanglitorque2_param_19]; ld.param.f32 %f64, [addzhanglitorque2_param_20]; ld.param.f32 %f65, [addzhanglitorque2_param_21]; ld.param.f32 %f66, [addzhanglitorque2_param_22]; ld.param.u32 %r67, [addzhanglitorque2_param_23]; ld.param.u32 %r68, [addzhanglitorque2_param_24]; ld.param.u32 %r69, [addzhanglitorque2_param_25]; ld.param.u8 %rs4, [addzhanglitorque2_param_26]; cvta.to.global.u64 %rd1, %rd16; cvta.to.global.u64 %rd2, %rd15; cvta.to.global.u64 %rd3, %rd14; mov.u32 %r70, %ntid.x; mov.u32 %r71, %ctaid.x; mov.u32 %r72, %tid.x; mad.lo.s32 %r1, %r70, %r71, %r72; mov.u32 %r73, %ntid.y; mov.u32 %r74, %ctaid.y; mov.u32 %r75, %tid.y; mad.lo.s32 %r2, %r73, %r74, %r75; mov.u32 %r76, %ntid.z; mov.u32 %r77, %ctaid.z; mov.u32 %r78, %tid.z; mad.lo.s32 %r3, %r76, %r77, %r78; setp.ge.s32 %p1, %r2, %r68; setp.ge.s32 %p2, %r1, %r67; or.pred %p3, %p1, %p2; setp.ge.s32 %p4, %r3, %r69; or.pred %p5, %p3, %p4; @%p5 bra BB0_78; mul.lo.s32 %r4, %r3, %r68; add.s32 %r79, %r4, %r2; mul.lo.s32 %r5, %r79, %r67; add.s32 %r6, %r5, %r1; setp.eq.s64 %p6, %rd11, 0; @%p6 bra BB0_3; cvta.to.global.u64 %rd17, %rd11; mul.wide.s32 %rd18, %r6, 4; add.s64 %rd19, %rd17, %rd18; ld.global.nc.f32 %f67, [%rd19]; mul.f32 %f132, %f67, %f132; BB0_3: setp.eq.s64 %p7, %rd12, 0; @%p7 bra BB0_5; cvta.to.global.u64 %rd20, %rd12; mul.wide.s32 %rd21, %r6, 4; add.s64 %rd22, %rd20, %rd21; ld.global.nc.f32 %f68, [%rd22]; mul.f32 %f133, %f68, %f133; BB0_5: setp.eq.s64 %p8, %rd13, 0; @%p8 bra BB0_7; cvta.to.global.u64 %rd23, %rd13; mul.wide.s32 %rd24, %r6, 4; add.s64 %rd25, 
%rd23, %rd24; ld.global.nc.f32 %f69, [%rd25]; mul.f32 %f134, %f69, %f134; BB0_7: setp.eq.s64 %p9, %rd7, 0; @%p9 bra BB0_9; cvta.to.global.u64 %rd26, %rd7; mul.wide.s32 %rd27, %r6, 4; add.s64 %rd28, %rd26, %rd27; ld.global.nc.f32 %f70, [%rd28]; mul.f32 %f135, %f70, %f135; BB0_9: setp.eq.f32 %p10, %f135, 0f00000000; mov.f32 %f136, 0f00000000; @%p10 bra BB0_11; rcp.rn.f32 %f136, %f135; BB0_11: cvt.f64.f32 %fd1, %f136; mul.f64 %fd2, %fd1, 0d3CA7B4966C8AC112; fma.rn.f32 %f72, %f133, %f133, 0f3F800000; cvt.f64.f32 %fd3, %f72; div.rn.f64 %fd4, %fd2, %fd3; cvt.rn.f32.f64 %f11, %fd4; setp.eq.s64 %p11, %rd8, 0; @%p11 bra BB0_13; cvta.to.global.u64 %rd29, %rd8; mul.wide.s32 %rd30, %r6, 4; add.s64 %rd31, %rd29, %rd30; ld.global.nc.f32 %f73, [%rd31]; mul.f32 %f137, %f73, %f137; BB0_13: setp.eq.s64 %p12, %rd9, 0; @%p12 bra BB0_15; cvta.to.global.u64 %rd32, %rd9; mul.wide.s32 %rd33, %r6, 4; add.s64 %rd34, %rd32, %rd33; ld.global.nc.f32 %f74, [%rd34]; mul.f32 %f138, %f74, %f138; BB0_15: setp.eq.s64 %p13, %rd10, 0; @%p13 bra BB0_17; cvta.to.global.u64 %rd35, %rd10; mul.wide.s32 %rd36, %r6, 4; add.s64 %rd37, %rd35, %rd36; ld.global.nc.f32 %f75, [%rd37]; mul.f32 %f139, %f75, %f139; BB0_17: mul.f32 %f18, %f134, %f138; mul.f32 %f19, %f134, %f139; mul.f32 %f20, %f134, %f137; mov.f32 %f143, 0f00000000; setp.eq.f32 %p14, %f20, 0f00000000; mov.f32 %f144, %f143; mov.f32 %f145, %f143; @%p14 bra BB0_37; and.b16 %rs1, %rs4, 1; setp.eq.s16 %p15, %rs1, 0; add.s32 %r7, %r1, 1; @%p15 bra BB0_20; rem.s32 %r80, %r7, %r67; add.s32 %r81, %r80, %r67; rem.s32 %r164, %r81, %r67; bra.uni BB0_21; BB0_20: add.s32 %r82, %r67, -1; min.s32 %r164, %r7, %r82; BB0_21: add.s32 %r83, %r164, %r5; mul.wide.s32 %rd38, %r83, 4; add.s64 %rd39, %rd3, %rd38; ld.global.nc.f32 %f21, [%rd39]; add.s32 %r11, %r1, -1; @%p15 bra BB0_23; rem.s32 %r84, %r11, %r67; add.s32 %r85, %r84, %r67; rem.s32 %r165, %r85, %r67; bra.uni BB0_24; BB0_23: mov.u32 %r86, 0; max.s32 %r165, %r11, %r86; BB0_24: div.rn.f32 %f79, %f11, %f64; mul.f32 
%f22, %f20, %f79; add.s32 %r87, %r165, %r5; mul.wide.s32 %rd40, %r87, 4; add.s64 %rd41, %rd3, %rd40; ld.global.nc.f32 %f80, [%rd41]; sub.f32 %f23, %f21, %f80; @%p15 bra BB0_26; rem.s32 %r88, %r7, %r67; add.s32 %r89, %r88, %r67; rem.s32 %r166, %r89, %r67; bra.uni BB0_27; BB0_26: add.s32 %r90, %r67, -1; min.s32 %r166, %r7, %r90; BB0_27: add.s32 %r91, %r166, %r5; mul.wide.s32 %rd42, %r91, 4; add.s64 %rd43, %rd2, %rd42; ld.global.nc.f32 %f24, [%rd43]; @%p15 bra BB0_29; rem.s32 %r92, %r11, %r67; add.s32 %r93, %r92, %r67; rem.s32 %r167, %r93, %r67; bra.uni BB0_30; BB0_29: mov.u32 %r94, 0; max.s32 %r167, %r11, %r94; BB0_30: add.s32 %r95, %r167, %r5; mul.wide.s32 %rd44, %r95, 4; add.s64 %rd45, %rd2, %rd44; ld.global.nc.f32 %f81, [%rd45]; sub.f32 %f25, %f24, %f81; @%p15 bra BB0_32; rem.s32 %r96, %r7, %r67; add.s32 %r97, %r96, %r67; rem.s32 %r168, %r97, %r67; bra.uni BB0_33; BB0_32: add.s32 %r98, %r67, -1; min.s32 %r168, %r7, %r98; BB0_33: add.s32 %r99, %r168, %r5; mul.wide.s32 %rd46, %r99, 4; add.s64 %rd47, %rd1, %rd46; ld.global.nc.f32 %f26, [%rd47]; @%p15 bra BB0_35; rem.s32 %r100, %r11, %r67; add.s32 %r101, %r100, %r67; rem.s32 %r169, %r101, %r67; bra.uni BB0_36; BB0_35: mov.u32 %r102, 0; max.s32 %r169, %r11, %r102; BB0_36: add.s32 %r103, %r169, %r5; mul.wide.s32 %rd48, %r103, 4; add.s64 %rd49, %rd1, %rd48; ld.global.nc.f32 %f82, [%rd49]; sub.f32 %f83, %f26, %f82; fma.rn.f32 %f143, %f22, %f23, 0f00000000; fma.rn.f32 %f144, %f22, %f25, 0f00000000; fma.rn.f32 %f145, %f22, %f83, 0f00000000; BB0_37: setp.eq.f32 %p21, %f18, 0f00000000; @%p21 bra BB0_57; and.b16 %rs2, %rs4, 2; setp.eq.s16 %p22, %rs2, 0; add.s32 %r27, %r2, 1; @%p22 bra BB0_40; rem.s32 %r104, %r27, %r68; add.s32 %r105, %r104, %r68; rem.s32 %r170, %r105, %r68; bra.uni BB0_41; BB0_40: add.s32 %r106, %r68, -1; min.s32 %r170, %r27, %r106; BB0_41: add.s32 %r107, %r170, %r4; mad.lo.s32 %r108, %r107, %r67, %r1; mul.wide.s32 %rd50, %r108, 4; add.s64 %rd51, %rd3, %rd50; ld.global.nc.f32 %f33, [%rd51]; add.s32 %r31, %r2, 
-1; @%p22 bra BB0_43; rem.s32 %r109, %r31, %r68; add.s32 %r110, %r109, %r68; rem.s32 %r171, %r110, %r68; bra.uni BB0_44; BB0_43: mov.u32 %r111, 0; max.s32 %r171, %r31, %r111; BB0_44: div.rn.f32 %f84, %f11, %f65; mul.f32 %f34, %f18, %f84; add.s32 %r112, %r171, %r4; mad.lo.s32 %r113, %r112, %r67, %r1; mul.wide.s32 %rd52, %r113, 4; add.s64 %rd53, %rd3, %rd52; ld.global.nc.f32 %f85, [%rd53]; sub.f32 %f35, %f33, %f85; @%p22 bra BB0_46; rem.s32 %r114, %r27, %r68; add.s32 %r115, %r114, %r68; rem.s32 %r172, %r115, %r68; bra.uni BB0_47; BB0_46: add.s32 %r116, %r68, -1; min.s32 %r172, %r27, %r116; BB0_47: add.s32 %r117, %r172, %r4; mad.lo.s32 %r118, %r117, %r67, %r1; mul.wide.s32 %rd54, %r118, 4; add.s64 %rd55, %rd2, %rd54; ld.global.nc.f32 %f36, [%rd55]; @%p22 bra BB0_49; rem.s32 %r119, %r31, %r68; add.s32 %r120, %r119, %r68; rem.s32 %r173, %r120, %r68; bra.uni BB0_50; BB0_49: mov.u32 %r121, 0; max.s32 %r173, %r31, %r121; BB0_50: add.s32 %r122, %r173, %r4; mad.lo.s32 %r123, %r122, %r67, %r1; mul.wide.s32 %rd56, %r123, 4; add.s64 %rd57, %rd2, %rd56; ld.global.nc.f32 %f86, [%rd57]; sub.f32 %f37, %f36, %f86; @%p22 bra BB0_52; rem.s32 %r124, %r27, %r68; add.s32 %r125, %r124, %r68; rem.s32 %r174, %r125, %r68; bra.uni BB0_53; BB0_52: add.s32 %r126, %r68, -1; min.s32 %r174, %r27, %r126; BB0_53: add.s32 %r127, %r174, %r4; mad.lo.s32 %r128, %r127, %r67, %r1; mul.wide.s32 %rd58, %r128, 4; add.s64 %rd59, %rd1, %rd58; ld.global.nc.f32 %f38, [%rd59]; @%p22 bra BB0_55; rem.s32 %r129, %r31, %r68; add.s32 %r130, %r129, %r68; rem.s32 %r175, %r130, %r68; bra.uni BB0_56; BB0_55: mov.u32 %r131, 0; max.s32 %r175, %r31, %r131; BB0_56: add.s32 %r132, %r175, %r4; mad.lo.s32 %r133, %r132, %r67, %r1; mul.wide.s32 %rd60, %r133, 4; add.s64 %rd61, %rd1, %rd60; ld.global.nc.f32 %f87, [%rd61]; sub.f32 %f88, %f38, %f87; fma.rn.f32 %f143, %f34, %f35, %f143; fma.rn.f32 %f144, %f34, %f37, %f144; fma.rn.f32 %f145, %f34, %f88, %f145; BB0_57: setp.eq.f32 %p28, %f19, 0f00000000; @%p28 bra BB0_77; div.rn.f32 
%f89, %f11, %f66; mul.f32 %f45, %f19, %f89; and.b16 %rs3, %rs4, 4; setp.eq.s16 %p29, %rs3, 0; add.s32 %r47, %r3, 1; @%p29 bra BB0_60; rem.s32 %r134, %r47, %r69; add.s32 %r135, %r134, %r69; rem.s32 %r176, %r135, %r69; bra.uni BB0_61; BB0_60: add.s32 %r136, %r69, -1; min.s32 %r176, %r47, %r136; BB0_61: mad.lo.s32 %r137, %r176, %r68, %r2; mad.lo.s32 %r138, %r137, %r67, %r1; mul.wide.s32 %rd62, %r138, 4; add.s64 %rd63, %rd3, %rd62; ld.global.nc.f32 %f46, [%rd63]; add.s32 %r51, %r3, -1; @%p29 bra BB0_63; rem.s32 %r139, %r51, %r69; add.s32 %r140, %r139, %r69; rem.s32 %r177, %r140, %r69; bra.uni BB0_64; BB0_63: mov.u32 %r141, 0; max.s32 %r177, %r51, %r141; BB0_64: mad.lo.s32 %r142, %r177, %r68, %r2; mad.lo.s32 %r143, %r142, %r67, %r1; mul.wide.s32 %rd64, %r143, 4; add.s64 %rd65, %rd3, %rd64; ld.global.nc.f32 %f90, [%rd65]; sub.f32 %f47, %f46, %f90; @%p29 bra BB0_66; rem.s32 %r144, %r47, %r69; add.s32 %r145, %r144, %r69; rem.s32 %r178, %r145, %r69; bra.uni BB0_67; BB0_66: add.s32 %r146, %r69, -1; min.s32 %r178, %r47, %r146; BB0_67: mad.lo.s32 %r147, %r178, %r68, %r2; mad.lo.s32 %r148, %r147, %r67, %r1; mul.wide.s32 %rd66, %r148, 4; add.s64 %rd67, %rd2, %rd66; ld.global.nc.f32 %f48, [%rd67]; @%p29 bra BB0_69; rem.s32 %r149, %r51, %r69; add.s32 %r150, %r149, %r69; rem.s32 %r179, %r150, %r69; bra.uni BB0_70; BB0_69: mov.u32 %r151, 0; max.s32 %r179, %r51, %r151; BB0_70: mad.lo.s32 %r152, %r179, %r68, %r2; mad.lo.s32 %r153, %r152, %r67, %r1; mul.wide.s32 %rd68, %r153, 4; add.s64 %rd69, %rd2, %rd68; ld.global.nc.f32 %f91, [%rd69]; sub.f32 %f49, %f48, %f91; @%p29 bra BB0_72; rem.s32 %r154, %r47, %r69; add.s32 %r155, %r154, %r69; rem.s32 %r180, %r155, %r69; bra.uni BB0_73; BB0_72: add.s32 %r156, %r69, -1; min.s32 %r180, %r47, %r156; BB0_73: mad.lo.s32 %r157, %r180, %r68, %r2; mad.lo.s32 %r158, %r157, %r67, %r1; mul.wide.s32 %rd70, %r158, 4; add.s64 %rd71, %rd1, %rd70; ld.global.nc.f32 %f50, [%rd71]; @%p29 bra BB0_75; rem.s32 %r159, %r51, %r69; add.s32 %r160, %r159, %r69; rem.s32 
%r181, %r160, %r69; bra.uni BB0_76; BB0_75: mov.u32 %r161, 0; max.s32 %r181, %r51, %r161; BB0_76: mad.lo.s32 %r162, %r181, %r68, %r2; mad.lo.s32 %r163, %r162, %r67, %r1; mul.wide.s32 %rd72, %r163, 4; add.s64 %rd73, %rd1, %rd72; ld.global.nc.f32 %f92, [%rd73]; sub.f32 %f93, %f50, %f92; fma.rn.f32 %f143, %f45, %f47, %f143; fma.rn.f32 %f144, %f45, %f49, %f144; fma.rn.f32 %f145, %f45, %f93, %f145; BB0_77: cvta.to.global.u64 %rd74, %rd6; cvta.to.global.u64 %rd75, %rd5; cvta.to.global.u64 %rd76, %rd4; mul.wide.s32 %rd77, %r6, 4; add.s64 %rd78, %rd3, %rd77; add.s64 %rd79, %rd2, %rd77; add.s64 %rd80, %rd1, %rd77; fma.rn.f32 %f94, %f132, %f132, 0f3F800000; mov.f32 %f95, 0fBF800000; div.rn.f32 %f96, %f95, %f94; fma.rn.f32 %f97, %f132, %f133, 0f3F800000; ld.global.nc.f32 %f98, [%rd79]; mul.f32 %f99, %f145, %f98; ld.global.nc.f32 %f100, [%rd80]; mul.f32 %f101, %f144, %f100; sub.f32 %f102, %f99, %f101; mul.f32 %f103, %f143, %f100; ld.global.nc.f32 %f104, [%rd78]; mul.f32 %f105, %f145, %f104; sub.f32 %f106, %f103, %f105; mul.f32 %f107, %f144, %f104; mul.f32 %f108, %f143, %f98; sub.f32 %f109, %f107, %f108; mul.f32 %f110, %f98, %f109; mul.f32 %f111, %f100, %f106; sub.f32 %f112, %f110, %f111; mul.f32 %f113, %f100, %f102; mul.f32 %f114, %f104, %f109; sub.f32 %f115, %f113, %f114; mul.f32 %f116, %f104, %f106; mul.f32 %f117, %f98, %f102; sub.f32 %f118, %f116, %f117; mul.f32 %f119, %f97, %f112; mul.f32 %f120, %f97, %f115; mul.f32 %f121, %f97, %f118; sub.f32 %f122, %f133, %f132; fma.rn.f32 %f123, %f122, %f102, %f119; fma.rn.f32 %f124, %f122, %f106, %f120; fma.rn.f32 %f125, %f122, %f109, %f121; add.s64 %rd81, %rd76, %rd77; ld.global.f32 %f126, [%rd81]; fma.rn.f32 %f127, %f96, %f123, %f126; st.global.f32 [%rd81], %f127; add.s64 %rd82, %rd75, %rd77; ld.global.f32 %f128, [%rd82]; fma.rn.f32 %f129, %f96, %f124, %f128; st.global.f32 [%rd82], %f129; add.s64 %rd83, %rd74, %rd77; ld.global.f32 %f130, [%rd83]; fma.rn.f32 %f131, %f96, %f125, %f130; st.global.f32 [%rd83], %f131; BB0_78: ret; } ` ) 
mumax3-3.10/data/000077500000000000000000000000001371432437400135705ustar00rootroot00000000000000mumax3-3.10/data/Makefile000066400000000000000000000000241371432437400152240ustar00rootroot00000000000000all: go install -v mumax3-3.10/data/crop.go000066400000000000000000000007461371432437400150710ustar00rootroot00000000000000package data // Cut-out a piece between given bounds (incl, excl) func Crop(in *Slice, x1, x2, y1, y2, z1, z2 int) *Slice { Nx := x2 - x1 Ny := y2 - y1 Nz := z2 - z1 size := [3]int{Nx, Ny, Nz} ncomp := in.NComp() out := NewSlice(ncomp, size) a := in.Tensors() b := out.Tensors() for c := 0; c < ncomp; c++ { for z := 0; z < Nz; z++ { for y := 0; y < Ny; y++ { for x := 0; x < Nx; x++ { b[c][z][y][x] = a[c][z+z1][y+y1][x+x1] } } } } return out } mumax3-3.10/data/doc.go000066400000000000000000000001531371432437400146630ustar00rootroot00000000000000/* Package data provides structures to store arrays in a hardware-agnostic (GPU-CPU) way. */ package data mumax3-3.10/data/mesh.go000066400000000000000000000043731371432437400150620ustar00rootroot00000000000000package data import ( "fmt" "log" ) // Mesh stores info of a finite-difference mesh. type Mesh struct { gridSize [3]int cellSize [3]float64 pbc [3]int Unit string // unit of cellSize, default: "m" } // Retruns a new mesh with N0 x N1 x N2 cells of size cellx x celly x cellz. // Optional periodic boundary conditions (pbc): number of repetitions // in X, Y, Z direction. 0,0,0 means no periodicity. func NewMesh(N0, N1, N2 int, cellx, celly, cellz float64, pbc ...int) *Mesh { var pbc3 [3]int if len(pbc) == 3 { copy(pbc3[:], pbc) } else { if len(pbc) != 0 { log.Panic("mesh: need 0 or 3 PBC arguments, got:", pbc) } } size := [3]int{N0, N1, N2} return &Mesh{size, [3]float64{cellx, celly, cellz}, pbc3, "m"} } // Returns N0, N1, N2, as passed to constructor. 
func (m *Mesh) Size() [3]int { if m == nil { return [3]int{0, 0, 0} } else { return m.gridSize } } // Returns cellx, celly, cellz, as passed to constructor. func (m *Mesh) CellSize() [3]float64 { return m.cellSize } // Returns pbc (periodic boundary conditions), as passed to constructor. func (m *Mesh) PBC() [3]int { return m.pbc } func (m *Mesh) SetPBC(nx, ny, nz int) { m.pbc = [3]int{nx, ny, nz} } // Total number of cells, not taking into account PBCs. // N0 * N1 * N2 func (m *Mesh) NCell() int { return m.gridSize[0] * m.gridSize[1] * m.gridSize[2] } // WorldSize equals (grid)Size x CellSize. func (m *Mesh) WorldSize() [3]float64 { return [3]float64{float64(m.gridSize[0]) * m.cellSize[0], float64(m.gridSize[1]) * m.cellSize[1], float64(m.gridSize[2]) * m.cellSize[2]} } // 3 bools, packed in one byte, indicating whether there are periodic boundary conditions in // X (LSB), Y(LSB<<1), Z(LSB<<2) func (m *Mesh) PBC_code() byte { var code byte if m.pbc[X] != 0 { code = 1 } if m.pbc[Y] != 0 { code |= 2 } if m.pbc[Z] != 0 { code |= 4 } return code } func (m *Mesh) String() string { s := m.gridSize c := m.cellSize pbc := "" if m.pbc != [3]int{0, 0, 0} { pbc = fmt.Sprintf(", PBC: [%v x %v x %v],", m.pbc[0], m.pbc[1], m.pbc[2]) } return fmt.Sprintf("[%v x %v x %v] x [%vm x %vm x %vm]%v", s[0], s[1], s[2], float32(c[0]), float32(c[1]), float32(c[2]), pbc) } // product of elements. func prod(size [3]int) int { return size[0] * size[1] * size[2] } mumax3-3.10/data/meta.go000066400000000000000000000003461371432437400150500ustar00rootroot00000000000000package data // Holds meta data to be saved together with a slice. 
// Typically winds up in OVF or DUMP header type Meta struct { Name, Unit string Time, TimeStep float64 CellSize [3]float64 MeshUnit string } mumax3-3.10/data/resample.go000066400000000000000000000036631371432437400157370ustar00rootroot00000000000000package data import ( "github.com/mumax/3/util" ) // Resample returns a slice of new size N, // using nearest neighbor interpolation over the input slice. func Resample(in *Slice, N [3]int) *Slice { if in.Size() == N { return in // nothing to do } In := in.Tensors() out := NewSlice(in.NComp(), N) Out := out.Tensors() size1 := SizeOf(In[0]) size2 := SizeOf(Out[0]) for c := range Out { for i := range Out[c] { i1 := (i * size1[Z]) / size2[Z] for j := range Out[c][i] { j1 := (j * size1[Y]) / size2[Y] for k := range Out[c][i][j] { k1 := (k * size1[X]) / size2[X] Out[c][i][j][k] = In[c][i1][j1][k1] } } } } return out } // Downsample returns a slice of new size N, smaller than in.Size(). // Averaging interpolation over the input slice. // in is returned untouched if the sizes are equal. 
func Downsample(In [][][][]float32, N [3]int) [][][][]float32 { if SizeOf(In[0]) == N { return In // nothing to do } nComp := len(In) out := NewSlice(nComp, N) Out := out.Tensors() srcsize := SizeOf(In[0]) dstsize := SizeOf(Out[0]) Dx := dstsize[X] Dy := dstsize[Y] Dz := dstsize[Z] Sx := srcsize[X] Sy := srcsize[Y] Sz := srcsize[Z] scalex := Sx / Dx scaley := Sy / Dy scalez := Sz / Dz util.Assert(scalex > 0 && scaley > 0) for c := range Out { for iz := 0; iz < Dz; iz++ { for iy := 0; iy < Dy; iy++ { for ix := 0; ix < Dx; ix++ { sum, n := 0.0, 0.0 for I := 0; I < scalez; I++ { i2 := iz*scalez + I for J := 0; J < scaley; J++ { j2 := iy*scaley + J for K := 0; K < scalex; K++ { k2 := ix*scalex + K if i2 < Sz && j2 < Sy && k2 < Sx { sum += float64(In[c][i2][j2][k2]) n++ } } } } Out[c][iz][iy][ix] = float32(sum / n) } } } } return Out } // Returns the 3D size of block func SizeOf(block [][][]float32) [3]int { return [3]int{len(block[0][0]), len(block[0]), len(block)} } mumax3-3.10/data/reshape.go000066400000000000000000000011441371432437400155460ustar00rootroot00000000000000package data // Array reshaping. import "fmt" // Re-interpret a contiguous array as a multi-dimensional array of given size. // Underlying storage is shared. func reshape(array []float32, size [3]int) [][][]float32 { Nx, Ny, Nz := size[0], size[1], size[2] if Nx*Ny*Nz != len(array) { panic(fmt.Errorf("reshape: size mismatch: %v*%v*%v != %v", Nx, Ny, Nz, len(array))) } sliced := make([][][]float32, Nz) for i := range sliced { sliced[i] = make([][]float32, Ny) } for i := range sliced { for j := range sliced[i] { sliced[i][j] = array[(i*Ny+j)*Nx+0 : (i*Ny+j)*Nx+Nx] } } return sliced } mumax3-3.10/data/slice.go000066400000000000000000000175041371432437400152250ustar00rootroot00000000000000package data // Slice stores N-component GPU or host data. import ( "bytes" "fmt" "github.com/mumax/3/util" "log" "reflect" "unsafe" ) // Slice is like a [][]float32, but may be stored in GPU or host memory. 
type Slice struct { ptrs []unsafe.Pointer size [3]int memType int8 } // this package must not depend on CUDA. If CUDA is // loaded, these functions are set to cu.MemFree, ... // NOTE: cpyDtoH and cpuHtoD are only needed to support 32-bit builds, // otherwise, it could be removed in favor of memCpy only. var ( memFree, memFreeHost func(unsafe.Pointer) memCpy, memCpyDtoH, memCpyHtoD func(dst, src unsafe.Pointer, bytes int64) ) // Internal: enables slices on GPU. Called upon cuda init. func EnableGPU(free, freeHost func(unsafe.Pointer), cpy, cpyDtoH, cpyHtoD func(dst, src unsafe.Pointer, bytes int64)) { memFree = free memFreeHost = freeHost memCpy = cpy memCpyDtoH = cpyDtoH memCpyHtoD = cpyHtoD } // Make a CPU Slice with nComp components of size length. func NewSlice(nComp int, size [3]int) *Slice { length := prod(size) ptrs := make([]unsafe.Pointer, nComp) for i := range ptrs { ptrs[i] = unsafe.Pointer(&(make([]float32, length)[0])) } return SliceFromPtrs(size, CPUMemory, ptrs) } func SliceFromArray(data [][]float32, size [3]int) *Slice { nComp := len(data) length := prod(size) ptrs := make([]unsafe.Pointer, nComp) for i := range ptrs { if len(data[i]) != length { panic("size mismatch") } ptrs[i] = unsafe.Pointer(&data[i][0]) } return SliceFromPtrs(size, CPUMemory, ptrs) } // Return a slice without underlying storage. Used to represent a mask containing all 1's. func NilSlice(nComp int, size [3]int) *Slice { return SliceFromPtrs(size, GPUMemory, make([]unsafe.Pointer, nComp)) } // Internal: construct a Slice using bare memory pointers. func SliceFromPtrs(size [3]int, memType int8, ptrs []unsafe.Pointer) *Slice { length := prod(size) nComp := len(ptrs) util.Argument(nComp > 0 && length > 0) s := new(Slice) s.ptrs = make([]unsafe.Pointer, nComp) s.size = size for c := range ptrs { s.ptrs[c] = ptrs[c] } s.memType = memType return s } // Frees the underlying storage and zeros the Slice header to avoid accidental use. // Slices sharing storage will be invalid after Free. 
Double free is OK. func (s *Slice) Free() { if s == nil { return } // free storage switch s.memType { case 0: return // already freed case GPUMemory: for _, ptr := range s.ptrs { memFree(ptr) } //case UnifiedMemory: // for _, ptr := range s.ptrs { // memFreeHost(ptr) // } case CPUMemory: // nothing to do default: panic("invalid memory type") } s.Disable() } // INTERNAL. Overwrite struct fields with zeros to avoid // accidental use after Free. func (s *Slice) Disable() { s.ptrs = s.ptrs[:0] s.size = [3]int{0, 0, 0} s.memType = 0 } // value for Slice.memType const ( CPUMemory = 1 << 0 GPUMemory = 1 << 1 //UnifiedMemory = CPUMemory | GPUMemory ) // MemType returns the memory type of the underlying storage: // CPUMemory, GPUMemory or UnifiedMemory func (s *Slice) MemType() int { return int(s.memType) } // GPUAccess returns whether the Slice is accessible by the GPU. // true means it is either stored on GPU or in unified host memory. func (s *Slice) GPUAccess() bool { return s.memType&GPUMemory != 0 } // CPUAccess returns whether the Slice is accessible by the CPU. // true means it is stored in host memory. func (s *Slice) CPUAccess() bool { return s.memType&CPUMemory != 0 } // NComp returns the number of components. func (s *Slice) NComp() int { return len(s.ptrs) } // Len returns the number of elements per component. func (s *Slice) Len() int { return prod(s.size) } func (s *Slice) Size() [3]int { if s == nil { return [3]int{0, 0, 0} } return s.size } // Comp returns a single component of the Slice. func (s *Slice) Comp(i int) *Slice { sl := new(Slice) sl.ptrs = make([]unsafe.Pointer, 1) sl.ptrs[0] = s.ptrs[i] sl.size = s.size sl.memType = s.memType return sl } // DevPtr returns a CUDA device pointer to a component. // Slice must have GPUAccess. // It is safe to call on a nil slice, returns NULL. 
func (s *Slice) DevPtr(component int) unsafe.Pointer { if s == nil { return nil } if !s.GPUAccess() { panic("slice not accessible by GPU") } return s.ptrs[component] } const SIZEOF_FLOAT32 = 4 // Host returns the Slice as a [][]float32 indexed by component, cell number. // It should have CPUAccess() == true. func (s *Slice) Host() [][]float32 { if !s.CPUAccess() { log.Panic("slice not accessible by CPU") } list := make([][]float32, s.NComp()) for c := range list { hdr := (*reflect.SliceHeader)(unsafe.Pointer(&list[c])) hdr.Data = uintptr(s.ptrs[c]) hdr.Len = s.Len() hdr.Cap = hdr.Len } return list } // Returns a copy of the Slice, allocated on CPU. func (s *Slice) HostCopy() *Slice { cpy := NewSlice(s.NComp(), s.Size()) Copy(cpy, s) return cpy } func Copy(dst, src *Slice) { if dst.NComp() != src.NComp() || dst.Len() != src.Len() { panic(fmt.Sprintf("slice copy: illegal sizes: dst: %vx%v, src: %vx%v", dst.NComp(), dst.Len(), src.NComp(), src.Len())) } d, s := dst.GPUAccess(), src.GPUAccess() bytes := SIZEOF_FLOAT32 * int64(dst.Len()) switch { default: panic("bug") case d && s: for c := 0; c < dst.NComp(); c++ { memCpy(dst.DevPtr(c), src.DevPtr(c), bytes) } case s && !d: for c := 0; c < dst.NComp(); c++ { memCpyDtoH(dst.ptrs[c], src.DevPtr(c), bytes) } case !s && d: for c := 0; c < dst.NComp(); c++ { memCpyHtoD(dst.DevPtr(c), src.ptrs[c], bytes) } case !d && !s: dst, src := dst.Host(), src.Host() for c := range dst { copy(dst[c], src[c]) } } } // Floats returns the data as 3D array, // indexed by cell position. Data should be // scalar (1 component) and have CPUAccess() == true. func (f *Slice) Scalars() [][][]float32 { x := f.Tensors() if len(x) != 1 { panic(fmt.Sprintf("expecting 1 component, got %v", f.NComp())) } return x[0] } // Vectors returns the data as 4D array, // indexed by component, cell position. Data should have // 3 components and have CPUAccess() == true. 
func (f *Slice) Vectors() [3][][][]float32 { x := f.Tensors() if len(x) != 3 { panic(fmt.Sprintf("expecting 3 components, got %v", f.NComp())) } return [3][][][]float32{x[0], x[1], x[2]} } // Tensors returns the data as 4D array, // indexed by component, cell position. // Requires CPUAccess() == true. func (f *Slice) Tensors() [][][][]float32 { tensors := make([][][][]float32, f.NComp()) host := f.Host() for i := range tensors { tensors[i] = reshape(host[i], f.Size()) } return tensors } // IsNil returns true if either s is nil or s.pointer[0] == nil func (s *Slice) IsNil() bool { if s == nil { return true } return s.ptrs[0] == nil } func (s *Slice) String() string { if s == nil { return "nil" } var buf bytes.Buffer util.Fprint(&buf, s.Tensors()) return buf.String() } func (s *Slice) Set(comp, ix, iy, iz int, value float64) { s.checkComp(comp) s.Host()[comp][s.Index(ix, iy, iz)] = float32(value) } func (s *Slice) SetVector(ix, iy, iz int, v Vector) { i := s.Index(ix, iy, iz) for c := range v { s.Host()[c][i] = float32(v[c]) } } func (s *Slice) SetScalar(ix, iy, iz int, v float64) { s.Host()[0][s.Index(ix, iy, iz)] = float32(v) } func (s *Slice) Get(comp, ix, iy, iz int) float64 { s.checkComp(comp) return float64(s.Host()[comp][s.Index(ix, iy, iz)]) } func (s *Slice) checkComp(comp int) { if comp < 0 || comp >= s.NComp() { panic(fmt.Sprintf("slice: invalid component index: %v (number of components=%v)\n", comp, s.NComp())) } } func (s *Slice) Index(ix, iy, iz int) int { return Index(s.Size(), ix, iy, iz) } func Index(size [3]int, ix, iy, iz int) int { if ix < 0 || ix >= size[X] || iy < 0 || iy >= size[Y] || iz < 0 || iz >= size[Z] { panic(fmt.Sprintf("Slice index out of bounds: %v,%v,%v (bounds=%v)\n", ix, iy, iz, size)) } return (iz*size[Y]+iy)*size[X] + ix } mumax3-3.10/data/slice_test.go000066400000000000000000000006321371432437400162560ustar00rootroot00000000000000package data import ( "testing" ) func TestIndex(t *testing.T) { mesh := [3]int{6, 5, 4} slice := 
NewSlice(7, mesh)
	// Tensors is indexed [component][z][y][x]; check every dimension.
	data := slice.Tensors()
	if len(data) != 7 { //c
		t.Fail()
	}
	if len(data[0]) != 4 { // z
		t.Fail()
	}
	if len(data[0][0]) != 5 { // y
		t.Fail()
	}
	if len(data[0][0][0]) != 6 { // x
		t.Fail()
	}

	// A value written through Set must be visible through the Tensors view.
	slice.Set(4, 5, 4, 3, 345) // c x y z
	if data[4][3][4][5] != 345 {
		t.Fail()
	}
}
mumax3-3.10/data/vector.go000066400000000000000000000023371371432437400154260ustar00rootroot00000000000000package data

import "math"

// Vector is a 3-component vector.
type Vector [3]float64

// X returns the first component of v.
func (v Vector) X() float64 { return v[0] }

// Y returns the second component of v.
func (v Vector) Y() float64 { return v[1] }

// Z returns the third component of v.
func (v Vector) Z() float64 { return v[2] }

// Mul returns a*v.
func (v Vector) Mul(a float64) Vector {
	return Vector{a * v[0], a * v[1], a * v[2]}
}

// Div returns (1/a)*v. It is computed as a multiplication by the
// reciprocal 1/a, not as a per-component division.
func (v Vector) Div(a float64) Vector {
	return v.Mul(1 / a)
}

// Add returns a+b.
func (a Vector) Add(b Vector) Vector {
	return Vector{a[0] + b[0], a[1] + b[1], a[2] + b[2]}
}

// MAdd returns a+s*b.
func (a Vector) MAdd(s float64, b Vector) Vector {
	return Vector{a[0] + s*b[0], a[1] + s*b[1], a[2] + s*b[2]}
}

// Sub returns a-b.
func (a Vector) Sub(b Vector) Vector {
	return Vector{a[0] - b[0], a[1] - b[1], a[2] - b[2]}
}

// Len returns the norm of v, i.e. the square root of v.Dot(v).
func (v Vector) Len() float64 {
	len2 := v.Dot(v)
	return math.Sqrt(len2)
}

// Dot returns the dot (inner) product a.b.
func (a Vector) Dot(b Vector) float64 {
	return a[0]*b[0] + a[1]*b[1] + a[2]*b[2]
}

// Returns the cross (vector) product a x b
// in a right-handed coordinate system.
func (a Vector) Cross(b Vector) Vector {
	x := a[1]*b[2] - a[2]*b[1]
	y := a[2]*b[0] - a[0]*b[2]
	z := a[0]*b[1] - a[1]*b[0]
	return Vector{x, y, z}
}

// Indices of the x, y and z components.
const (
	X = 0
	Y = 1
	Z = 2
)
mumax3-3.10/deploy/000077500000000000000000000000001371432437400141535ustar00rootroot00000000000000mumax3-3.10/deploy/.gitignore000066400000000000000000000000071371432437400161400ustar00rootroot00000000000000build/
mumax3-3.10/deploy/deploy_linux.bash000077500000000000000000000051101371432437400175250ustar00rootroot00000000000000# The cuda versions against which we will compile mumax3
for CUDAVERSION in 7.0 7.5 8.0 9.0 9.1 9.2 10.0 10.1 10.2 11.0; do

    # downgrade host compiler for nvcc for old cuda versions
    # (bc prints 1 when the floating-point comparison is true)
    if [ 1 -eq "$(echo "${CUDAVERSION} < 9.2" | bc)" ]; then
        export NVCC_CCBIN=/usr/bin/gcc-4.8
    else
        export NVCC_CCBIN=/usr/bin/gcc
    fi

    # The final location of the mumax3 executables and libs
    MUMAX3UNAME=mumax3.10_linux_cuda${CUDAVERSION}
    BUILDDIR=./build/${MUMAX3UNAME}
    rm -rf $BUILDDIR
    mkdir -p $BUILDDIR

    # The location of the home directory of this cuda version
    # We export this variable so that cuda/Makefile knows how to build the wrappers
    export CUDA_HOME=/usr/local/cuda-${CUDAVERSION}

    # All supported compute capabilities of this cuda version
    # We export CUDA_CC so that cuda/Makefile knows what to include in the fat wrappers
    case $CUDAVERSION in
        "7.0") export CUDA_CC="20 30 32 35 37 50 52 53";;
        "7.5") export CUDA_CC="20 30 32 35 37 50 52 53";;
        "8.0") export CUDA_CC="20 30 32 35 37 50 52 53 60 61 62";;
        "9.0") export CUDA_CC=" 30 32 35 37 50 52 53 60 61 62 70";;
        "9.1") export CUDA_CC=" 30 32 35 37 50 52 53 60 61 62 70 72";;
        "9.2") export CUDA_CC=" 30 32 35 37 50 52 53 60 61 62 70 72";;
        "10.0") export CUDA_CC=" 30 32 35 37 50 52 53 60 61 62 70 72 75";;
        "10.1") export CUDA_CC=" 30 32 35 37 50 52 53 60 61 62 70 72 75";;
        "10.2") export CUDA_CC=" 30 32 35 37 50 52 53 60 61 62 70 72 75";;
        "11.0") export CUDA_CC=" 30 32 35 37 50 52 53 60 61 62 70 72 75 80";;
    esac

    # The path for shared libraries (relative
to the build directory)
    RPATH=lib
    mkdir -p $BUILDDIR/$RPATH

    # We overwrite the CGO Flags to make sure that it is compiled against $CUDAVERSION
    export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
    export CGO_LDFLAGS="-lcufft -lcurand -lcuda -L${CUDA_HOME}/lib64 -Wl,-rpath -Wl,\$ORIGIN/$RPATH"
    export CGO_CFLAGS="-I${CUDA_HOME}/include"

    # (Re)build everything.
    # The "|| exit 1" must be outside the parentheses: inside them it would only
    # leave the subshell, and the script would go on to package stale binaries
    # from $GOPATH/bin after a failed build.
    (cd .. && make realclean && make -j 4) || exit 1

    # Copy the executable and the cuda libraries to the output directory
    cp $GOPATH/bin/mumax3 $BUILDDIR
    cp $GOPATH/bin/mumax3-convert $BUILDDIR
    cp $GOPATH/bin/mumax3-server $BUILDDIR
    cp ../LICENSE $BUILDDIR
    cp $( ldd ${BUILDDIR}/mumax3 | grep libcufft | awk '{print $3}' ) ${BUILDDIR}/${RPATH}
    cp $( ldd ${BUILDDIR}/mumax3 | grep libcurand | awk '{print $3}' ) ${BUILDDIR}/${RPATH}

    # Pack the build directory of this cuda version in a tarball
    (cd build && tar -czf ${MUMAX3UNAME}.tar.gz ${MUMAX3UNAME})
done
mumax3-3.10/deploy/deploy_windows.ps1000066400000000000000000000071251371432437400176530ustar00rootroot00000000000000# This script compiles mumax3 for windows 10 against multiple cuda versions.

# The cuda version against which we will compile mumax3
foreach ($CUDA_VERSION in "9.2","10.0","10.1","10.2","11.0") {

    # The final location of executables and libraries ready to be shipped to the user.
    $builddir = "build/mumax3.10_windows_cuda$CUDA_VERSION"

    # The nvidia toolkit installer for cuda 10.2 should have set the environment
    # variable CUDA_PATH_V10_2 which points to the root directory of the
    # cuda toolbox. (or similar for other cuda versions)
    # This script might not work if this path contains spaces!
switch ( $CUDA_VERSION ) {
        "9.2" { $CUDA_HOME = $env:CUDA_PATH_V9_2 }
        "10.0" { $CUDA_HOME = $env:CUDA_PATH_V10_0 }
        "10.1" { $CUDA_HOME = $env:CUDA_PATH_V10_1 }
        "10.2" { $CUDA_HOME = $env:CUDA_PATH_V10_2 }
        "11.0" { $CUDA_HOME = $env:CUDA_PATH_V11_0 }
        default {}
    }
    # Stop the whole script (not just this iteration) when the toolkit is missing.
    if ( -not $CUDA_HOME -or (-not ( Test-Path $CUDA_HOME )) ) {
        Write-Output "CUDA version $CUDA_VERSION does not seem to be installed"
        exit
    }

    # We will compile the kernels for all supported architectures
    switch ( $CUDA_VERSION ) {
        "9.2" { $CUDA_CC = 30,32,35,37,50,52,53,60,61,62,70,72 }
        "10.0" { $CUDA_CC = 30,32,35,37,50,52,53,60,61,62,70,72,75 }
        "10.1" { $CUDA_CC = 30,32,35,37,50,52,53,60,61,62,70,72,75 }
        "10.2" { $CUDA_CC = 30,32,35,37,50,52,53,60,61,62,70,72,75 }
        "11.0" { $CUDA_CC = 30,32,35,37,50,52,53,60,61,62,70,72,75,80 }
        default {exit}
    }

    # The NVIDIA compiler which will be used to compile the cuda kernels
    $NVCC = "${CUDA_HOME}/bin/nvcc.exe"
    # Host compiler directory passed to nvcc via -ccbin (hard-coded Visual Studio 14.0 path)
    $CCBIN = "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin"
    if ( -not ( Test-Path $CCBIN ) ) {
        Write-Output "CCBIN for nvcc not found at $CCBIN"
        exit
    }

    # overwrite the CGO flags to make sure that mumax3 is compiled against the
    # specified cuda version.
    $env:CGO_LDFLAGS="-lcufft -lcurand -lcuda -L${CUDA_HOME}/lib/x64"
    $env:CGO_CFLAGS="-I${CUDA_HOME}/include -w"

    # Enter the cuda directory to (re)compile the cuda kernels
    Set-Location ../cuda
    Remove-Item *.ptx
    Remove-Item *_wrapper.go
    go build .\cuda2go.go
    $cudafiles = Get-ChildItem -filter "*.cu"
    foreach ($cudafile in $cudafiles) {
        $kernelname = $cudafile.basename
        # One PTX file per kernel and per compute capability
        foreach ($cc in $CUDA_CC) {
            & $NVCC -ccbin ${CCBIN} -Xptxas -O3 -ptx `
                -gencode="arch=compute_${cc},code=sm_${cc}" `
                "${cudafile}" -o "${kernelname}_${cc}.ptx"
        }
        # Generate the Go wrapper for this kernel and format it
        & .\cuda2go $cudafile
        gofmt -w "${kernelname}_wrapper.go"
    }
    Set-Location ../deploy

    # Compile all mumax3 packages and executables
    go install -v "github.com/mumax/3/..."
# Copy the mumax3 executables and the used cuda libraries to the build directory
    Remove-Item -ErrorAction Ignore -Recurse ${builddir}
    Remove-Item -ErrorAction Ignore "${builddir}.zip"
    New-Item -ItemType "directory" ${builddir}
    Copy-Item ${env:GOPATH}/bin/mumax3.exe -Destination ${builddir}
    Copy-Item ${env:GOPATH}/bin/mumax3-convert.exe -Destination ${builddir}
    Copy-Item ${env:GOPATH}/bin/mumax3-server.exe -Destination ${builddir}
    Copy-Item ../LICENSE -Destination ${builddir}
    Copy-Item ${CUDA_HOME}/bin/cufft64*.dll -Destination ${builddir}
    Copy-Item ${CUDA_HOME}/bin/curand64*.dll -Destination ${builddir}

    # Finally, put everything in a single archive
    Compress-Archive -Path ${builddir}/* -DestinationPath "${builddir}.zip"
}mumax3-3.10/doc/000077500000000000000000000000001371432437400134245ustar00rootroot00000000000000mumax3-3.10/doc/.gitignore000066400000000000000000000000111371432437400154040ustar00rootroot00000000000000build
docmumax3-3.10/doc/Makefile000066400000000000000000000005241371432437400150650ustar00rootroot00000000000000BUILDDIR="build"
STATIC="static"

# build the html pages in ${BUILDDIR}
# (static files are copied to ${BUILDDIR} as well, so overriding BUILDDIR
# keeps the whole site in one place)
.PHONY: html
html: doc mumax3libs
	mkdir -p ${BUILDDIR}
	./doc -examples -builddir ${BUILDDIR}
	cp ${STATIC}/* ${BUILDDIR}

# build the ./doc generator executable
.PHONY: doc
doc:
	go build -v

# install the mumax3 commands
.PHONY: mumax3libs
mumax3libs:
	go install -v github.com/mumax/3/cmd/...

# remove the generated site and the generator executable
.PHONY: clean
clean:
	rm -rf ${BUILDDIR}
	rm -f docmumax3-3.10/doc/README000066400000000000000000000002361371432437400143050ustar00rootroot00000000000000This directory contains everything needed to build the mumax3 website (Home page, API, and examples)

"make html" builds the complete website in ${BUILDDIR}.
mumax3-3.10/doc/apigen.go000066400000000000000000000125631371432437400152250ustar00rootroot00000000000000// Automatic generation of api.html based on template.
package main

import (
	"io/ioutil"
	"os"
	"os/exec"
	"path"
	"reflect"
	"sort"
	"strings"
	"text/template"
	"unicode"

	"github.com/mumax/3/cuda"
	"github.com/mumax/3/engine"
)

var (
	api_entries entries                      // all API entries, sorted by name (set by buildAPI)
	api_ident   = make(map[string]entry) // API entries by lower-case identifier
)

// entry describes one identifier of the scripting API.
type entry struct {
	name    string       // identifier as spelled in the documentation map
	Type    reflect.Type // Go type of the identifier
	Doc     string       // documentation string
	touched bool         // set once a Filter* method has returned this entry
}

// buildAPI collects all documented identifiers of engine.World
// into api_entries (sorted) and api_ident.
func buildAPI() {
	cuda.Init(0) // gpu 0

	ident := engine.World.Identifiers
	doc := engine.World.Doc
	e := make(entries, 0, len(ident))
	for K, v := range doc {
		if v == "" {
			// check if we have a docstring in the documentation of the math package
			v = getGoDocString("math", K)
		}
		k := strings.ToLower(K)
		t := ident[k].Type()
		entr := entry{K, t, v, false}
		e = append(e, &entr)
		api_ident[k] = entr
	}
	sort.Sort(&e)
	api_entries = e
}

// getGoDocString runs "go doc packageName identifier" and returns the doc
// string of the identifier if it is a function. It returns "" when the
// command fails, when the identifier is not a function, or when the output
// is too short to contain a doc string.
func getGoDocString(packageName, identifier string) string {
	stdout, err := exec.Command("go", "doc", packageName, identifier).Output()
	if err != nil {
		return ""
	}
	outputLines := strings.Split(string(stdout), "\n")

	// we only look for doc strings of functions: the third line is the signature.
	// The length checks avoid an index-out-of-range panic on short output.
	if len(outputLines) < 4 || !strings.HasPrefix(outputLines[2], "func") {
		return ""
	}

	// the doc string of a function is on the fourth line
	// (and possibly continued on the fifth line, if not, then the fifth line is empty)
	end := 5
	if len(outputLines) < end {
		end = len(outputLines)
	}
	return strings.Join(outputLines[3:end], " ")
}

// Name returns the identifier of the entry.
func (e *entry) Name() string {
	return e.name
}

// Ins returns the input parameters of a function entry
// (the type string without the leading "func"), or "" for non-functions.
func (e *entry) Ins() string {
	t := e.Type.String()
	if strings.HasPrefix(t, "func(") {
		return cleanType(t[len("func"):])
	}
	return ""
}

// cleanType returns a dumbed-down type string with the
// "engine.", "*data." and "script." qualifiers removed.
func cleanType(typ string) string {
	typ = strings.Replace(typ, "engine.", "", -1)
	typ = strings.Replace(typ, "*data.", "", -1)
	typ = strings.Replace(typ, "script.", "", -1)
	return typ
}

// Methods lists the exported, non-hidden methods of the entry's type,
// formatted as "Name( argtypes )".
func (e *entry) Methods() []string {
	t := e.Type
	// if it's a function, we list the methods on the output type
	if t.Kind() == reflect.Func && t.NumOut() == 1 {
		t = t.Out(0)
	}

	nm := t.NumMethod()
	M := make([]string, 0, nm)
	for i := 0; i < nm; i++ {
		m := t.Method(i)
		n := m.Name
		if unicode.IsUpper(rune(n[0])) && !hidden(n) {
			var args string
			for
i := 1; i < m.Type.NumIn(); i++ {
				args += cleanType(m.Type.In(i).String()) + " "
			}
			M = append(M, n+"( "+args+")")
		}
	}
	return M
}

// Ret returns the cleaned-up return type of a function entry with exactly
// one result, or "" otherwise.
func (e *entry) Ret() string {
	typ := e.Type
	if typ.Kind() != reflect.Func || typ.NumOut() != 1 {
		return ""
	}
	return cleanType(typ.Out(0).String())
}

// hidden reports whether the method with the given name
// is left out of the generated method lists.
func hidden(name string) bool {
	switch name {
	case "Eval", "InputType", "Type", "Slice", "Name", "Unit", "NComp", "Mesh", "SetValue", "String":
		return true
	default:
		return false
	}
}

// Examples returns the numbers of the examples recorded for this entry
// (looked up by lower-case name).
func (e *entry) Examples() []int {
	key := strings.ToLower(e.name)
	return api_examples[key]
}

// api is the data handed to the API template.
type api struct {
	Entries entries
}

// Include returns the content of a file in the template directory.
func (e *api) Include(fname string) string {
	content, err := ioutil.ReadFile(path.Join(templateDir, fname))
	check(err)
	return string(content)
}

// remaining returns the entries that no Filter* method has returned yet.
func (a *api) remaining() []*entry {
	var unused []*entry
	for _, ent := range a.Entries {
		if ent.touched {
			continue
		}
		unused = append(unused, ent)
	}
	return unused
}

// All returns a new slice with all entries (touched and not touched).
func (a *api) All() []*entry {
	return append([]*entry(nil), a.Entries...)
}

// FilterType returns the entries whose type matches one of typ
// (case-insensitive), skipping names that start with "ext_".
// Returned entries are marked as touched.
func (a *api) FilterType(typ ...string) []*entry {
	var found []*entry
	for _, ent := range a.Entries {
		for _, want := range typ {
			if !match(want, ent.Type.String()) || strings.HasPrefix(ent.name, "ext_") {
				continue
			}
			ent.touched = true
			found = append(found, ent)
		}
	}
	return found
}

// FilterReturn returns the entries whose return type matches one of typ
// (case-insensitive), skipping names that start with "ext_".
// Returned entries are marked as touched.
func (a *api) FilterReturn(typ ...string) []*entry {
	var found []*entry
	for _, ent := range a.Entries {
		for _, want := range typ {
			if !match(want, ent.Ret()) || strings.HasPrefix(ent.name, "ext_") {
				continue
			}
			ent.touched = true
			found = append(found, ent)
		}
	}
	return found
}

// return all entries, unused so far, which have given name.
func (a *api) FilterName(typ ...string) []*entry { var E []*entry for _, e := range a.Entries { for _, t := range typ { if match(t, e.name) && !strings.HasPrefix(e.name, "ext_") { e.touched = true E = append(E, e) } } } return E } // return all entries, unused so far, whose name starts with prefix. func (a *api) FilterPrefix(pre string) []*entry { var E []*entry for _, e := range a.Entries { if strings.HasPrefix(e.name, pre) { e.touched = true E = append(E, e) } } return E } // return all entries not yet used. func (a *api) FilterLeftovers() []*entry { return a.remaining() } // case insensitive match. func match(a, b string) bool { a = strings.ToLower(a) b = strings.ToLower(b) match := a == b return match } func renderAPI() { e := api_entries t := template.Must(template.New("api").Parse(templ)) f, err2 := os.OpenFile(path.Join(buildDir, "api.html"), os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0666) check(err2) check(t.Execute(f, &api{e})) } var templ = read(path.Join(templateDir, "api-template.html")) func read(fname string) string { b, err := ioutil.ReadFile(fname) check(err) return string(b) } type entries []*entry func (e *entries) Len() int { return len(*e) } func (e *entries) Less(i, j int) bool { return strings.ToLower((*e)[i].name) < strings.ToLower((*e)[j].name) } func (e *entries) Swap(i, j int) { (*e)[i], (*e)[j] = (*e)[j], (*e)[i] } mumax3-3.10/doc/make.go000066400000000000000000000130751371432437400146760ustar00rootroot00000000000000package main import ( "flag" "fmt" "io/ioutil" "log" "os" "os/exec" "path" "regexp" "sort" "strings" "text/template" ) var flag_vet = flag.Bool("vet", false, "only vet source files, don't run them") var flag_examples = flag.Bool("examples", false, "run mumax3 examples") var flag_forced = flag.Bool("forced", false, "force to re-run mumax3 examples") var flag_builddir = flag.String("builddir", "build", "build directory") var buildDir string const templateDir = "templates" func main() { flag.Parse() buildDir = *flag_builddir + "/" 
buildAPI() // read template b, err := ioutil.ReadFile(path.Join(templateDir, "examples-template.html")) check(err) replaceInRaw(b, '\n', '@') // hack to allow raw strings spanning multi lines templ := template.Must(template.New("guide").Parse(string(b))) // output file f, err2 := os.OpenFile(path.Join(buildDir, "examples.html"), os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0666) check(err2) // execute! if *flag_examples { state := &State{} check(templ.Execute(f, state)) } renderAPI() createIndexPage() createDownloadPage() } func createIndexPage() { b, err := ioutil.ReadFile(path.Join(templateDir, "index-template.html")) replaceInRaw(b, '\n', '@') // hack to allow raw strings spanning multi lines check(err) templ := template.Must(template.New("guid").Parse(string(b))) f, err2 := os.OpenFile(path.Join(buildDir, "index.html"), os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0666) check(err2) state := &State{} check(templ.Execute(f, state)) } func createDownloadPage() { b, err := ioutil.ReadFile(path.Join(templateDir, "download-template.html")) replaceInRaw(b, '\n', '@') // hack to allow raw strings spanning multi lines check(err) templ := template.Must(template.New("download").Parse(string(b))) f, err2 := os.OpenFile(path.Join(buildDir, "download.html"), os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0666) check(err2) state := &State{} check(templ.Execute(f, state)) } type State struct { count int } func (s *State) Example(in string) string { s.count++ // extract example source in = strings.Replace(in, "@", "\n", -1) // undo raw string hack in = strings.Trim(in, "\n") // exec input file check(ioutil.WriteFile(s.infile(), []byte(in), 0666)) arg := "-v" if *flag_vet { arg = "-vet" } if _, err := os.Stat(s.outfile()); os.IsNotExist(err) || *flag_forced { cmd("mumax3", "-cache", "/tmp", arg, s.infile()) } recordExamples(in, s.count) return `
` + template.HTMLEscapeString(in) + `
`
}

// api_examples maps an identifier (a key of api_ident) to the numbers
// of the examples in which it was found.
var api_examples = make(map[string][]int)

// recordExamples registers example number num for every identifier in
// api_ident that occurs in the lower-cased example source.
// NOTE(review): each identifier is used as a regular expression, so it
// matches anywhere in the text (also inside longer words), and pattern
// errors are ignored; confirm that this loose matching is intended.
func recordExamples(input string, num int) {
	in := strings.ToLower(input)
	for k, _ := range api_ident {
		if ok, _ := regexp.MatchString(k, in); ok {
			api_examples[k] = append(api_examples[k], num)
		}
	}
}

// Img converts the .ovf file fname in the current example's output directory
// with mumax3-convert (-png) and returns the HTML that refers to the
// resulting .png, using a path relative to the build directory.
func (s *State) Img(fname string) string {
	cmd("mumax3-convert", "-png", "-arrows", "16", path.Join(s.outfile(), fname+".ovf"))
	pngfile := path.Join(s.relativeOutfile(), fname+".png")
	return fmt.Sprintf(`
%v
`, pngfile, fname) } func (s *State) Include(fname string) string { b, err := ioutil.ReadFile(path.Join(templateDir, fname)) check(err) return string(b) } func (s *State) Output() string { out := `

output

` dir, err := os.Open(s.outfile()) check(err) files, err2 := dir.Readdirnames(-1) check(err2) sort.Strings(files) for _, f := range files { if path.Ext(f) == ".ovf" { out += s.Img(f[:len(f)-len(".ovf")]) } } out += `
` for _, f := range files { if f == "table.txt" { cmd("mumax3-plot", path.Join(s.outfile(), f)) } } dir, err = os.Open(s.outfile()) check(err) files, err2 = dir.Readdirnames(-1) check(err2) sort.Strings(files) for _, f := range files { if path.Ext(f) == ".svg" { src := path.Join(s.relativeOutfile(), f) out += fmt.Sprintf(`
%v
`, src, f) } } return out } // State.output gives a nice otuput for all examples except for the // hysteresis example. State.OutputHysteresis is the custom output function // for the hysteresis example. func (s *State) OutputHysteresis() string { tableName := path.Join(s.outfile(), "table.txt") figureName := path.Join(s.outfile(), "hysteresis.svg") relFigureName := path.Join(s.relativeOutfile(), "hysteresis.svg") gnuplotCmd := `set term svg noenhanced size 400 300 font 'Arial,10';` gnuplotCmd += fmt.Sprintf(`set output "%s";`, figureName) gnuplotCmd += `set xlabel "B_ext(T)";` gnuplotCmd += `set ylabel "m_x";` gnuplotCmd += fmt.Sprintf(`plot "%s" u 5:2 w lp notitle;`, tableName) gnuplotCmd += "set output;" gnuplotOut, err := exec.Command("gnuplot", "-e", gnuplotCmd).CombinedOutput() os.Stderr.Write(gnuplotOut) check(err) out := fmt.Sprintf(`

output

`, relFigureName) return out } func (s *State) infile() string { return path.Join(buildDir, fmt.Sprintf("example%v.mx3", s.count)) } func (s *State) outfile() string { return path.Join(buildDir, fmt.Sprintf("example%v.out", s.count)) } // Relative output directory path from the build directory func (s *State) relativeOutfile() string { return fmt.Sprintf("example%v.out", s.count) } func cmd(cmd string, args ...string) { out, err := exec.Command(cmd, args...).CombinedOutput() os.Stdout.Write(out) check(err) } func replaceInRaw(bytes []byte, old, new byte) { inraw := false for i, b := range bytes { if b == '`' { inraw = !inraw } if inraw && b == old { bytes[i] = new } } } func check(err error) { if err != nil { log.Panic(err) } } mumax3-3.10/doc/mask.png000066400000000000000000000047701371432437400150750ustar00rootroot00000000000000PNG  IHDRdÆ bKGD pHYs  tIME %xtEXtCommentCreated with GIMPW `IDATxiHTǟqfR)ۗRC(2i3IdX-PP&"RWEHM"J[lQsǜ{fs~ss<ϽsȘ 0(>HA @ A @ A HI\t>L:~AFMf͢KҎ;(::f}2 Id2ۿ?X3Ӆ xM2Lj_\\!DD4u?P(wZ퀿i4 ߋlv*--<;;[i^0"5BA^~M('JEBUUU۷oSrr2i4R*V)""vE#yJ:;;ÇTRRBGu… iDJr9jh4xbڵkSGGWx) H03[Ve___ᱛ7ofG---+#"޴iz[uWHʓIIIaiiiz m3??_XGzzSO.ٳgmɕDZ,NLL;K,auuubwlll,y ޖ'{Kpp0Ka4yJ%::{{{h4Wǎ;˗/O}9L~=O0؜H 䦦&vZZZ8((H2~ܹz̙#Vݻwꉊr \ ,p*ߟu: myr ۺy0~Ϟ=bv!yL&[ns@] ³q8_LUWy8q"gdd˗`0baN/^^hG ;t0W =vؠm$%%ׯ_%PQQ1qƹWLEEE,:3_3g۷ow*iٲe\VVVծX,޺ud{'Oƛf^tdqӧO;lʕlX$A@@:tkjjXqOO777s^^3fЉ ԩSfffJ̛7ϩa ͒ی#?^XGLL'Oϟ?]\Pg'Ȃ ?I]>}*7a7lxbjJjW3كl;e_V%W,„iii###%t:S}w7'HTUUE_oߒV%HFwrss)''ǡ>}/_.<oRWWuuuj@6mEEEQBB%%%JrBd0vJT*y欬,I{'Ay͚5v׽qF.\D DvE&婵UlqNh46 ݸ4=z.HxV[!S^w˓d6iÆ ͛aBVy\LL _Gݹ2=|xbyJŋ1| L!_Eiiwk׮ycE ߾}^pa ?8=$ &!U E(Ւq&#""F SLyYLg?zm>xBk?>.Sn?>Cq? >,t?ϙ>qF~Sv?ak>S $x?Y{>y?azb>ZԆ z?J>T{?`'5>V;{|?w!>x$}?R>sg ~?=ܬ[~~? =ȀRZ~?XP=UD?k=7hU?9=S )?=W!h?>(]=tH?s=>=Ș ?"=BN?V =q?<@γ?ҵ}'1?t݀2z?O1u?q^L2p?χ1k?ao2d?51^?ꏻCG2W?Kܒ[l1Q?啻F2J?71C?#LQ1;?Eh12? YU1,?O"1%?}N1?U%?ì/?uKc?Ց7?nձ?tޱ?孻֢5?22q9'?}æ|6?tJ? 
\b?5~?ڲ?xa/粽?!\:% ?R ?=*?at}5?)y:O?M;[}?^;j1y?$;𹅳?<}{?P)<*꛳X?8AW<ꬳO?Ll(r&}?TW>2c|?[]+>Lq{?W@>qEz?pdW>+&x?ԕp>q´\w?>wдt?>+r?b>bo?x>jfl?j>$Fj?g>4h>? *?ctsBE?+#?EmL? ?:#|S?i-?a3Y?v?~x_?(>mJWpd?>>u@h?f>kXl?ٌ>PF o?ֲ>3%r?Y>Ϻu?͓>Ɖ"w?Y>2x?_mp>LCz?܉W>Oo{?x@>Bxye|?5+>Mo,}?>Ie}?>YUL~??=[G߱~?Ȋ=4D\?\= 1W?? =3,Oo?2=F*?j=֑J?I=?,=u? U==F۳?&J1?=QJ]f?;`m/?1M]?0?P*?-?wr ? X:?1؀3?kP?_UͲp?qk|?mQ?s CR? ??*C?:n=?&>,;ɀja?p#;ۃh?;R?+G^z[}?L>{â|?w%>(^{?)':>󘌴ڜz?CP>I89y?3 j>訴nw?>ϰlu?>KӴr?K><ߴo?K>-zl?>ph?\>[x*e?3>_38?\E1?5??)?gzj;G? ?hV/N??:]UzU?J ?Fj_ [??r_a?>w&}e?<>xCj?>b3m?>  p?>ݍs?>{u?6 >:w?>z녴~y?'je>ioywz?L>!{?L6>lI|?Ό!>WlD}?W>3Ty~?=b>Wi~?=M8~? =@ &?A=RY x\?=+D?y= !?$pV= =?r7=,VY?+= ?;=5ó?<ͳ?H<^qg?!<2?Q|<{?M<?:#<{??;j?\;c?^;:<?[A;jԲ?H:?L:?V?KA??J? ?Z?i.`?ͷA#G?Q 1?_x?:j1?s?N{1?Yy?,Ճٛ2s?[`!/o?o_P%2k?ŊV1f?Ќ52b?Վhc1\?XᐻC2X?,ޕ1S?e4E2N?֊1H?҂K2@?Gѭ1:?szD22?g؞1+?%-62"?1?;.2?Rp1?r2?lǟ0?ѹ r1? ?\1?»8"?t*ŻJ1?Uƻڱ?0Ȼ/?BȻ_(?uǻ a?jŻLn??_k#?JtM?ؖ?ɲ:?!_?$?}xȲ?I ?-ww?$<?6z?%?MϹ:c?ϯH;(N1?Ǣ;Q?;5?<㜳?gN<?dJm|?]!>sw\{?']6>ևz?3M>ooy?df>YNw?G >Ѩ6u?z2>vcs?n>po?>m>#:l?K>鴮g?>"Qc?R>Q^?>5)1/2?7?O| 9?/?A?'?*I??fQ?A?%"W?s ?*YY ]?N? mib?>Tg?>Ѫk?4>LK-o?+>8ꌴ1r?$ߥ>%t?>'&v?0>JMx?r>o4z?YX>dsl{?)@>>Aj|?*>+f06}?c>˫j}?>DY~?&=xX~?=50S ?=>0?(J?Du=Ex?Hw=u(c?9a=:B?@= i?$=ճg?ͺ =fu?s<0?n<ȳa?<?#<@ϩ?xhTٺDZ??"+q?lCރ?W71?FhFG?+ut81?u?B2n?9*f?-u(2`?FH@[?:2U?(@P?CQ2K?>0H?P [2B? Lq41=?$ b27?YA|11?ec2,?Q4^ٕ1$?W᧻Fi|?d>7'u|?&Q2>ᄴ{?I>y?/Lb>Sw?M~>TVu?ZJ>~/Ys?g>릿3p?>ٴbl?ӌ>͛ٴfg?0>-#b?5>a¶\?? F\W?f ?5 +-?O#]7e?S>M`i?:>Y3~m?>}Ñ p?6ŭ>s?E>~v?>gf@w?8~>ﻃ~y?c>az?I>Rqt|?2>Q|?> ^ʣ}? >mg1~?=mB{~?= O~?=k'9?Y6=:l?,= ?Tk=J! ?;H=P ?*=x?3 =³~?1Fe!}?>UnN|?*G->zB} F{?lC>zy?\>uYx?xx> .Sv?m>s?w>*p?9:>ʹ$l?>>Ѵ_h?I>lb?>\\??U? ?dO?%m?,-(?|@?v 0?e9? |8?W1?⾏~%A?(?m:I???|ֳP? 
A?e W?B0 ?o]?*?mc?z>*Hg?E>Ek?>2o?>ԛr?z> I-u?J>ÎTw?>!*y?k>Iڑz?Q>牴{?9>zƳ|?<#>Ytu}?>Z\~?=@h~?|=~9~?==-K+?=b?3Ǎ=s3?:r=J!?I*O=F?U/=ڳ,?=|?2?Y=1?2?ypG1?»2?Ż1?ɻ|2?̻.?T9лAb2?ӻi^?׻[2?ۻ@?y޻682s?Ự ʱi?LK 2a?y^!]?1[?t軐U[?X1b?Yn?⻰w?ݻ?NֻWⱻ?w?̻?b(pJ?^,6H?陻 ?(9?`D?>s_?"?:ᄳ?:;D1?CU;G.?Y;v?(< ϵ?^LeDX}?>Efd|?U'>pCY{?N=>Yz?U>0x?!q>Wv?d>tt?>{q?->,ȴ m?u>fǴci?c>ᴓd?>@]?ʡ>{V?l ?Fu#O?{n?MiH?H?!$?'D?{+?i=?F4?~U5?s=?,?85E?a"?憎~M?QY?QӎdT?-?MEX[??3a?> ΫY1f?>Ւj?>ssn?J>qq?>59yt?a>ײ[v?h5>2v:x?1-s>픴;z?T"X>ĆU~{?J?>6u|?(>SJyO}?>|_}?n>Sq~?r=d~?=0 ?=KZ?>=q)?jx=<0g?S=$6?@3= ?_=dz?svt? ^1_?ΪK?Q62;?"k-?^MW2#?vD]H?ny2?⮻D ?6 2?.??괻P2?ĂЮ ? s2?󛹻zܯ?=42?U@Ν0?o龻ӟ2?@0?8Sû+2?Ż0 1?Ȼ 2?˻0?ٸλ7.2?;һ`m?0?Ӊջq2?ٻ`C? ܻ4r2w?>[ l?*\2_?1𑱱S?6껛32J?n컸7B?o 2??O=?y1??BH?s`0V?l? 2h?Pۻ겮?7л ??JKI?M[6?ʴ{hŲ?g9GZ?/Ժ ?:,@?z:3?Վd;&?Ͻ;g? < Ѵ?&=<*?+w^j|?>) {?5>JEz?M>7*Uy?.h>{w?Â>F=u?'>lr?>nIǴTn?ҩ>ǻj?J>ܴդe?E>7ִ_?U>'^X?}?0P?A? -tH?:?8A?'?y$ ?/G?Hc(?@?_y&T1?8?7y5_:?/?C?%?Q0K??p:oR?t?ۜ]nY? ?e|_?B>d?>gi?>Vim?־>CHp?L>`s?0>b.PTv?e>񭚴Tx?x>y?\>K{?uC>]Z|?H-,>{Z 3}? >)kQ}?)>cgb~?ܹ=cG~?=b?ͫ=&U?=:I?fU|=.t ?=V=>Y-?#`5=߳?I=?<㧶?s<:?1f,}?>y݀ZU|?,>GD{?D>zy?]>:x?_z>u7v?sߌ>\s? >_Dp?>~_Pl?>0شg?u>ɴ\a?:>MH[?m?j*ZaS?i?stJ?$?\B?&?Ps:?j/?J?I?ulf%?dC?Ё.?)N;?77?;2?-@?(?൴I?i?X bP? ?穴W? ?l0^?N>?嬴Zc?>})h?>%1l?@>dsp?>zs?(>٧v?'>?x?ʫ|>BSy?yO`>v7'{?]HF>w@|?ދ.>$ }?>h}?>ƀ[X~?[=z,w~?2=]8?0t=֫[R?Vה=G?F)~=@?W=A2?6=&?=@ϳ?_< ?<젳?F<?>܇?;)?;?sdN;]?":(e?B9t?8?J?Ű7w!?cն&y?k1Y?͒!<?Ԟج*2"?Ͳ?=hEf2?ZFg?>$ 2?/޾vo?ȿ2?ûH.?ŻJ42?u#ǻ`?RȻP 2?Oeɻ ?oʻi72?ԅ˻/?"̻"D2?λ@(?*ϻL2?Vѻk /?Rӻ2?ջ0?ػ2?ڻW/?ݻ2u?ເhmj?[2_?wR?2E?6F~V|?`#>{?y9>Бɉz?dR>5Ex?m>l}w?!>ܬئt?z>iq?ܨ>Xn?u>\Դi?y>ǴEd?>)T"]?>5]ߴV? ?;]N?}?RSE?#? |? *?ƯyG?t ?sO?m?߶V?g ?:]??b?X>֮ h?G>Pbl?>xέp?%>ఴ4s?ԟ>n u?/>(㥴w?~>]*y?3b>S{?EG>p5x3|?/>Е}?w>Qx}?S>w̓T~? i=VU~?=$%mM?= -0R?=O?5~=nA9?W=Е6?5=/?=5?Ca?滚Z2W?BJ?o2??,)rp1?bZ2&? {%۱?FU\g|2?vt' ?|"[2?V pa?Um`o#2?*?a1?@˲4?"0R?Ӊz?N߻Х?ѻ#?ľH$?V맻^Fg?ًۥ?S\o?ا^?5:9?v:q,?A[;oU? ;ǐc? Vg#}?>yD|?0.>pZ*{?XF>Ēy?`>'dx?}>Yu?%ю>a"s?a^4o?6> Ǵk?H>a˴(f?D> a?>ٴ%8Z??ݴbR?7?‰^I??+@?#")?zi6?b3?̉.?F;? 
F?b M?{#!?F?jc*?0>?K%4?75?:}=?_!,?:QF?!?ʶ}N?Q?U? ?ϴ\??M*>b?΀>g?P>4B!l?h>{Ĵo?[>Cs?8>ٸu?>jEw?L>˦y?گb>s{?H>__2|?/>_S}?>X}?.>iV~?}=x~?X=H#?:=aS?N=$ F?K|=H?U=?3=|-?=(ϳ?<?r<'?U.<<.?$?ͻh?9=c>4?ɡ2ჲy?J׃Id?!#?pӲ?{3кұ?@h?c:ǣw?;TN?;D?݁?iZo|?_">ϐ;{?9>Hx?1,n>ǣw?1e>tu~t?#>նq?W>sm?>̴{i?>Ѵ+d?a'> [۴d]?M>(QԴV? ?^N?|? ڴ8E?18#?pv;? V.?Y u1?CD8?̂|)?2???iN?Dô?pG?+])???"3?6?c$$g?"+>|uǴl?>Wo?.>\s?xf>nu?;V>˴cw?>KٟQy?#a>Kڣ{?'G>f=|?.>ʕ }?>z}?:>n]~?=\,~?=_r?=4W?=Zd?Mjy=Vs,?R=??_0=?泰?*=$H?<0m? h<4?CE<Fq?x<?o>D?-2:?|>/?CIa2#??- Λ2?0v?L2?P`I1? y2?9R|_w?0:2?SD ?SY1? D6l<?N10_?eN?ٻʪ?,ɻ޿3?NH? oWX?tث?'遳?P1Z?D:[?K:#;8??u;^&?N;!tF?.j:}?>>࣌l_|?+>iH{?C>qy?^>ޡ~,x?B{>%v?>emTWs?>Qp?ر>l;ʴ*l?9>ROʴ|Fg?5>Aٴ;a?>Z1մZ??)FS?א?δJ?h?8A?'?ﴕ?7?42?3ߴ_[-?^5fʹg?|>!El?d>{Ǵ[p?>ﷴZHs?H^>4繴2u?eD>x?|>Řy?V_>r _6{?E>zաtR|?8,>'F2}?Z">O5}?>Aoh~? = ~I~?=&C!?}̨=Ogn]?#*=>h Ҋ?t=:O?aN=3R?Z,=0?=̳?[<`(?m&<?k<0?moDqh]??x`k81,?8c+D?!( 2?m4»?|̻@p2?qԻl?etڻ~2|?޻EƲr?⻒t2j? \'f?von2b?$6Wb?i@2b?ed"c?86 a2c?廠&̱e?ҾL2e? 8"e?廄2c?廐b?a*2_? 绀g5[?&X2V?u!P? +2K?@MC?[2:?G`۰0?C[a1}? >^|?;>뚄{?4>>z?M>mDDy?Oi>f_w?%σ>e< u?3^>Oir?W>1ʴOn?9> Hj?^>(ִ+e?G/>/"ʹc%_?>,ᴿ(X?l' ?ִ2P??QG? ?}մ=?+?+q3?y6?)???ٴA!?F?:.?N?Ѵ ?H?FRū(?ړ@?N𼴬2?S7?´pĴ(h?>?hƴl?gG>%np?7ѯ>Ǵs?/>I-v?s>88Kx?\y>Ry?j\>5\\{?B>Mq|?*>"⛴1K}?r>or}?x>dx~?=Ve~?P=qh*?6=g.d?=l]\?n=` ذ?PAI=??'(=&?p1 =z&?A?;ճA?趜;?(\+;Y?K:X?UJ.??N1x?ĄZ0@?ۜY ?Mmp*1?%#&4?ژ̻x\2?Lֻ"?ڱݻ\72n?/㻜_?G2U?6P?[md2L?-N$L?[l"2L?x(YNM?컪D2P?Q Q?T*2T?\U?b黲Y2V?PԀW?默'2U?T?42Q?*껀3 N?0b2H?J B?2<?ۯ4?(2,?sm#?#N2?Lip?EF2 ?H?GJ2?@4M?ZH2?eߎ?m0 J2 ?FW?Tl28?XY?0?Upݻ?4Zλ?*IB=0?.Pt?\q,g?"Fwʸ?Y ي?4H ?:٣?p{; ??%9;Г³"?Q<́?Qdi@|?%>Ū+{?P==>ŘHz?l/W>x?s>qv?>"s t?ޜ>p?>@%m?>Դh?/>ȴ5c?S>״.\??^?n ִU?  ?sM??"δG`D?;$?^:?G/?7b0?m9?紗c&?vB?tY?Y_I?)8,?N?>ҴQ? bH?ʴM")?+@?ʹ<3?y6?iQʴu=?E,?(:ȴ[F?5!?ŪônN??ϴzV?z ?'۴:]?A??ϴ(c?Op>9ϴh?>jǴ'6m?>ƴp?~>RPt?>ވv?>-ԫqx?Ϡt>B=z?X>2r{?=> |?a&>Jej}?">(` ~?'=T:cZ~?D=~~?&»=-?5?ՠ=e-m?j=͖?g=JO?{C=6^?"=~0?./=#dz?<?ɯkrm3}?>e&S|?3,>|ǒm6{?E>. y?y_>| x?`}>Pu??ʎ>Z*v"s?D>0o?>δk?B2>GǴ7g?n>TӴoa?>ڴZ??TִqS? Q?5ʴK?v?IA?g1'?zɴr7?1?-?Q;?ܴ#?fD?0ܴz?aK? ?M?@״> ?G? 
B"*?H??IشO4?5?δ2>?LY+?ƀG?k ?㴻 P?)?̴.W? ?/=ݴ^?T>(Դd?>OҴi?aV>ϴgm?>Wrq?>'ϴ݈t?>sͪv?*ņ>j\x?n>Iz?R>&W{?9>h|?\!>ǎ}? >hPo)~?="h~?ۧ=$CS~?=2k4B?6=.+v?H=[?`=y?f;=%>E?}L=Jݳ?=-$?^~28?%,?ų2&?a•$?N2#? 6%?yܔ2*?O.?f22??x7?o2;?Է~/DZ@? .B2D?ɲ7E?2G?`&I? q2G?җBmE? 2D?@?2<?a6?h)2/?,_)?řd2"?:W?x2? l ?j2?cRS?2?")#?zD21?ih²E? 2b?W滺?Zܻ1?Wλ3?@#?~V5Cb?.8Z?6N\.Qn? ? Ig?ro:L?=}>;?;,H?<*Dɳ?7<u?^t<?<"r?XɊ|?>q |?kI3>z?BL> Zy?g>⥴~zw?>ʂ'u?pv>Kr?.F>)ƴn?c>vɼj?>2մe?>}д_?>K>ϴ3.Y??ޛдQ?? :I?p?y`E??)?Ѥu5?[#4?x㴝+?=?дl!?qF?Mܴx?ML?dx?LL?l!?qF?[ߴ+?=?9մt5?\#4?CɴE??)?Y:I?p?|ʴQ??M4.Y??Դ_?>K>+ڴe?>uӴj?>]дn?c>\ִKr?.F>p⬴'u?pv>Ǵ}zw?>@GZy?g>㭴z?BL>F |?jI3>p|?>ZX.}?~>XH~?=[~?U]=w?=>O?=.a?~=p?-W=qL?&+4=B[?u=O(0?^3;Hų?o:Ϸ?I~?`I1?QN\Mc?.#?Vq?R1?WλfF?Rܻ@G2c?WmQ"E?ֻ20?n#"?2?(\Ҳ?2?c?C2? v?2#?:$0*?t2/?6?hs2;?᝱@?T2D?06F? 2H?ԗI? q2H? `ʪF?)2D?ʲ`"@? [2=?Է𻡽/7?ni22??P\.?f/2*?O%?|ۤ2$? ,K*$?Ȏ2&?a$y-?\28?I?]iV$2_?3ڲ}?޻ 1?cӻ ?|Ż6?"0??lIm|?U Y?SAs?Ar?#T?:!?g;.?J;[,w?mx|?]!>ǔW{?9>7z?R>Wx?n>˳v?*ņ>݈t?>׵qq?>ugm?>ϴi?`V>ôd?>δ^?S>ٴ/W? ?9 ޴ P?)?ȀG?k ?B2>?KY+?=ŴO4?5?ϴ"*?H??I> ?G?.0ٴ ?M?z?aK?#?eD?-?Q;?"ݴr7?1?n崴A?g1'?&UѴK?w?qS? Q?wHմZ??˫ݴoa?>d״6g?n>-Ѵk?A2>ȴo?>վv"s?D>C̴u?@ʎ>  x?`}>bI y?z_>ӧm6{?E>/KS|?4,>б3}?>E{}?]>ii~?̡=W'~?V=uK!?=f$^?ُ=#?kt=dS]?"M=Q?l+=Y9?(=ӳ?ul< ?+<T?4ғM?&'커22L?UF컀J?T컻22I?@E?mO2C?tVL>?_)2<?U8?$27? 6?=827?qM=?!𻜓|2E?hҒQ?r께\52d?8 8z²|?;߻1?(ֻ|S?`ʻ{0?>ce"?˚Pⰱ[?a D?ihc? %m?x^Ʋ?-G9?|M;o?;R?;`P?<#̳?VT<]?Vq<?ɯJ!|?a&>r{?=>`B=z?X>px?Ϡt>ވv?>Et?>Op?~>饻%6m?>´h?>д(c?Np>Ƚ̴:]?A?5*ִ{V?z ?C ˴nN??[F?5!?~v=?E,?״<3?z6?N")?+@?ӴQ?bH?BѴ,?N?oҴ?Y_I?c&?uB?b0?m9?e ڴ:?G/?B(I`D?;$?לsM??]ӴU?  ?ƚ.\??^?ڴ5c?R>[ٴh?/>kӴ%m?>ɴp?>´s t?ߜ>v?>Jx?s>cHz?l/W> *{?P==>%%@|?%>^4Mn}?>n~?=X~?^=T7~?ï=6n^5?!=/l?5=:xYZ?w}h=. 
L?C= B?#=92泫?o}=5&?<*?8H?ݱ4|x?yY?"F-$s?jq͹0?+?/Ij?2Zλ11?]pݻBX?8{G27?񻽊?T02 ?d?m͘2?eDz?Z2?@Rp?BJ2 ?v?C2?RiH,$?#6:2,?V5?2<?֯pB?2H?J 2M?뻚^2Q?(@S?4g2V?@-V?:2V?`zU?b>2T?\ $.R?2O?R`M?컺2K?xȷL?b삝2M?/N8`)O?ZmBq2U?dp_?B,T2m?/z5?۱ݻD2?Lֻxݲ?ܘ̻\1?% ?Mm@@?ۜ?b.x?Ą1$?NU?䴜?J2?tK:?K\+;!A?ᶜ;6??;O?](<:u?a<5ܳ?đ<=}?kifx~?=:X}?x>BP0K}?p>3q|?*>\{?B>qy?j\>EKx?\y>-v?s> ls?/>bnp?7ѯ>l?fG>9ʴ(h?>ftb?3>Ѵ\?p?״VU? ?DʹF/N?}?E?¯"?8QpHⴍ+e?H/>״Hj?^>3ʹPn?9>Vôr?W>yd u?2^>ZZ_w?%σ>uDDy?Oi>姴z?M>){?4>|?;>p1}? >&n=~? =_4~?']= t?=c;I?}=h``{?9=?9U\=H&?8=p?t=K|.?<ʳg?`o=?+2o?0Ժg?fβ?1:?O;(?*(;?S(?i&<5?_< ?=I䳎?^,=]%?`N=! ъ?t=Cn]?"*=eX)!?̨=LZJ~?=KHh~? =5}?>hG2}?Z"> tuR|?7,>N`6{?E>0噴y?V_>~x?|>Y1u?eD>(m[Hs?H^>H\p?>T´El?e>*ƴ|b?>ȴl\?=0?yU?/ ?zԴM?l,?MҴjE?"?󫴏JRYߴ}Fg?6>Ѵ+l?:>ҴPp?ر>\ȴTWs?>ȴv?>w,x?B{>ùy?^>6ȚiH{?C>|@l_|?+>-:}?>>F}?j>(dg"k~?K=%jz~?&r=F ?=bdv\?ܗ=Y!?u= .N?O=g?-.=5<?ht=.8ֳ?b<([?q<+?Kŗ< I?j<$F?.<Ө&?N;D??^;ѳ?V:#;2?:?B1 ?'0?t$H?P??,ɻu]?ٻ"1_?5:?G32 ?D) ?S(2?UD5?0͚2?I"29?|ȱD?(12M?y>&|V?黦2]?`9d? 廬*2i?bhmn?e(2q?@pt?K޽2w?Aw?i໠2y?.6a#y?|d2x?߻y?߻פ2{?u߻{̱|?޻xB2?ݻ?ۻ ;2?dٻ`?ջG2?mл"l?4bʻ,12? Ѳ?.1 ?LvqK?~0hz?'?6Tu.?I||L?ms?F9|:}?:B?yjr;?;,;?? <({q?x<}س?AE<آ?h<{G?}<ϳ?*=Z?\0=h= }? >x|=|?.>5搴{?'G>Qy?$a>Gcw?>)mu?;V>pLs?xf>Uo?.>l?>/Ŵg?"+>찼-\b?*$>ⷴY\?~P?ϴPU?^/ ?MݺQM? ?E?"?t6ߴ$+d?a'>V۴{i?>aٴm?>nôq?W>Ŵ~t?#>smw?0e>2x?2,n>z?SR>ࡴ<{?9>Tt|?_"> w}? >n ~?1=ٗ~?i=9M~?,=^h9?%='Co? =P?V!g= ?B=ȿ:?#=5p?=#>?o<3ﺳ?OqW?Ý默25?r?"?U= F?O|=bH=S?N=7($?:=)T~?X= LV~?}=c|}?.>hdT}?>팴^2|?/>{?H>间y?دb>[Cw?J>0kٸu?>As?7>Ꮄo?[>)l!l?h>yg?O>v?b?π>\??0U? ?}N?Q?3ѴQF?!?oÕ}=?_!,?(%4?85?˴b*?0>?P #!?F?KF?a M?' ̉.?F;?;;h6?b3?K+@?#")? ^I?? bR?8?>9'8Z?? a?>۴(f? D> մk?I>sǴ3o?6>˴"s?u?&ю>7x?}> vy?`>Z*{?YF>D|?0.>v璴g#}?>x}?H_>zY~?/&=N~?x;=Fk?ѩ=8/R?!=GW/?~=?vX=:N?5=8H?=&N?<ͳ?=`?5=9?W=*N+?5~=A/R?=MGL?=i8~?=fT~? 
i=2 T}?S>ު}?w>]Vmx3|?/>CS{?DG>ڙy?3b>Fhw?~> u?/>_4s?ԟ>p?$>VEbl?> h?G>IUb?W>:]??SݶV?g ?sO?n?-yG?t ?hȴ>?à*?%5?X4?=,?d=?e"?iE?'4?K?lJ44?5?<{@۴Ed?>Ki?y>Ҵn?u>Jϴq?ܨ>7٦t?{>w?!>aFx?m>ʉz?dR>[{?y9>8&V|?b#>z}?$>~?1V=66V}~?(=x#n~?=42?=Vh?=?(5m=< ?H=6?(=S(P?͹ =ѳ?<?X < ?;ҕ<`a?ch̐b }?>>@|?ߋ.>Wy'{?]HF>Sy?xO`>x?ɫ|>uv?&>֛zs?(>ݥsp?>`x1l? @>*(h?>|[c?>atk0^?N>보W? ? aP? ?豴I?i?K @?(?'Õ7?;2?LЁ.?*N;?f%?dC?J?I?8s:?j/? \B?&?(J?$?tZaS?i? [?m?޴\a?:>3g?u>7Ѵ]Pl?>v״^Dp?>TǶs? >.4´v?sߌ>':x?_z>zy?]>0pD{?D>xZU|?,>s,}?>R]a}?>^Z~?=֟l~?=̖=?-=êRO?Te= }?=ST<?5[=a f?J9='c?*=XK ?=?V=N#?bU|=U?=>?ͫ=,~?=[gb~?ܹ=@QQ}?*>r 3}? >XhZ|?I-,>꧊K{?uC>cFy?\>}rTx?x>PTv?e>U`s?0>6Ip?L> im?־>i?>~d?>op|_?C>۠\nY? ?aեmR?t?0K??çC?%?}5_:?/? U%T1?8?mc(?@?ӆu ?/G?8A?'?>!.tH?:?]0P?A? X?}?_?U>OԤe?E>oҴj?J>b۴Un?ҩ>lr?>ô=u?&>pw?Â>4*Uy?.h>vz?M>t {?5>쐄|?>ą}}?Z >_&~?=jg~?U=oF^~?E=iV4?4=J'3i?$ۊ= -:Y?m= ѯ?ZJ=a$?>5+=eR?u=l\? <|?G<?#Ϝ[<?ܻ2?ٻhjx?͉ջ2?8һ??׸λeס2?˻}/?Ȼg2?Ż0?3Sû=02? P1?m龻9l2?R0?=.2?0? ~2?B/?@괻Y2?.ܰ ?6^_2?⮻8Ө?'<2"?qH-?\M2;?/?L?Q1_? u?0?{>s|Ŷ?BUz?0V?4&&?6I?Tцꚲ?Hg:<?z;L㲆?z;Ųf?;#?9= ?=QE~?=[9q~?r=^`}?n>WO}?>t~u|?(>jnT~{?J?>G;z?S"X>'9x?0-s>~œ[v?h5>ĭ:yt?`>q?>Jsn?J>kj?>JZ1f?>a?>ĘX[??ʒeT?-?S}M?QY?'猴E?a"?=?,?F4?~U5?ʁ+?i=?2$?'D?f0NiH?H?G$t#O?{n?U|V?l ?]?ʡ>>nd?>㴇ci?c> m?u>M´{q?,>jnʹtt?>XVv?d>ax?!q>LYz?U>՗X{?N=>y|?U'>gCX}?>b}? >Rjo~?=PI~?l=?R?M;=l ,[T?/!=0=Ԁ? =SѢ?XZ=G%?59=o? =Dg?%=X̳?6o2?U$?bC=yJ~? |=n@~?=oxau}?>XW_Ƴ|?<#>{?9>E}ّz?Q>oy?k>|GTw?>1r?z>o?>Vk?>򤴗g?E>lc?y>B§]?*?9W?B0 ?ŏճP? A?t:I???.~%A?(?"8?W1?Q 0?e9? (?|@? cO?$m?,U? ?Lj\??mb?>:_h?I>>$l?>Tشp?9:>>δs?w>@0Sv?m>tұvYx?xx> Vzy?\> F{?mC>|6N|?,G->|!}?>f]}?>$oaJ~?G=I:~?`=MP?D="G)0@?ۓ=ba;p? h=c ܕ?;i=7%T?j]G=*?l)= "?=ճ2?əNG2??z0@E?~Ӛ[.2L? %MS?#锻<2\?pg?t@t1s?LՆF??H S1?MoR?[@C?YB諑?6$L?x `Ȳ? 魺 I???t9^d?PY:g#?23;j?;GJ?;<?(9<~h?+ٺ^'?膺#@Ӳ?'蓹`?.:?6:?a<;>)d?΍;ժ?AE;TU?"K06}?d>jAj|? *>kl{?)@>x4z?YX>%k~Mx?r>&v?0>gt?> 1r?$ߥ>q-o?+>s2Ъk?4>g?>q1jb?>@x ]?N?R2"W?s ?_Q?B?)I??I8A?'?_Eh 9?/?e/2?7?gT^?>0Pc?R>g?>:l?K>tIo?>m>;gbs?m>ʴ5u?y2>POw?H >$oy?df>Nz?3M>]{?(]6>[|?^!>Tp(}}?>fhk ~?=fOL|~?T=+N~? 
(=:?ª=W:R?b=L}?H=/%_?L_=\ַ?+@=( M?;$=|?Z =&?{?r7=kc ?#pV=zC?y= x\?= &?A=0>)~? =6k~? ="@y~?=VD}?W>OJ|?ό!>kc"{?L6>mwz?L>с~y?'je>8jyw?>≴u?6 >…s?>9 p?>5B3m?>uDj?>焴e?<>ř_a?>u [??D'UzU?J ?Hn/N??υk;G? ?P5??)?#{8?\E1?pH4oh?\> "zl?> o?J>].r?K>شlu?>oϴow?>(#89y?3 j>3Wۜz?DP>y{?*':>â|?w%>dN{[}?N>w}?>uTd~?H=qOW~?=x@( ?9=yw:C?=(q?8=z(6?j=G'?fJ=?x.=? =T?<9ٳ?<᳽?7̰<?*<óK?2|j< ?;Q9,; fs?:4?*M??Ta,?u C䲓?mP p?qO? Dݲ4?g?7? ? XBm?Z?-.ؤ?76?爰?1 r??0|0?@Q*?Wy1f? 002l?+: 1p?'2t?4ש1x?󃻐y2|? 1?^ 1?{9Q1?tw 1?rc1?Wl1?eٯ?-^:G*1?~$UK?4`J0 /?{=Wͱ?L.HY? S*?vz?Fֺz?"R?3߬?^8p?i:A ?VI:wղ?w@;j?;PB?H;*<(? ;,=?ė> Y,}?>Qbye|?5+>mo{?x@>qCz?܉W>#x?`mp>4}"w?Y>Vu?͓>$r?X>do?ֲ>l?،>@h?f>fpd?>Alx_?&>ćy4Y?v?/}S?h-?Ω\L? ?]tBE?+#?(MUi>? *?~Fj?g>;el?j>w$do?x>r?a> t?>J\w?>9Oдx?ӕp>1⹴Ez?rdW>)q{?W@>7c|?\]+>&}?UW> }?A">)my?~?Y2=Nxc~?g/=őPF~?S=Df/?VF=7L`?=>/?K/y=Z!?`X=r8?;=Ҡ ? =a-?J =e?=p g?=(]=?=!fU?8=,?k=8Z~?XP=3F~~? =%P ~?=ȿZ$}?S>_f|?w!>0rS{?a'5>RO} z?J>sy?azb> $x?Z{>}Sv?ak>IU,t?ϙ>*Cq? >n?>Bk?>[Lg?{m>Pb?>/q]?2?ၴW? ?st:MQ?0h?lngJ?"?܇C?y$?w# End: Data binary 4 # End: Segment mumax3-3.10/doc/static/000077500000000000000000000000001371432437400147135ustar00rootroot00000000000000mumax3-3.10/doc/static/api39c.html000066400000000000000000003023321371432437400166740ustar00rootroot00000000000000 mumax3

Warning! This is the API for mumax3.9c, which is no longer supported. If you would like to use mumax3, we strongly recommend using mumax3.10.

mumax 3.9c API

This is a complete overview of all available functions for writing an input script.

Syntax

The mumax3 input syntax is a subset of Go's syntax, somewhat similar to C. It is case-independent however, so msat is the same as Msat or MSAT.

Defining variables

New variables are declared using :=. Variables have a fixed type, inferred from the declaration's right-hand-side. Assigning to existing variables is done using =. E.g.:
i := 7         // defines a new variable i, type automatically detected to be int
print(i)       // now we can use i
i = 5          // assign new value, don't use ':=' (attempt to re-declare)

str := "hello" // defines str, type automatically is string
//str = 1      // would fail, cannot assign int to string

Arithmetic

Most common arithmetic operations are possible. Also Go's math library and some common constants are available. For raise-to-the-power, pow(x,y) should be used.
x := pi*(3+4)/5
x = pow(x, 3)
x++
y := abs(cbrt(cosh(erf(erfc(gamma(J0(Y0(2))))))))

Control structures

Loops are possible as well:
for i:=0; i<10; i++{
	print(i)
}

Implicit functions

Some of the API features accept a function as an argument (e.g.: RunWhile(func()bool), or all input parameters). In that case, and only in that case, the argument is implicitly converted to a function, which is re-evaluated each time it's needed. E.g.:
value := sin(pi*t)  // value is a float64, RHS evaluated only once
Msat = value        // time-independent Msat
versus:
Msat = sin(pi*t)    // RHS converted to function, re-evaluated every time

Setting the mesh size

The simulation mesh defines the size of the box around your magnet. It should be set at the beginning of the script. The number of cells should preferably be powers of two, or at least have small prime factors (2,3,5,7). E.g.:
Nx := 128
Ny := 64
Nz := 2
sizeX := 500e-9
sizeY := 250e-9
sizeZ := 10e-9
SetGridSize(Nx, Ny, Nz)
SetCellSize(sizeX/Nx, sizeY/Ny, sizeZ/Nz)

Periodic boundary conditions

Optionally, periodic boundary conditions can be enabled:
SetPBC(5, 0, 0)        // 5 extra images on left and right sides.
SetGridSize(128, 64, 1)
SetCellSize(5e-9, 5e-9, 5e-9)
Setting a nonzero PBC value in a direction enables wrap-around in that direction. The precise value passed determines how many repetitions are seen by the demag field. E.g., in the above example the demag field behaves as if 5 repetitions are present to the left and to the right side. Choosing a large number may cause long initialization time.

Resizing the mesh

The mesh can be changed at any later time in the simulation. This will cause the magnetization to be stretched onto the new mesh if needed, and the geometry and regions to be re-calculated. After a resize, some cells which had zero magnetization may now fall inside the magnet geometry; they will be initialized to random magnetization.

SetCellSize(float64, float64, float64)

Sets the X,Y,Z cell size in meters

examples: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

SetGridSize(int, int, int)

Sets the number of cells for X,Y,Z

examples: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

SetMesh(int, int, int, float64, float64, float64, int, int, int)

Sets GridSize, CellSize and PBC at once

SetPBC(int, int, int)

Sets number of repetitions in X,Y,Z


Setting a geometry

Optionally a magnet Shape other than the full simulation box can be specified. One can specify primitive shapes, constructed at the origin (box center), and translate/rotate them if needed. All positions are specified in meters and the origin lies in the center of the simulation box. E.g.:
 SetGeom(cylinder(400e-9, 20e-9).RotX(45*pi/180).Transl(1e-6,0,0))

SetGeom(Shape)

Sets the geometry to a given shape

examples: [4] [6] [7] [8] [9] [11] [12] [14]

EdgeSmooth

Geometry edge smoothing with edgeSmooth^3 samples per cell, 0=staircase, ~8=very smooth

examples: [4]

Cell(int, int, int) Shape

Single cell with given integer index (i, j, k)

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

Circle(float64) Shape

2D Circle with diameter in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4] [6] [7] [8] [12]

Cuboid(float64, float64, float64) Shape

Cuboid with sides in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4]

Cylinder(float64, float64) Shape

3D Cylinder with diameter and height in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4] [5] [6]

Ellipse(float64, float64) Shape

2D Ellipse with axes in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [14]

Ellipsoid(float64, float64, float64) Shape

3D Ellipsoid with axes in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4]

GrainRoughness(float64, float64, float64, int) Shape

Grainy surface with different heights per grain

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

ImageShape(string) Shape

Use black/white image as shape

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4]

Layer(int) Shape

Single layer (along z), by integer index starting from 0

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4] [13] [14]

Layers(int, int) Shape

Part of space between cell layer1 (inclusive) and layer2 (exclusive), in integer indices

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4]

Rect(float64, float64) Shape

2D rectangle with size in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4] [6] [9] [11] [12] [15]

Square(float64) Shape

2D square with size in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [6]

Universe() Shape

Entire space

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

XRange(float64, float64) Shape

Part of space between x1 and x2, in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

examples: [4] [7]

YRange(float64, float64) Shape

Part of space between y1 and y2, in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  

ZRange(float64, float64) Shape

Part of space between z1 and z2, in meter

methods: Add( Shape )   Intersect( Shape )   Inverse( )   Repeat( float64 float64 float64 )   RotX( float64 )   RotY( float64 )   RotZ( float64 )   Scale( float64 float64 float64 )   Sub( Shape )   Transl( float64 float64 float64 )   Xor( Shape )  


Defining material regions

Optionally, up to 256 material regions can be defined. Since each cell is made from one material, it is associated with exactly one region. So regions can not overlap. Each cell is assigned material region 0 by default. It's a good idea to output regions to verify whether each cell is assigned to the intended region. Each region can have its own material parameters, and we can output averages over each region. E.g.:
DefRegion(1, circle(1e-6))
DefRegion(0, circle(1e-6).Inverse()) // redundant
save(regions)
Msat.SetRegion(1, 800e3)
tableAdd(m.Region(1))    // add average m over region 1 to table

DefRegion(int, Shape)

Define a material region with given index (0-255) and shape

examples: [7] [12] [13]

DefRegionCell(int, int, int, int)

Set a material region in one cell by index

regions

Outputs the region index for each cell

methods: Average( )   GetCell( int int int )   Gpu( )   HostArray( )   HostList( )   LoadFile( string )   SetCell( int int int int )  

examples: [7] [12]


Initial magnetization

The initial magnetization is set by assigning a Config to m, setting it in separate regions, or by loading a file directly.
m = uniform(1, 0, 0)
m.SetRegion(1, vortex(1, 1))
m.LoadFile("config.ovf")

m

Reduced magnetization (unit length)

methods: Average( )   Buffer( )   Comp( int )   GetCell( int int int )   LoadFile( string )   Region( int )   Set( Config )   SetArray( Slice )   SetCell( int int int data.Vector )   SetInShape( Shape Config )   SetRegion( int Config )   TableData( )  

examples: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

Antivortex(int, int) Config

Antivortex magnetization with given circulation and core polarization

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

BlochSkyrmion(int, int) Config

Bloch skyrmion magnetization with given chirality and core polarization

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [5]

NeelSkyrmion(int, int) Config

Néel skyrmion magnetization with given charge and core polarization

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [5]

RandomMag() Config

Random magnetization

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [3] [5]

RandomMagSeed(int) Config

Random magnetization with given seed

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

TwoDomain(float64, float64, float64, float64, float64, float64, float64, float64, float64) Config

Two-domain magnetization with given magnetization in left domain, wall, and right domain

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [5] [10] [11]

Uniform(float64, float64, float64) Config

Uniform magnetization in given direction

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [1] [2] [5] [6] [7] [13] [14] [15]

Vortex(int, int) Config

Vortex magnetization with given circulation and core polarization

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [5] [8] [9] [12]

VortexWall(float64, float64, int, int) Config

Vortex wall magnetization with given mx in left and right domain and core circulation and polarization

methods: Add( float64 Config )   RotZ( float64 )   Scale( float64 float64 float64 )   Transl( float64 float64 float64 )  

examples: [5]


Material parameters

Assigning to a material parameter sets a value in all regions. E.g.:
Msat  = 800e3
AnisU = vector(1, 0, 0)
When regions are defined, they can also be set region-wise:
Msat.SetRegion(0, 800e3)
Msat.SetRegion(1, 540e3)
Material parameters can be functions of time as well. E.g.:
f := 500e6
Ku1 = 500 * sin(2*pi*f*t)

Aex

Exchange stiffness (J/m)

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [1] [2] [3] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

alpha

Landau-Lifshitz damping constant

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [1] [6] [7] [8] [10] [11] [12] [14] [15]

anisC1

Cubic anisotropy direction #1

methods: Average( )   Comp( int )   GetRegion( int )   IsUniform( )   Region( int )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [12]

anisC2

Cubic anisotropy direction #2

methods: Average( )   Comp( int )   GetRegion( int )   IsUniform( )   Region( int )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [12]

anisU

Uniaxial anisotropy direction

methods: Average( )   Comp( int )   GetRegion( int )   IsUniform( )   Region( int )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [7] [10] [15]

Dbulk

Bulk Dzyaloshinskii-Moriya strength (J/m2)

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

Dind

Interfacial Dzyaloshinskii-Moriya strength (J/m2)

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

EpsilonPrime

Slonczewski secondary STT term ε'

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [14]

FixedLayer

Slonczewski fixed layer polarization

methods: Average( )   Comp( int )   GetRegion( int )   IsUniform( )   Region( int )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [14]

frozenspins

Defines spins that should be fixed

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

Kc1

1st order cubic anisotropy constant (J/m3)

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [12]

Kc2

2nd order cubic anisotropy constant (J/m3)

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

Kc3

3rd order cubic anisotropy constant (J/m3)

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

Ku1

1st order uniaxial anisotropy constant (J/m3)

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [7] [10] [15]

Ku2

2nd order uniaxial anisotropy constant (J/m3)

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

Lambda

Slonczewski Λ parameter

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [14]

Msat

Saturation magnetization (A/m)

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [1] [2] [3] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

NoDemagSpins

Disable magnetostatic interaction per-spin (set to 1 to disable)

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

Pol

Electrical current polarization

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [5] [10] [11] [14]

Temp

Temperature (K)

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

xi

Non-adiabaticity of spin-transfer-torque

methods: Average( )   GetRegion( int )   IsUniform( )   Region( int )   Set( float64 )   SetRegion( int ScalarFunction )   SetRegionFuncGo( int func() float64 )   SetRegionValueGo( int float64 )  

examples: [10] [11] [12]


Excitation

Field or current excitations can be set in the same way as material parameters:
B_ext = vector(0.01, 1e-6*sin(2*pi*f*t), 0)
B_ext.SetRegion(1, vector(0, 0, 0.1))
Additionally, an arbitrary number of time- and space-dependent vector fields of the form g(x,y,z) * f(t) may be added. (E.g., to simulate the field of an antenna or an arbitrary current running through the magnet)
B_ext.Add(LoadFile("antenna.ovf"), sin(2*pi*f*t))
JPol.Add(LoadFile("current.ovf"), 1)

B_ext

Externally applied field (T)

methods: Add( Slice ScalarFunction )   AddGo( Slice func() float64 )   AddTo( Slice )   Average( )   Comp( int )   IsUniform( )   Region( int )   RemoveExtraTerms( )   Set( data.Vector )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [1] [3] [15]

J

Electrical current density (A/m2)

methods: Add( Slice ScalarFunction )   AddGo( Slice func() float64 )   AddTo( Slice )   Average( )   Comp( int )   IsUniform( )   Region( int )   RemoveExtraTerms( )   Set( data.Vector )   SetRegion( int VectorFunction )   SetRegionFn( int func() [3]float64 )  

examples: [10] [11] [12] [13] [14] [15]

Index2Coord(int, int, int) data.Vector

Convert cell index to x,y,z coordinate in meter

methods: Add( data.Vector )   Cross( data.Vector )   Div( float64 )   Dot( data.Vector )   Len( )   MAdd( float64 data.Vector )   Mul( float64 )   Sub( data.Vector )   X( )   Y( )   Z( )  

examples: [15]

LoadFile(string) Slice

Load a data file (ovf or dump)

methods: CPUAccess( )   Comp( int )   DevPtr( int )   Disable( )   Free( )   GPUAccess( )   Get( int int int int )   Host( )   HostCopy( )   Index( int int int )   IsNil( )   Len( )   MemType( )   Scalars( )   Set( int int int int float64 )   SetScalar( int int int float64 )   SetVector( int int int data.Vector )   Size( )   Tensors( )   Vectors( )  

examples: [5]

NewSlice(int, int, int, int) Slice

Makes a 4D array of scalars with given ncomp,x,y,z size

methods: CPUAccess( )   Comp( int )   DevPtr( int )   Disable( )   Free( )   GPUAccess( )   Get( int int int int )   Host( )   HostCopy( )   Index( int int int )   IsNil( )   Len( )   MemType( )   Scalars( )   Set( int int int int float64 )   SetScalar( int int int float64 )   SetVector( int int int data.Vector )   Size( )   Tensors( )   Vectors( )  


Magnetic Force Microscopy

Mumax3 has built-in generation of MFM images from a 2D magnetization. The MFM tip lift can be freely chosen. By default the tip magnetization is modeled as a point monopole at the apex. This is sufficient for most situations. Nevertheless, it is also possible to model partially magnetized tips by setting MFMDipole to the magnetized portion of the tip, in meters. E.g., if only the first 20nm of the tip is (vertically) magnetized, set MFMDipole=20e-9.

MFM

MFM image

methods: Average( )   Region( int )   Set( Slice )  

examples: [9]

MFMDipole

Height of vertically magnetized part of MFM tip

MFMLift

MFM lift height

examples: [9]


Output quantities

The quantities listed below can be output. Also, derived quantities can be produced: the quantity restricted to a certain region or a single component. E.g.:
m           // magnetization quantity
m.Comp(0)   // x-component
m.Region(1) // magnetization in region 1 (0 elsewhere)

B_anis

Anisotropy field (T)

methods: AddTo( Slice )   Average( )   Comp( int )   Region( int )   Set( Slice )  

B_demag

Magnetostatic field (T)

methods: Average( )   Comp( int )   Region( int )   Set( Slice )  

B_eff

Effective field (T)

methods: Average( )   Comp( int )   Region( int )   Set( Slice )  

B_exch

Exchange field (T)

methods: AddTo( Slice )   Average( )   Comp( int )   Region( int )   Set( Slice )  

B_therm

Thermal field (T)

methods: AddTo( Slice )  

dt

Time Step (s)

methods: Average( )   Get( )  

E_anis

Anisotropy energy (uni+cubic) (J)

methods: Average( )   Get( )  

E_demag

Magnetostatic energy (J)

methods: Average( )   Get( )  

E_exch

Exchange energy (normal+DM) (J)

methods: Average( )   Get( )  

E_therm

Thermal energy (J)

methods: Average( )   Get( )  

E_total

Total energy (J)

methods: Average( )   Get( )  

examples: [13]

E_Zeeman

Zeeman energy (J)

methods: Average( )   Get( )  

Edens_anis

Anisotropy energy density (uni+cubic) (J/m3)

methods: AddTo( Slice )   Average( )   Region( int )   Set( Slice )  

Edens_demag

Magnetostatic energy density (J/m3)

methods: AddTo( Slice )   Average( )   Region( int )   Set( Slice )  

Edens_exch

Exchange energy density (normal+DM) (J/m3)

methods: AddTo( Slice )   Average( )   Region( int )   Set( Slice )  

Edens_therm

Thermal energy density (J/m3)

methods: AddTo( Slice )   Average( )   Region( int )   Set( Slice )  

Edens_total

Total energy density (J/m3)

methods: Average( )   Region( int )   Set( Slice )  

Edens_Zeeman

Zeeman energy density (J/m3)

methods: AddTo( Slice )   Average( )   Region( int )   Set( Slice )  

ExchCoupling

Average exchange coupling with neighbors (arb.)

methods: Average( )   Region( int )   Set( Slice )  

examples: [12]

geom

Cell fill fraction (0..1)

methods: Average( )   Gpu( )  

examples: [4] [6] [7] [8] [9] [11] [12] [14]

LastErr

Error of last step

methods: Average( )   Get( )  

LLtorque

Landau-Lifshitz torque/γ0 (T)

methods: Average( )   Comp( int )   Region( int )   Set( Slice )  

m_full

Unnormalized magnetization (A/m)

methods: Average( )   Comp( int )   Region( int )   Set( Slice )  

MaxAngle

Maximum angle between neighboring spins (rad)

methods: Average( )   Get( )  

maxTorque

Maximum torque/γ0, over all cells (T)

methods: Average( )   Get( )  

NEval

Total number of torque evaluations

methods: Average( )   Get( )  

PeakErr

Overall maximum error per step

methods: Average( )   Get( )  

spinAngle

Angle between neighboring spins (rad)

methods: Average( )   Region( int )   Set( Slice )  

STtorque

Spin-transfer torque/γ0 (T)

methods: AddTo( Slice )   Average( )   Comp( int )   Region( int )   Set( Slice )  

torque

Total torque/γ0 (T)

methods: Average( )   Comp( int )   Region( int )   Set( Slice )  


Slicing and dicing output

To save storage space, it's possible to save only the part of the output we're interested in. This works on all output quantities (not only m):
save(m)                         // save full magnetization
save(m.Comp(0))                 // save only x-component
save(CropLayer(m, 13))          // save only layer 13
save(CropLayer(m.Comp(0), 13))  // save only x-component of layer 13
Or even:
mx   := m.Comp(0)
mx13 := CropLayer(mx, 13) 
save(mx13)
tableAdd(mx13)

Crop(Quantity, int, int, int, int, int, int) *cropped

Crops a quantity to cell ranges [x1,x2[, [y1,y2[, [z1,z2[

methods: Average( )  

examples: [8]

CropLayer(Quantity, int) *cropped

Crops a quantity to a single layer

methods: Average( )  

CropX(Quantity, int, int) *cropped

Crops a quantity to cell ranges [x1,x2[

methods: Average( )  

CropY(Quantity, int, int) *cropped

Crops a quantity to cell ranges [y1,y2[

methods: Average( )  

examples: [8]

CropZ(Quantity, int, int) *cropped

Crops a quantity to cell ranges [z1,z2[

methods: Average( )  


Scheduling output

All input and output quantities (as described above) can be saved in a space-dependent way (".ovf" file), or as spatial averages (table output). The data table ("table.txt") contains by default the time and average magnetization. More columns can be added with TableAdd().
save(B_ext)

tableadd(B_ext)
tablesave()
Optionally, the output/averaging can be done over a single region:
save(m.Region(1))
TableAdd(m.Region(1)) 
User-defined variables can be added to the table with TableAddVar().
myField := 0.42
TableAddVar(myField, "B_extra", "T")
myField = ...

AutoSave(Quantity, float64)

Auto save space-dependent quantity every period (s).

examples: [1] [10] [11] [14] [15]

AutoSnapshot(Quantity, float64)

Auto save image of quantity every period (s).

FilenameFormat

printf formatting string for output filenames.

Fprintln(string, ...interface {})

Print to file

OutputFormat

Format for data files: OVF1_TEXT, OVF1_BINARY, OVF2_TEXT or OVF2_BINARY

OVF1_BINARY

OutputFormat = OVF1_BINARY sets binary OVF1 output

OVF1_TEXT

OutputFormat = OVF1_TEXT sets text OVF1 output

OVF2_BINARY

OutputFormat = OVF2_BINARY sets binary OVF2 output

OVF2_TEXT

OutputFormat = OVF2_TEXT sets text OVF2 output

Print(...interface {})

Print to standard output

examples: [2]

Save(Quantity)

Save space-dependent quantity once, with auto filename

examples: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

SaveAs(Quantity, string)

Save space-dependent quantity with custom filename

examples: [4] [5] [7] [9]

Snapshot(Quantity)

Save image of quantity

SnapshotFormat

Image format for snapshots: jpg, png or gif.

sprint(...interface {}) string

Print all arguments to string with automatic formatting

sprintf(string, ...interface {}) string

Print to string with C-style formatting.

TableAdd(TableData)

Add quantity as a column to the data table.

examples: [3] [11] [13]

TableAddVar(ScalarFunction, string, string)

Add user-defined variable + name + unit to data table.

TableAutoSave(float64)

Auto-save the data table every period (s). Zero disables save.

examples: [1] [11] [14]

TablePrint(...interface {})

Print anything in the data table

TableSave()

Save the data table right now (appends one line).

examples: [3] [13]


Running

Run(time) runs the simulation for a given time in seconds, using sensible error settings.
Run(1e-9)
More fine-grained control is provided by RunWhile(condition), which runs as long as an arbitrary condition is met. E.g.:
mx := m.comp(0)
RunWhile(mx.average() < 0)   // search for switching field during reversal
Optionally, the solver accuracy may be fine-tuned. E.g.:
MaxDt = 1e-12
MinDt = 1e-15
MaxErr = 1e-6
Optionally, a different solver may be chosen (at any point) with SetSolver(int). Currently available solver types:
  • 5: RK45 (Dormand-Prince) solver (the default). An accurate solver, very fast for magnetization dynamics at the cost of some memory usage.
  • 4: Classical 4th-order Runge-Kutta method. Intended for simulations where a fixed, relatively large time step is desired.
  • 3: RK23 (Bogacki-Shampine) solver. A robust and reasonably fast solver with low memory requirements. Typically outperforms RK45 when relaxing the magnetization with little dynamics, so it is used internally by Relax().
  • 2: Adaptive Heun solver. Robust and uses very little memory but takes smaller time steps than the higher-order solvers. Also suited when a fixed, relatively small time step is desired.
  • 1: Euler solver (requires FixDt = ..., ignores other settings). Only useful in exceptional situations or for debugging.
E.g.:
SetSolver(2) // Heun
FixDt = 1e-15

Relax

Relax() tries to evolve the magnetization as closely as possible to the minimum energy state. This function assumes all excitations have been turned off (temperature, electrical current, time-dependent magnetic fields). During relax precession is disabled and the time t does not increase. There is no need to set high damping.

In general it is difficult to be sure the minimum energy state has been truly reached. Hence, relax may occasionally return after the energy has reached a local minimum, a saddle point, or a rather flat valley in the energy landscape.

Minimize

Minimize() is like Relax, but uses the conjugate gradient method to find the energy minimum. It is usually much faster than Relax, but is a bit less robust against divergence. E.g., a random starting configuration can be Relaxed, but may fail with Minimize. Minimize is very well suited for hysteresis calculations, where we are never far away from the ground state.

Minimize()

Use steepest conjugate gradient method to minimize the total energy

examples: [3] [6]

Relax()

Try to minimize the total energy

examples: [1] [2] [3] [9] [10] [11]

Run(float64)

Run the simulation for a time in seconds

examples: [1] [7] [10] [11] [12] [14] [15]

RunWhile(func() bool)

Run while condition function is true

Steps(int)

Run the simulation for a number of time steps

FixDt

Set a fixed time step, 0 disables fixed step

Headroom

Solver headroom

MaxDt

Maximum time step the solver can take (s)

MaxErr

Maximum error per step the solver can tolerate

MinDt

Minimum time step the solver can take (s)

MinimizerSamples

Number of max dM to collect for Minimize convergence check.

MinimizerStop

Stopping max dM for Minimize

examples: [3]

step

Total number of time steps taken

examples: [3] [10]

t

Total simulated time (s)

examples: [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] [15]

SetSolver(int)

Set solver type. 1:Euler, 2:Heun, 3:RK23 (Bogacki-Shampine), 4:RK4 (classical Runge-Kutta), 5:RK45 (Dormand-Prince, the default)


Moving simulation window

Mumax3 can automatically shift the magnetization so that the simulation "window" stays centered on a region of interest. Shifting is done to keep a freely chosen magnetization component nearly zero. E.g.
ext_centerwall(0)
ext_rmSurfaceCharge(0, -1, 1)
TableAdd(TotalShift)
will try to keep mx (component 0, counting from 0) close to zero. If desired, one can override which "new" magnetization is inserted from the sides by setting ShiftMagL and ShiftMagR, though the default behaviour is usually OK.

Shift(int)

Shifts the simulation by +1/-1 cells along X

examples: [15]

ShiftGeom

Whether Shift() acts on geometry

ShiftM

Whether Shift() acts on magnetization

examples: [15]

ShiftMagL

Upon shift, insert this magnetization from the left

methods: Add( data.Vector )   Cross( data.Vector )   Div( float64 )   Dot( data.Vector )   Len( )   MAdd( float64 data.Vector )   Mul( float64 )   Sub( data.Vector )   X( )   Y( )   Z( )  

ShiftMagR

Upon shift, insert this magnetization from the right

methods: Add( data.Vector )   Cross( data.Vector )   Div( float64 )   Dot( data.Vector )   Len( )   MAdd( float64 data.Vector )   Mul( float64 )   Sub( data.Vector )   X( )   Y( )   Z( )  

examples: [15]

ShiftRegions

Whether Shift() acts on regions

TotalShift

Amount by which the simulation has been shifted (m).


Extensions

Extensions are extra functionalities that are not officially supported. They are aimed at rather specific problems and may not work as expected for your particular situation. Their API and functionality may change in future releases.

ext_bubbledist

Bubble traveled distance (m)

methods: Average( )   Get( )  

ext_bubblepos

Bubble core position (m)

methods: Average( )   Get( )  

ext_bubblespeed

Bubble velocity (m/s)

methods: Average( )   Get( )  

ext_centerWall(int)

centerWall(c) shifts m after each step to keep m_c close to zero

examples: [10] [11]

ext_corepos

Vortex core position (x,y) + polarization (z) (m)

methods: Average( )   Get( )  

ext_dwpos

Position of the simulation window while following a domain wall (m)

methods: Average( )   Get( )  

examples: [11]

ext_dwspeed

Speed of the simulation window while following a domain wall (m/s)

methods: Average( )   Get( )  

ext_dwtilt

PMA domain wall tilt (rad)

methods: Average( )   Get( )  

ext_EnableUnsafe()

Allow potentially unsafe features, at your own risk.

ext_makegrains(float64, int, int)

Voronoi tessellation (grain size, num regions, random seed)

examples: [12] [15]

ext_rmSurfaceCharge(int, float64, float64)

Compensate magnetic charges on the left and right sides of an in-plane magnetized wire. Arguments: region, mx on left and right side, resp.

examples: [11]

ext_ScaleExchange(int, int, float64)

Re-scales exchange coupling between two regions.

examples: [12] [13] [15]

ext_topologicalcharge

2D topological charge

methods: Average( )   Get( )  

ext_topologicalchargedensity

2D topological charge density m·(∂m/∂x × ∂m/∂y) (1/m²)

methods: Average( )   Region( int )   Set( Slice )  

EnableDemag

Enables/disables demag (default=true)

Expect(string, float64, float64, float64)

Used for automated tests: checks if a value is close enough to the expected value

ExpectV(string, data.Vector, data.Vector, float64)

Used for automated tests: checks if a vector is close enough to the expected value


Misc

Other available functions.

abs(float64) float64

acos(float64) float64

acosh(float64) float64

asin(float64) float64

asinh(float64) float64

atan(float64) float64

atan2(float64, float64) float64

atanh(float64) float64

cbrt(float64) float64

ceil(float64) float64

cos(float64) float64

examples: [6] [13] [14]

cosh(float64) float64

DemagAccuracy

Controls accuracy of demag kernel

DisableSlonczewskiTorque

Disables Slonczewski torque (default=false)

DisableZhangLiTorque

Disables Zhang-Li torque (default=false)

DoPrecess

Enables LL precession (default=true)

DotProduct(Quantity, Quantity) *dotProduct

Dot product of two vector quantities

methods: Average( )  

DUMP

OutputFormat = DUMP sets text DUMP output

erf(float64) float64

erfc(float64) float64

Exit()

Exit from the program

exp(float64) float64

examples: [15]

exp2(float64) float64

expm1(float64) float64

false

floor(float64) float64

Flush()

Flush all pending output to disk.

gamma(float64) float64

GammaLL

Gyromagnetic ratio in rad/Ts

heaviside(float64) float64

hypot(float64, float64) float64

ilogb(float64) int

examples: [2]

inf

examples: [4] [7] [11]

isInf(float64, int) bool

isNaN(float64) bool

j0(float64) float64

j1(float64) float64

jn(int, float64) float64

ldexp(float64, int) float64

log(float64) float64

examples: [2] [4]

log10(float64) float64

log1p(float64) float64

log2(float64) float64

logb(float64) float64

examples: [2]

max(float64, float64) float64

examples: [3] [12]

min(float64, float64) float64

examples: [3] [6]

mod(float64, float64) float64

Mu0

Permeability of vacuum (Tm/A)

examples: [2]

NewScalarMask(int, int, int) Slice

Makes a 3D array of scalars

methods: CPUAccess( )   Comp( int )   DevPtr( int )   Disable( )   Free( )   GPUAccess( )   Get( int int int int )   Host( )   HostCopy( )   Index( int int int )   IsNil( )   Len( )   MemType( )   Scalars( )   Set( int int int int float64 )   SetScalar( int int int float64 )   SetVector( int int int data.Vector )   Size( )   Tensors( )   Vectors( )  

NewVectorMask(int, int, int) Slice

Makes a 3D array of vectors

methods: CPUAccess( )   Comp( int )   DevPtr( int )   Disable( )   Free( )   GPUAccess( )   Get( int int int int )   Host( )   HostCopy( )   Index( int int int )   IsNil( )   Len( )   MemType( )   Scalars( )   Set( int int int int float64 )   SetScalar( int int int float64 )   SetVector( int int int data.Vector )   Size( )   Tensors( )   Vectors( )  

examples: [15]

norm(float64) float64

Standard normal distribution

examples: [5] [12]

now() time.Time

Returns the current time

methods: Add( time.Duration )   AddDate( int int int )   After( time.Time )   Before( time.Time )   Clock( )   Date( )   Day( )   Equal( time.Time )   Format( string )   GobEncode( )   Hour( )   ISOWeek( )   In( *time.Location )   IsZero( )   Local( )   Location( )   MarshalBinary( )   MarshalJSON( )   MarshalText( )   Minute( )   Month( )   Nanosecond( )   Round( time.Duration )   Second( )   Sub( time.Time )   Truncate( time.Duration )   UTC( )   Unix( )   UnixNano( )   Weekday( )   Year( )   YearDay( )   Zone( )  

pi

examples: [4] [5] [6] [11] [13] [14] [15]

pow(float64, float64) float64

examples: [2] [15]

pow10(int) float64

rand() float64

Random number between 0 and 1

examples: [3] [5] [12] [15]

randExp() float64

Exponentially distributed random number between 0 and +inf, mean=1

randInt(int) int

Random non-negative integer

randNorm() float64

Standard normal random number

examples: [12]

randSeed(int)

Sets the random number seed

remainder(float64, float64) float64

Sign(float64) float64

Signum function

sin(float64) float64

examples: [5] [6] [13] [14] [15]

sinc(float64) float64

since(time.Time) time.Duration

Returns the time elapsed since argument

methods: Hours( )   Minutes( )   Nanoseconds( )   Seconds( )  

sinh(float64) float64

sqrt(float64) float64

examples: [2]

tan(float64) float64

tanh(float64) float64

ThermSeed(int)

Set a random seed for thermal noise

true

trunc(float64) float64

Vector(float64, float64, float64) data.Vector

Constructs a vector with given components

methods: Add( data.Vector )   Cross( data.Vector )   Div( float64 )   Dot( data.Vector )   Len( )   MAdd( float64 data.Vector )   Mul( float64 )   Sub( data.Vector )   X( )   Y( )   Z( )  

examples: [1] [3] [5] [7] [10] [11] [12] [14] [15]

y0(float64) float64

y1(float64) float64

yn(int, float64) float64

mumax3-3.10/doc/static/gpus.svg000066400000000000000000001500141371432437400164130ustar00rootroot00000000000000 image/svg+xml0 100 200 300 400 500 600 OOMMF(CPU) GT 650M GT 755M (iMac 2013) GTX 860M Tesla M2070 Tesla 2050 GTX 660 Quadro K4200 GTX 480 GTX 680 GTX 970 GTX 580 GTX 1060 (mobile) Tesla K20XM GTX 980 GTX 1070 GTX TITAN BLACK FE GTX TITAN GTX 1080 Tesla M40 GTX 980 Ti Quadro P5000 Tesla P40 GTX TITAN X (Pascal) GTX 1080 Ti GTX TITAN Xp Tesla P100 Tesla P100 SXM2 RTX 2080 Ti TITAN V Tesla V100 throughput (M cells/s) mumax3-3.10/doc/static/mfm.svg000066400000000000000000000222061371432437400162150ustar00rootroot00000000000000 image/svg+xml MFMLift MFMDipole - + mumax3-3.10/doc/static/nimble-cubes128-xmas.png000066400000000000000000000331421371432437400211720ustar00rootroot00000000000000PNG  IHDR>asBIT|d pHYs  tEXtSoftwarewww.inkscape.org< IDATxyeeuY>SCWU2HD P71A3cnGsMxss?M4W%"C4S7=5W:]?}ihνyy=]ֶ~`9^˼̦\n â$aov–`\w>ȿ x\:%x~{~y('\ _ w9Ws*Ci;|>xxvW9T>(fiuBak߅o:[מr)o.<^G[rR)῟ O9!ز}N={""%Jޝ`1t.Ym =Qk8rR(o"蜫5AEpg`\e6e+a-.(Z ^,Cek>ǯ6;^Y ׷ #NR0oϻ 7ky)ipkͰs4S p3|^(Z^IXKOJUw p#}PDڡp>\^v5MG\o~ tZ^`r +#sWyosn\ z-*жG_1kz) p,|X5k-g/6m+*hs%vr ^dR=t]ާg +hJtaWspB28s='YRşnnR˟")wq$ 2D|ŧ~MىL+WKe`5XoWqje; nf$unNdL=)^1|B>ޖcQxoJa?oBP8@#>.59 K Rũ  L) `sp8rLr< ғmр=n׾Z pܻ' z#w2}p#.^+>8V+>8\ xᜃkkO===g}v+](@_X='R^ V^^~%-<,pm7dUŋ.=yubOOOme;. pu|cfʛ;:}–?===.իW_988Ў;}6>82>li\3evbc/ٖpg/a=s ? 
p#[پ f70}_3:K@?+U~哟|IO/^~7o ټ\HMx i"1z(b'?_p\E^YU7§VşHoBmh±+;EtfM~n|c7W2ox1SG L5egp_Ŵ_烖J  7W]=5|o&ҏZ ?.̦d3W\74A W\qťt?n}V,59qc"UǻQ|*0y݌s$AD?Y^.[NRW,¦1 _uۙ*IP.iii9]WuJQ01ӦZ#\ QO)?Ml~ z뛚bll Ueūs~G1ap\c.QVM PFaAضv'^&,ViVh[špb(Ԁ]tt,*Jc3\vہ/1*NԸN_Ӕy6c[×]oHV\uջ^ ]|;pV[s&1~gm ?7nl~>MҨg`3_w݂3>9Gz8imm=~ p%\{YϢ kt$^]B2222211Aq8L/: X+Og])z衟 d{5?NeO?'R}28/n힞wk4%MS2q|8-nÆ ?<4bF ׬9cc-gS޽ktrٔfي_ Q;\0~Jߐ)A`R!qS鉞z=v_} >k˖]>˷pF8gS;g"`֎÷_~5;wʻ6lիOYhQI8[l}kwq?\};:Om1:JЋ 7IԳۍ'յq'_u'^׿{S?0>>s=?[.>׿Z[nʫ;g>ï};la~ƎfŒZ 20J%Jm3oREx%ie㌗n 0~obwE6l+_?MӲ_?g ??G…Tj5rbV{ sK |+^+|ncᣏKO ^2@o2- kisW3168qjo+V-9]dxz|INtJ: i)@kk۫R0Y|?á^of/|J w}t<yR0'j~% uQ*rW\q>5A9~K^W`7۬#6aQkӴ% Gg?H{v_>㓟9SO?9.~{?Q޹JЬ$Ho8fbp`zHyE_(J ';I|37n$ħOvtP a[ʇ#a݌B.Ӣ:j7|i6?9i:]!9嬳w4 ܿ}7#~ .xخ;}n J[y͖޳N?"s<'Q <^|)k9y>?«"j7xGv?quױ"hyq[XCg.ںun0ʴSeC\'*8IƶRgYr%Wp?d]\|y/9?qW@nJO~R7>;;`! ΡF#u4t6Ѐ/= osu>_Ȫ׾U=HWS߰I^5YW!VQ**jj0bA=zEP,k_`tt=M,^_d媓z^$}}\jHu3+SUj;9<Q#$Dή7+*1o+_R.|[i{RݓUa/g[[O%Eq48ԡT2`1j1b!4vJm/nhjcǎ \O0Tx M=D+S3ں;oԫS\ Q4jEpPTSX,qmgbbnN_C7Z6xk#VsuqG]RVIc AxAԬK\I.{׭c!"k Gu 3_[1 60ApMZDP!01" "OPs<l@)ݝwBH$AuuFFF^Ib#ҽٔFI Ruy3[ j` 0 狌M011xB. ¼Ao XETQF'5VD%ő 1!FQ!a1q29V@d*vR,sV{?VQשVzrLR9dQ^J:;;e˖]U޸q :oW-VUC[tm ;clkPODINA8%%@8 J a>OPA1  :] U1Sh%Qbw(}|;n%_h W$ cccG6)B!Lz/bP1P˦05YBY"*1b $R OE@p4iҘ0!3J(jDR(G]I})Rظ! K.?};n"ֈe+~۲G -mZZf|s7mڴord CoӇVT244aV Bbq"ܜ'F-I3'g78z8ڙĒb0 LE#${/87zZ!Z\WbOM掏yb_Jk$? .P1"VX0> gT@$FD1XbP@(0 ǣOo}^.& 4wkXRTT H0?S\TC y+$ơjz7A75ˌLNAa'J HcGQZr_e`hp;{'ol{Q锌 [:~q<ϝgLWT,^ދ'&]|q= !Hq >g)f"15uf j5%dŨ0f%&EELH-iS;{5O$`1#.r*ݝmtwpۗ>߮F_7l9i[uy7UHw!c0|2V-wZ,"Y/PAbl hppqbQ\F:%Jj|({` +> ̡BAXH)#fF&*r$.RK!!b 0DTPc,jegL[\e>6HA-x-9-ne)Jm?Dςn GbR橱޴kz<:#VHԂ2wл<氚\SC@`rjdZJ(햾ݔ"GKU= :;Cz:iio!!A64bq~Jr{%t/\3uJ` R78WBK0ĩ1<<оA:z)rX@`"bQ_iFw(bo+"S&p`xOmgM qLSL5h U BT$MƇdxtgY# ebdRŊ6= 3"M[k Y\t Ijpz ! 
axY7F1Ҵv!kť$ClT=ZK$25`b|ɱQgi],Јck0p3D"0OH%" CZ;vSjXKPc1NQfBJ`>KKruR{ڮVQW0Rp{!qJH :,60FNb1GJHJ ШיԨTU Zz5(tBF&<HRbޮNZJEJ-y-yr }RMP tY zXمޢ vԇYo & ,}YHÆͶL*s<efũP ;q5(#w\4;^-o+=/d s{hv(WWqƠ /ѽO2<88,RKBB0QH.̓CrEKT )55G VrēPKk@*1Qc,U Ҩg>ŧV#xA4`dbhЦ摟9+A1*C*!YR&5,Qf $}R,ӔDR-a`1b0Ɛ! >s&FLHOg %/ދ .UU{P+@Am5ԓ[}>yh& {BlbM}L[[o0IDATXȱ7XMZ겝 3.`OgXA{ٞ]VZLk{ Q8_7 2]K*|dmUţ \fĐhs>1R[ mS, 6}iRvz[O 0bPT&=m4je=ĈJ+]hi\ej8tv7~sH2Ρ&\GjY#>e4P_bqix:LI q\Śqx륂=-+ck * ފc2mΉB\C{s" T9*}HpyT1a`Ig]EKK LHɘ)8 DhF<ؔžT!+ g0dk:e):.+nqvt@OD1Ri!% sIG`д *uF(!ͱpA'QJȮTj=4މ< 4ТZ y(FCڌE &C:C 3hb3eibU>XCxW5;QH?gk)I GgW :45c`"L&dFV5QO LlXELF)GpSTI%UxߤP+Yx["_Dꌏu___,qi[=7WʓcOy4E1\HWw'V}{=W|6pY՟5 =K9VX8ُs~hEhl$q|Zk jb1΂hwXKb\y0b myܔi/"$g _0 v04D\>7j7k`S?@ר{%c$YDQM,Xt3sɎIQ)ł6Xu\@K1XI@`9|:QSuP)Z.n3\0O(G>Nly9?V}I7k, :E DcV?g$8 P4sN $$ޕHD%iEj1Yns0l;VF8cJb (qx_|v ZMc.3ZVm%,\ޮnvJ1@L|m?20%)Kg`+a# gg2'(Faqֻ-[h^ dZgrx0&pr9n܂'`cv0^jQe!@6FLC1%ήܶttp.T@JP-bb$U |zj6xB -h$ 4Yx%ٜ Y{UߧmAmi̾ajqbZ WUٕm ON)rw ;s=ztllL[֯']ֶ{K sCBbPȕ0d=gqi58]El*3;,W D< 7λBcth'γL%BVRk cUzJi"|HM#;頭-AnpC[]~+v M/xjc֯_V`Fӧv2}86XT\S޽E2FCM3Iuyu&7ӹldD}{X0Љ m+346FKL+߹\%ZZ{[ػɳbY.[@xqytAjdqW֭Cn_l:k֯?AM{~99- RcI\M}mdG@[%1b%ಮ.=9CYC("m$Lx1zSyf>y yL$DC!Zi+x/(hkP,I(w5+(coPmdp$ Q Q"c%J Š?|cg*(:#D͊4G 0I1( F(r]4H @2lOnq1>1ʁ=9,t#z{)E ;Xn{僚:RTX;:|<@59Ndаd0(`k%x5 97޻:Z/87 F12U"J#N888Dur ) J4/_ORLP4rщq*Xɑ/XZ]IZ"ADkki+ZЉU'.E"O ħZZ%#$j<DBk[.ebr4_wu}i羳f9a*«zξ]t6U{INӄxl I/ F̾{/Sk4HAL k۷bG21Fh^Y߃C(vvRŨ(pKiok8CCxb4DZ8)BHa$&I RUk$!1W-hb<b~cQzKdZ<%$η[#5P۸F`Ͼ=4*~Kaي,衷.:z(͑Mnxct ŕmx(F]v URB0ȆUdemg^0g|;{=vqscdIփ>ئ *h6&(А #ПZdFԀ r! 
NkOQ7\`A dz6b !2I&Ts.f 2KqU* BQ4͂R WLP/15 $BhJ$i#:' Vx<;LN??+LHPX9`o GOw'vˇY3uLVW|1'}g $74YE@ .C+Q?ΩLM5-QFߘ_NebU1(FT,e.7LXxùʰm>?lTpumriERl/X1`bM5,mE+(:m՗]H7J} ƗS/d}d~SRIZ B$!ZC O2ʹT88Rgw#NIc#i6C8_=V;=eʓ?uLj$s)Fe1 "!HQTLF6OK%ʠf#pr6|Lvڝ"iVS_&68a4q'Z8iLQ%$d+[opGgע'Fv 6L22[Xe6ª=ha˺:[; $DAcCҴMŁų%~Ơ=4#pdB>y\j tQ"8X16^u6WdSť#Z 'Y`qVA u:K2}ORA,;w6AWTlMv Qm2pD8Mټ {cYkjA$F}lONrCFւ}4]ض|Y_Y[9*׿}=5I!!!q1F 4+cu8B*Vb,cIQ$JKWl WdMhp;3[4R@9:;8)ߑ ѵnM?rpSL>yW=xqa6z; {;ִI!M-wsMaD0|:_B +xdêQRR#4ƉҖӖժ[wl^z9XO?t ::om8jի2Ov5/ RC@$4 Cqm0&+3 4PB*:SLLM&!ZtRgǎ?'֭G:^2;iO?8 Nk_Z+YqZ'TI]>Ҽ4 <feYF7r.HDVcRFF}ks;O?mMM/ /֯Tnt$eG9a .&\ ;S*Im$s{ & e#xm Vw䢀0-ֲabl߭N?mpdx>-+PuxabKGD pHYs  tIME UXC IDATxkmUc̹{s}=Z%$xHXGa1E(JJ*@9ATS. L('VGƑ@H"Pd![Rw~yk9΅ vUg/:{ݵ֜s1ߘ_o'GO_yWLϽ/ y۟}~C$]w DpղzL155r_q9~$W[W~ۿ:N~pgWr 7Ș'%}Go~x7py2[ނE"J1=wI{_;D=PK}뫱x~7ǟQsO1 $Dm!ӊ]jr:7&>Gwͅo$eBN= b&@]4{{Ls R_z_/ott?!/_㭯&/Or"dI)4$Z Ji֘fal4Δ4w\V|tG$N%BH5vHJA#IoXTkjZp57rww݅mO&D"be̗o?C!jg[8_y+¯o3oYϞ O柿7?|;7<-Т,M$O4d M ќ&^ xK@+V2RڄDuD0dLׄQhm›܉ր@ȮV+N쥑O|sg㟊m}-MxI_o}bgS}?}tc|{9?bm=y-N#23{ʼY8q|ql== 9^3 TaJ +i1 +6blw<- FuA tˉ͊~wzo`-xsԳl˳LU9h%(+]!:-0_r!i8qW|%'䎯:y杏F<N zO|k~ ݽ 섏0H(iIhmZ&D+:n&B+N9ا6]V9ͩ!*(֐&i*DE[Y%:AQ"W6ѱ-] Abf E?PJ]7Ϗd>u7*ePMP`m=eJ&!Kuw: 7V>?GY8b9|r͜>'[KbOй',6LD0T-4k>Җ8~|M6/ 'RDQ0p § #oTO@cyT9VPi(K5y$N0J2j ;Kx2wL$͔)LЂF&pUg͸Iwz V"O|O\>1WŢ-Q˖;TLz "#B2>#UUʅKrz{;'H)hse*h ^r@slśeiVctVR"%A K4H\8T^vV B!ioTqj+Q kka_cl}b0/&46cA%C#H0 E&q1c6j8~~.VQ@^^UVi Agx,͖"8m\2갑ybbVE!5TfyXfZedc0HhV+^IO|)NOǏwN @ BӠ`U*NeH#$bDjX3 Ͷhiu/ l3ֆԎ-YѺ/WtB4 CquT;4AV$4P`B}/oMqgnMB2׎=g;]>} 41:JsȤȸ4lP, t7S#A⢈@@q&gS3Np8yeQ 7cs #-hPmX q 3Hмtf*a63\O~,V':"Fk e(72ivxk|??\!MHUaN/p؃ 2W} 7|߹3\LMOMADA2J"(%HBq2FJY' ]`kyF6$R4먱Jd1&A" hjYɲYcÆ]]D̅iδp4% 㒋%6ftTj)23D^S#~s r$7\ 7?g67QOFwpu=MB<!47@9{[gstm/gė sh2cRaV<+y!ilHgd=Z81g(BHO=}nӣEDnp3QM neT@ZµAkԔHl{҄fsT㥡I&+ݠ\i;x^d/U%Ig2Z:0 XCJ/G0ӉQ3E](1'E ^]~@] 2.AԄ7Cx=t$WX311D4#i5YI*` :c2H˞)ZEҌ{Z8A HEr,26AQ+ 4*iI$HH2^#Y;ssVIYgR %-:z1! sr:43B:WʼDCiցIқ_~oJwŞMDgd}lV mzS/r!%8i=-/O<~2obU"  FW s4e%HEp 0rcCef'[wj. 
Au!b\=H\HkL`ɨEL6htCǮRn4g^v:07|ӷL9( ωsyYa C&uLM hCŹheW iFg$ + tcN/ & $I ẸSs椞味\u T/R1! j+N#Q$&G(a8FSLD(eqa|}ǟ@} |̠5~v7̸RF2"-?+X$Au Koc +L譑XӒᲡK3=WEEs#\abM`5sɋO%9S#X!~HĀDk=hY=@$B)$$Z-J4spC¶@)3UGI q`'0Z*hR0K&ԞF!T)MpˆAHQQ-:v:={><AiSuZ bФ@ ׊R"ZIO}jynܻz'_zK] w8W$ [ ,j&G)c\4xMV鳢lɧ ԑZHD@}-ZjDdQ\gth ϊQ+x3w-Ag\b7kjQ8xJan5(>hN!ЂKBUH{bEv6+xMW׼}N$xwG/a}QTm9~$'~I6]vV #Kvl֑kO#+P#X50*T1UHR+0i&%ԫU3vvv p QiHxbc EۢDj=n#.*9&Z jjI1.IsI毻YI^{/?+<|79s).@Vl~:*~ADAҥ.C ǿps 2@W[B{N4 3Sam> ӄ/l-\kxϓ`s8:aH:zHTt,Ꜷ$^:$f2@ @[OnB[ƵP! ԉVm9fña R0n@gj@uHtEBB CuDZeD"0 EZ}s$/~h?v|;C!sfCBii#T !DPmbdMM\>U-{1~!iV'tuIiC >QJCbl@Sg$)12p@CW+ulxɋ^W}w/~<(>s\wjN$Ncڊ/.3-Q3;)!+Ҫ/w6YK9Թm3 a,IX5EgBX3doERd˙qrxO3ES! uijAdA% kX%0/ [ ! M o3R6!g ЮzwM[ucyyaT%S mM=PUi6҉C;$5"*{AK@ƽ, c@}T*sQDь0#%(R[C4@ Dǰ*n3ppiGf I|-ȣ_=́ eTJmsyf鉡r478p0gO<''F1. uX1>G?koUV끼6K@( 7!TC} L7 w !CJu.1M 8I^-AC&*4f^ qM0]0uKk…b(kg-vO$(+B p[fkH T%9+:i7)C0t1Xd|a.>rQo?\/_&-8GRˈm,w2,f"w BSBlA5?z4ؓl3'_KoWwX(b"E|@'&JSȝsGKEi ݴb`o'ql}݊ř"%:U𠕀Bt X=pyf G{DQ7S.2Hm( Ngf R8P ( E,B !,\qrxҽ|}";c+K]ĝ: 5} BkR9^AE֎Fg.zczu\![ IDATO}^WwЌŊ ,C@dVD!#P2+DٗN[G&FnJ 9ZeBXΘRj%Pvo;Ÿ ^U>Gu?dvԔy\I-P+PuDcx Ȇ#m)%7F⸝ >z>S"e1R,QmqRfq0 v)P 'njYf @ѥABC(ށ!f-TD:ֺ0a= 4ș T&RY1i6VhdЌr5sȌ8ۉL>qLxˋ! HfFh`(kZ \^QE]2tL[rtTp㡢Ӗ:-\ c{kv4wh:-Xp.m=I㈂DVtĕW&mVX Y&a~PJ€ɸCM1̍/|u?tM BW˄ #L!fL|3(:ct/^=V{`ӌKڊ""q|M87'?|ɞh iaߛ,RXܗN@Lc3bm4IA[`$H$6}DZ3{Vc}rok`Zu.a\Ehy9`A ⫷" VpmDhlPvrgo gNѯ+xeEXYC_gKmrV udE'rIa"`fmٓ;k%w2yYYQgJ!E%k&XP,MR}MCQAmY\I(1!y&dxar7\PB%S'$Q+9+"_fne/ٰM^NR$sW8BxqBHD-5rTM͈/<IvZpINؠ-Q.u06#:𞄒BjxTTfAU17#HQ߂/{FvT>X?oHIfVw!Q \I [5fD.!h۠Kk,o 8ʌEŒ"H,L&eFH ;ti2K?f1erS lez #)7ty=>O::G.T$VTo0W(J@Gĩm"K!<(uFb7^)}*KlۥYFձHx1JkiatUd:n J=YgVC!Ɍ ,X0kAihR.ޅ)A!PmD_u.9=i M04+#B%C̸N"q2ƻNZ{iax =J(ݏǞ'Zhޔ-)^8BX*-m$R!tVF r@ nEFq㷀0]xEp+U.NHs3Zn~H̑V pox*4Brݥw-.n)O%U[XDn+E-MG C0! 
T/XNU!B[T$WTjm,W{~R HP\{^t).x=B%jY-ݰڈV2BP~@Ivx ISj"K#Rb߻N[CqLV40EpE13E|^Hj@cEffu;#b4jdfsG5 ztXeo i&($}e￟ѹ׾)h+`X`pG#s d/~,$Q 4լzrS٢tQeI=q*5L.NkKTd'$E-0\>'%U %:n1 %Gxl)9F6fKTI$ \Et+>Ѻo(@< p,#QGHJ6wHx+%̵zc^_l[l˫/Yy.C{^0%wo: nK%M *-!2'L.@O:<=qeΟU|Ҽ#5>;-oh3*cdcB6$#J2T$¶hSX?S S‵ø-:-uNc M̍%cVm #Y+U61K+ejQGκI8sՊ4hnȥíŰĶԂ$PsF$P},0ao9m_{FNS'm(=:`-[9'ӳ}mNKM.qDEIިD/Cr]{G Tf.q0P$zk([PP-#V4sw1xZ }rzXI""[H',$ C÷`1 WhCB^7!c|\mE5ַߚJ#-c+Hs7}IkdK,\h1n 2§7|ѿ O`j*b.-mI^8ʌ=E+jl<^*͸K7y޳Od؄GK:&fJ3uu'65Nyim)OEנ2bGY Ƹuk9:ieVRxxҊMpq }ۂ4Ʀ:sܐ@9d13Rjc/]|XAUoNBb5os_΂e''ݎ|{oGS?aDBԨZY""ų4lEmE{g2~s/}w 7 n\bk`Z=&M* %B+mF)n+E 䩁hv6T4%4!$D[MϦLM9zww'Y<#b*2/ۣ&0c>Q ֭m5Hr4+_xWp!r"Yknqk68 +[ t{G.SY{/~GƐXrCfKϴdTQBeJFոnnS{<_x-:>_/A14mw ֑0wj=PP>^Pihh[|5t1#j TvCa~zzFP\ِH)pI(ǨѴ-3ꊎhj9kД9Ϗ$݆pMKsv9u*sӔRX}nxw/ǞGEU# | E$\ m# ĆaV ᨤ=1pB%]>W2g=FlH+0/Hm$4.f TJ=o0oAjC؄Y&M20PINJ bqsczu ^eAF"ϣNhXSZqdí%,+GwF[?FMN} {̶Ht K%KhKh#6ⵋFGT لem:a,td ꡬY^g_O^Mosl::3Sֶٌ6"y3lj sh81_lQ`q/3Xc}<?<@VRJb& D%6JLB 9' >Cs̫A b!Bb)+ Zqzwq'sCq[~*;Y *&$.ɸd*Xـ&.)&d}\6qtݸyí\;+60dy,%n8=|9ƍymԉT01.Yav69{( w#) ӲG<`'9[6$Y}hqGq\1Ϩ=MWvʊZV ݬ{ב?(%3.)??7^J3$r!yPK$·RrY^փ rڰ[$ex9Gϸ~&mօ  谶D@`Q(dٴ5KhÊz-V'Gܹd}{ŭCcBv9{Hʏ̕or*VǷF…K_%ϕ- l:MlNMਐ"x]+ ̔ au:$޵+Y#h"Ez4Էм u'v 0Ӥ+-4d)ˆiȞ3'A!j-t6#SChB{}M9fyMffSQ w,Y_㪲>Zs%GkK+cTIEbخI#DkaDžoIׯ=7:l_9wx>{w|ے KbT j,[h2ҜCQ9!KE K%:2K#\(NdmZRA#" -&d]Gz8Az}$oyǮ\̣.ѭo-W\TuMi<[kfmƐ=<.H5FI2Vʔ(ҵFr蓼q֚6;g{7WduF,ze'PN/Xt[vYM3-%UM]0_B.[s|g8 ;y9$ dsk@KI $TЉ" D`=x=mXx`xBtx`l iK[j JcqIIyO_Gwg>{3xr.xx+ 3L6 ;b*Q IDATRHa+i,l6NnCOr9YvYpo ..:Ϝb6=ߡ$E#Bc% p'Ts0V3&/ؐ%3ѧBV2AkÒ@k$HhT5\BmIkZ6I%3̣e;TBS!$!mb<ŲL ZסHO;:$ sׇ77s EQ'J,A]ih\qFT48'XAuQ\HaS _X&7ۈMܮ '+X C&oϐ6GK#<ȿbu1%I~%*=3uX^h >4Nq>|eڈS-['#}֌y_F4z1Їm;<r<6lOݞ}!u|~_fvgt;l.m8_[<Ç!ƦbMɩL UF u`:-uկ Ї˧Uz>n_ۼǩyD0)ZSlZs4Xx069nr:Fkx gJQJS7x|$% q>f:jYX "M\W{=q PČ&W ~tpYXs12T%ԍ<}^+oxYGjZ5le9)LyFj%nPPu M"")mxhЇ;32gE#I Uc$aڋ~+"M -+:`tHH+m$OH2CTPa aR&)ց 'TbP9k$vl+zF\3mЀ]Aڏ :sv|o?AYlZ@h% %i8e-V$- oyMc3QG=Z@҇Q@N3,ԎeUJҰn@J "qﲢɈѨS[#u4RVaHP0H1iQXI&IVI RGH %)9TǐV;6LAM`=_$8 0|BqCmj546 l9GKtWlQ Xk1PIIQY޸ߝ 
^ob=Z}D5ف\xl;7^6ҔL#o oC{q6%4&HYi5jLss2ځ(9u ;SQ?{} y擦!i$-9YS 6bɨI>CL"Y|QSoPO06(:&Nذ9[bӚB|J n(*5-꧴ lsLnN?VDOsO]J9GI+#bRx4q~pv1_y6T"(`2eњiXu v&+I%Bg 0&! rqA򆬎mdK@7HQ0LS]`h1L.߃ʀ5lx"qbf/x NEPƼDYrx`}pAFו< 2ZBl#StM3Ȋ:vȂ/mXH8MJF5)9I(dF8HJ7qa~grI#Ԃ[CnH-hqP`sZTҼuḥgw7dTJ(c3e(DP)/9 5 UZ)ܦH/FdžyE_Q o%aa :͂W)} UL" [ #I%S_/·Pwgӎ5 ^8fy%ŋl[|W51>ja \FTFhdGB1 dew& eVBN,!u.V|9j< ( ҈Mo-H'94('Cfؠ$t:G94Id"/Bx'?dcT^99!:y2M֓|Cg"q~UBl*Ԕ:&d z:UQA3O4`)hdm"'YGxjXHW*{3 1MT謣TAl<> 2CC*E+.LAc"Ț %j$nQDǿO=%ݔǽ&L--pZ|aIҨnlqߙ(_VpC4^&2lv'ܒatpGc49TDJ*-{ j3 w U9dDI"p8hSۉc}ZTq˹FoC@݊"Ҝk6,,fB^ѡ#|YCV;t7\0]֑_K.I_QH}PTY?x6=a)'Hj3< naϾn ~s}͟z/?3_rw~{P~$pQq\S,1>H_̕$!]z \>V56~![DĊ`BPt\:NI2 "ܷYfa&GYIOB&ZP@Fã(慚6(ju뗖,ҒH+YFdm<&b@VABF*7q6ݸqsr3>=zqJ 8}nΐ䉆0ZPs`fzd{J .8RۇO̿]D;H0*Yýӗ̌Dn@'*6D|=/W^=$r kS0bfJays\|A5ȚBbq=jhԕLs抷ns~/>>:6n$ShA6\y6\u!;݂{v;zΞ;{>O=o/\pb'㻼rڣ% B// !c06X2zȭM]?{:m]Da>Y@-r7 >u$}*5kB{̹3%^7x3ѯ3LTn-968w\~[Xs7e{{gѺtrVU;´Z5DG~IC7,'mL1<T&!Y)C GYc}yI8&w~ſ8w(i˷D&>yMCI{xnQ)e;Gy+tn<'%o%EgX3IE tc̀ Ҵ@Ż 1IhHMMkqߥlge בv) ڑ^6֭@;s|W9Q[б~_jOux qɤ*4Q,hGe @UJtlNNo_ KǽeП+~O+]ayVJ73"LAGJrL-lencMU^˼Ru e7Wuh3D7 3VWA,E,~'\{ȭ$!=< T&97oεKkn_a[O@2o+uKS/VNl߳+lmoMj&P"pG1QBY8Zp)zkH8ՎVGF+:B R3M@*r_Dl"K?w; ;3OI+yNd$U AY/qЗ?r:}J6mip˟|/y[NTPKzul@N![$X1R[wO=Y˜Nu=w"J` hQI%$HJhNq$1Ud:q0M!X]#Z /5>%kef)^უÉC8qe"] hJ껞 >;zÿ?sͿG5?=L;凧'(:i\4&p-Y ʀ[ϽRg .ZxF5g<yYUN`g8X0;L*%एvOg|ChMԃ/q51JX +zFF+)jWjbݛ^^2*tbyvԘ%_#Fȥ!z%9,^cp뢊 ":RBkM/1A*ٙ%lf>7Ry:$ZUܙT-WV-XZZ# >{}NF5X;:iҮhg]TQ,WjF8IvqRlyJk~4"CFd@rD$4'gq+⤀|x6khE&9|<fL4|6M"\!t̊tvKjm=xr"qD &J1Q !Qz2o(nTZ4g -7xdJrS X 8)mŏ9s7{>u/)Odm;]޳ k_|POveŠ˞J M_ #9(!>oɤJpݥj~9R 04~pYܷ$<U?\S"#|kWj,XG$F zziېSbn]/+׋U3H)e0e+n!uJU(V3%W!Grط ̀q#NY V!xr5e9Tu >dg$%2pAFCp<᧖|x7$c _TUZ#+ÇVxAV<q6Zҽ_ !:+_)IY'ٿA+Lӕ+,8_Q=1g䖜=IV! 
gy\'9PWHyj:\"d*\/S)K0VU 07Bd"e IBbbT/A)bIB RK{88ă>J7<8I,OXZ*^2Ɉ_cnNsrb㎏_j'㮽NW~I;+~ d3goeJъ (!@EtSbZBG)7HzԖ]2 $lHK р릨Xq*KR&S3ˋP}&⥐C܂ؾg)/jF 8ROBɚpSS0?(?1?sW_Ė-'>] Ե7gGgxx`}Vc*Ê ?[c2FNI}OPמdMSR3s k48íHOTHPITb)r3˫-?W\&ԒHN̆- DzϾ?Z-BHڎ7oJB~`8崗v 7q&oRSݓkBSOD߳'ׁ9;2Sh\G.&y@: 0)dHEe;,:z%^vPAuZ;a‚~Xčf2bQo2mPTlldxhgi8'-Ppl8e3vkt=|5rUr%!+<9JCgu8 .:8 SD`+ JɰgrHT>rן5g#5ɕWuHLt}n,t$s 'Uݹ>Ϲ/5迖 M\~yp|O_Z's#ϺmfiqԦю,Yl,C 8QCjQb/ך_,bW.͠`_E6 yӥdS).@ έ@Jk,v^qt )9vKHά}fJXҩ D첑65hĐ0kdPu*ħoe5beWG"J jF–р}=}\GWQDed8c6C:mXv YIyFA2JVA d\)5NaF .P0&)=r|/5W*02>lh.^9k> 4hm:Y'p^[xKvd&B.K$J1wV˦R/AW;Jh*+,4,=dd#7eU-O'k%s[_ w͛iKbrtҦ)48`@W .5h2Oy85^a<GՀsv`a;>UI{9 '?Lw뷜yg^C8yIì]&kGPouQUIǓO.xhP0zÀ'3/C2mG>W?zdwkgV>Vߐ~AM! kk픓/6pY[W󤴊"2d[dHÁţs7hFy`&__wgv9KKGQx1N_{6XL;w?Yzzvk@i?`}~ti@3'ԅv׾'~M׾B+u ~Keo;TO|k+O͒_߽6ߟ[\u1^oC>]_84?nn&f؟_G8^_a}Z|U3!P_ߜOIENDB`mumax3-3.10/doc/static/style.css000066400000000000000000000024201371432437400165630ustar00rootroot00000000000000body { margin-left: 10%; margin-right: 10%; margin-top: 2em; font-family: sans-serif; } h1 { color: #000088 } h2 { font-size: 18px; color: #000088 } h3 { font-size: 15px; font-weight: normal; color: #000088 } table { border: "10"; } hr { border-style: none; border-top: 1px solid #CCCCCC; margin-top: 10px; margin-bottom: 10px } a { color: #375EAB; text-decoration: none; } div { margin-left: 20px; margin-top: 10px; margin-bottom: 20px; } div#footer { color: #555555 } pre { margin-left: 50px; margin-top: 10px; margin-bottom: 20px; color: #000044; font-size: 12px; background-color: #EEEEFF; padding: 10px; } figure { margin-left: 2px; margin-top: 2px; margin-bottom: 2px; padding: 2px; } figcaption { color: #555555 } .api-section-link { padding-top: 0.2ex; padding-bottom: 0.2ex; padding-left: 10px } .api-container { display: flex; margin: 0px; padding: 0px; } .api-menu { float: left; display: block; width: 250px; background-color: #E0E0E0; padding: 20px; margin: 0px; } .api-content { float: left; flex: 1; min-width: 400px" } .api-entry { display:inline; 
margin: 0px; }mumax3-3.10/doc/static/web1.png000066400000000000000000004240231371432437400162640ustar00rootroot00000000000000PNG  IHDRL9sBIT|d pHYs tEXtSoftwarewww.inkscape.org< IDATx}x\}?fF_lٖeacqMݔ7:.ǥ4-7<4ɡ M P\JS$W)u) c!me[F̞׺ޣї-ٲe[y={{hRZ4J)5<A΀fgA4LT{) Lknjԅ}&<|-ݻwA9K.=-mݺND̓)Ӗx;^NX;&A8ϙ={Io<žOWȧ% d=pǒUV yFcc}QןL'x|:">% |Gol v \MHZE^.<ϔ'$SěHw4ٞLl}ѢE'** \@ݻ\UUuB&DΉ+ɖ%|"W.fj4;듨XAm:;;K˾)T*5d[:%9p!'d ¹x$<000Dq ֹk (-k)'BN"D4uO݉niiQH|Uee2ƨNdTWWJZ1Z[ZN^ CkJ=ϳAXuȀuǸk1*moovƌf4'ͬׯgjx؟'r)=<=Vz<)N]]]]J=gY:sNR}*Vk(Rpu]u]M!r) MȖo+0Z[ :cDZ}M&m:6Q͸$)^[WWgƛ>YZ:II+}֮];y\ 8O濼.bvKv>j[XXW_3 ;ޫP[7a39_;tU3ל_o_Cx{~0ق $ʆ& Xkx q(Һ8NIku>מ &D__ãι>JfD:h4Flii)Nzx͝;+ iN0 ɔG?QeXlِu^~oaȍ YQ~~E?cpa-[O KCo8yfxu|yvE :!oG7, ”ڀKm€8N)Ͱ1Fő}Wzi ϝ;'@KKKi.#ΆNI3f0d]ss33fhg;rJ U0 &T--=0+Q t\֬\/4tw kdf7nլ^e+>to9zBѬysH gc* 8n.IwIZ)@{彤d֭z̙ttt(uIZ%sGaxF:f[P__ϲe(d|Q(28_ ;_|cWs͚KЕTi@=}yʳ&M7ަn\f=aȜ%X7GÖ r8% 8kF)Qv]kmL[6htE:83UQQyAf͚EEC9fJz\P|^͘1r'PǙt4'Lq p>@!C8rW\;}[ǺujŻ8zGŽt`^ W[?KŬ:^: hYW||z[03BAq];$̰tyOPT\ '. jƌy<% tR8PZ٩0TIwm:SmP f `գb&{/AM-/K;Vk~iK~ `{7xa-l1Zs ͘üM<s/jyUmV}3WO4s93rJAqv1NGg'SVlQ)k-I::AI9C| 0w\l[[[Kؽ{Ztq̢קQ P((k<$EX|H7u똵(%M}e1TcUnb¹sz=<.]v|Jx'ͼ̨pmΦض%\ ӸA*t|~hk841({J8H<҈4t!G ܨO~$*oiA3(Wm.@k|Jk]Z9}%cAfΜ9cͦ4fuOT"Cvt  Lxj|I'@3_B2Li8'a͞Ք[<)Lt pF_K-u|8u+wzu:-#ZYYICBShAsR/'}GzC1 (<}_?`}}}Opd2xy.  IoY$X|IP9ۢ(R qC\U3rk:F)т p.GVIq.\J)Uz]>(4 p\M{G<1U٧  %T3ZEťTty;Z䛴6\7D N{lA3I[N&䛌9'hRpѐт ¹FYǫěl=׆1+Ot! gR{X-OSd8pRBֲ &cw p6P SS:&5oT켾wr!3V '"XA>r&ʜ9shjj:yAA%J彞ODٽ'ox{qq,Y28/yG]q֓~^+ JÇ(M&B¦(Om{Y|Ûv\@㭴砱(GWoӍ,ؾRDQ4)nuq~~^=<݂ Gu9Yg/1}rh+<ykrs|Xa\Z_yޗV8Tcv:\Sm'oˮ?ăr۸<p.^Jp˃wMkg|! 
+n}|Cz%_}C4+pמV̦,G7 7_0#W*ddEM7qYHK}ț=KiVXȊ[sƘ:Oе(g ){?nF)®j.]:X_s nqKa"\B*CֹEO0Ŵ>MiJs9NE˲&O}!9qN;ݼ.mv7Wаb- GFI6_̖?FOY[n=sl GQ4nLVTKKl޼eGc9޼y37nivA .t‹y~׏ Rx7އ($/A+_yh=ˁ7fƍ+ܟ:ADhRŠ6hcm1杉avn޼dzvynjj;䪫U"aA& ^›?q'n\AU K*`C^#p ix[ՓeѨwQd:冬;Hׯ{#|5{O!;OvmtyۥED`(~O9>ھ{tԧ>Zt%n;|%=0??u{[|??4m*4Ҳ%)4}er,*f~3-\x=\a .n nw׬h]+uk Ŗ|o0e' &uV,.Kd'A .x 5|?bF?LAH+n߱͗]Szzªu^|5\@5ܽu ]гn!֭s]F`0&y/e3ˤmʎW“ANi![?3<͒E }_3U-'sXTNj>QٱRƣuܚ-=A ime]uwZ3HMd, o1iDzֲpI۟ ӗ%sŋx⩮SAe.q‚ p"A)B$, SHXA  L"aAA"D‚ 0EAa 7ɜ IDAT!A)B$, SHXA +/_cn?uk# KG]wOY p.!hAA"D!h`sLuA戄C"᠂U03-_Ai>K=Ʊ#mKctPS7kboK:9CcS_] <M\T\iH,1gns6X>~W9o*kO_[DyKf|퉯T]i4pH|~0l}%8oj&?EY=rxX$|G6~7sl4ߦj|5r]7 O$I0,`IL}=u[nH[ tXanՙi#,sQW[7d@M]90M,l_۵,X~3 o7w6~o`ŭ[ jzf#݄߳}+hA-_׳\x{릺 \2t:DzaBZMƿOol/8!֮X;A ^`C .llGWeٮ '>w3IYAN[YbڞGڳzܽٱ󼰧k' GLΝ{6 {G6PTUigmwcTqT,XCVQKap klPQq{F> p~0m%+b pJVvwMt=-kkʻujhlQ #t  L"aAA"D‚ 0EL6aarik=1:\jY HX8meQ޹,FAт 0EAat0}0=|K_G}ŗrYk9#<|Znp)v1]kOmAp ;صSXQ0y/\ה7MUUp- |>^7,sgUS]sA.PDgCޘ*z:X|KLd9ԗ96rI-Wy  FbU3_Ų 3RwA 94(Ls>rT^u{rVl˸ᓿöE=/]5յf&dwfl3Z[u7~~ܪ9-5usT{KKJ{)3vRPmmf}G3ֳ瑛ࡻ?^AN%a÷4|d]]x8oz5ӬמLJZ67i=<u-MF JcRLStt+oY|)pm[ë`Р5{ phZLڇV?ƾ 2$X&x}-+W|tv/mgX4WV3|oyW G~2xx-4sϊ-[h8/x=ȿa>uӢ!;eӆETn.ÎU~?Ku>e Sqnֻv:ʦͬl6в*Vḽ_c+nY8յfI]'>9>eH?R~bӰv2һ^"[>u+.usC]nnNn8aZ2íkϣ+AQ%U3CU4-giC?{x ;F=7^n%vv9EHA8^p|j+7Qհ+~mm/ӵλ~fzGcZɆ^onO=kc1]g& K@y#O~~[q!yf^jc[yMӰ7wuҫy^q52] )0т p!A)B$, SĴk_Zqu;XI  [w L9s:]";~]ANAa w? ]S]A"a6vQzLנV<\&,{^U-vrt&LrA#mgz:ۧ5=d*39sOeꪐC;l|β]{x\AvW?ky#MVAD" gox#aFszo//ҜviX. 
Ms;uRT[]mESK!QUK.)={)S+/x~ZO!08}?3wzfGXnG5~m,k!<>>uf/kU;^b"nF;SvȡkfQS7l c;c-/=l i\yA0,6ds9tU4 ˮ {i˶gg"O~{Ȟm|,?.b/pԣ2̒ff!̑'\0frٮ,  p!3"a]/#<έ[y>oSao8tt-{Uo=Os>C|\wxu_-l )ӃY̪Mzٝ{~uxU\u&Z4͗/_?27m\5Ajpv/t/gB4M 1{WTG A@OZw9l_Znk+7Yˮk6YBa2^n]~{~q6p.moݟ{o<43ܴ,ٲ]-]f]?^zgaNA3bt7,k}ߛM{+nk"]~V8 7`I#m y+\v}7۟##h_v{9oS*v1 _L/ pߌ a誧~]6~'^o63Ǖt \PLt  CAa !A)B$, SHXA  L"aAA"D‚ 0EAa !A)B$, SHXA  L"aAA"D‚ 0EAapS7X3`|;`(>,`5X[\F+TlEUWUgW#wSm r*Vh@al/x@ʂSP 8/z7J H),ME 8.sB`|B[<!!(k}l"K-HEȢ\*6ߧw"ՇU9U=Ĩey~_myU6?TCv/y`N*_XToieQw룸\w%GuoTZQ(+[(6.',s&^wΆACE* Ah)}OkU('EYXea:U AmR\eӏU!k nTcӨRAi%Y4qC7f[af4,V7~)Q`E,UԲ 7N8Vɇb!uA55(jR ^ xM.uwEۑ-PXewT=z tkyC2T,m4.=?K(\mQ:\~.Zp8ҳE+pB1k:6j!0CI JT)( &PFc6æ>fЯ-^r9G c;(iĄY$%Eh#kq>zVE>CHBޢ`bhDwc#^1R* p/F(S\3^&̾U`ZU(`[8'[ h(d 68c7Q!t9Ư%rұ `@tS a2";Y׊K\P#XjWIA*i~X*LI噎5֦x9=U!bPX q qTj(`vKF,yRoE\oO+bFSW(hE)@:b9Ghr*Ci|bN%-U (llrSS&aCGpq#Uri[82:Γ:8K*-ڪ HٟЃj '˴@\FQyTYQ/şӞi-ዻb_M3.R8}sJ SRˠ㝃!ʥҪ%@`;#2qJ6x$RTuP*,E^µ)sZVE*kqexjN;UhkX0:T)x3:"" gzl,_ϽGVzPXTkBB.826 p^)!%%2X҇`,9KDqI٢xQ`$C\( u1Q~--+!6: 8y,9t%TVҕ:c0?G.Ħxt/#?&ƩYD: !2JK%&h5԰飇#hwCY tHZ0XPdդWhs35I)(E]6ž%RY˨Ea IsCL>TmhNW8RPb 9m>LMD1"MRI6Zކ-}NH(  3ۣOcBmR@KORcI~= ^" 1A'̬-F.^?',Cw7G<o¬Y}Xc@{\_/OI(K5Ř-c,x?;Z[<*\U\vBSk HK-(~c?*j-XE,EYZtkKiÛG!7cf!m-Z޷rtgҸ͚Qܮ UT=i3)y(gl}, (39л+O LEb?SGGn&Eɲ;u5}E#a4C39R0r8}ᣩA1qM|~ :GcKYuDYT6¡v_97,ö,aR qІIK`R 3/㶤^ hPuU IDAT2rz>uJYZΞVr4q֜jf8{aN[(V\AO^> X1obE­#~}ugݷ,8{;*`N$,CMsէ!c[t[GčThtI`Di&舀Yz:I+#y 5=\yJS᫗`Kmpwe$Ӯ##\WϑlE7Aki^;C#,oMxɄl]-Qf9V:0nVMGRQFڞ'9ҭ= \\Cm6UU  7geWuȂW-&%Si p5o9NIh{ZTX5f/Zl7/^?JE4PuP=됅|$#/`$`Fa4N z5.Gq<\ yL؍1hT@wXK0gH'0AT8 20!Jbmx#&v]s/qWؒ̏)lH`vs?ahj(`CSasvb#᜾By=,|V K1d nCf}kmw\gÀނBYt$ .wXԁ5䰆,)&, ]2186PpzEoS~7Ns8?頋`Y:kiЪ|eQJ {ePNOf`VܟgY"#,VkFcx255҅FvD-l[ "HYÔ|E;Tx\9$H$V P{P_)1#6-s5:nfr $*" |bvn!XBђC-L~Zu[@g6JK]=Wft}]^ࡪ|ω+<񢂗Axief^xr 쬂}ڿ'sQ"@Xiu$#kUT| gn]G6:B&\A^權n&x`yQxtdp`AM7[010pPYnE1Rg~:Uzݷ\W,?! 
{<2R ^j|%cHV-\ r2"S7F=l* dȘd]N u cƥT!#s w[-`.tJ>R(R77*F_5sOj:#/2RZ#neNaB0W*BΕ ж\nI=Լi&칷y}JePތomE5h6u 8{^- H89W\Y8KIЫנpb~ExO M@"!뤥^n`Pw"@na L' as B0:jN"Gv$7h 8&hڗhlcMaeKײp5Y@P%0,iE "=s-YXpC$bvf"b!jA‹)oN#XYCkrצ$2tFQgQwW+iCXeXP΁+T'84'hi:EC@5W\uWaUjhu*T+a)љ \TJ1Ɠ\k `bf- `"@zbcXy a"c!hЈxpn4@v%?LCh_`c[v}o0^k#l[v8xcsГ7]EYo,clY1T|"]͂'9eMFXaG/2VˆiG. I'#<,5a!}`p7?Ř VdJ`z*/2Fz`nrB Uptd1$^j-X GA4\"x[FxCƺm7C5kt2`m]LBp>H*W҈k= LKl 'i ҍP#h޹$w =+J#`8R=>f +d TE}גVdTXtQ ڽ iZ(&)X?b:X1(24?Bg\'!]7ZMrQnTgdy.. {ZQledE?qRݮٜy;9&:۳1YJd`Q[.tY}{wd9/\!NH$YMz`H#>ÉpdZ9Ͳb>k[I8  &Qŀr9* @]|îDݾVx(S! mVi]l4R˙fW\ZB215kaH-7Uۑ(pxD{Ԋq!JR$#ݫ$ -eQ<,sI`8`/g ⥯"KTB眩PɈ6^^5t5Ç`n^u t֢~jriMwJ4-^yHXĪIp h"!Cš&^TI1>LcQCdp|'IY$[ӳ\``us +`H u7b߶Xe7/3ҿB޳d֡Ft&V$i v-x&Ŭ|*kħu0c ,:Tb:H.Q Cq֎Pg%LE[Kpg,'mtu^>]4 p/?PWwWhw]vP!Vk Bh$zs %`}$)kKNlM0t4݊Y$޸ߺ"za0T$h<5eMa+"5ùx,bJX), a9"rThCFA7 њ 0qө$FR0 t ,ixEHu>x N \Srr di?O+z zwQvK#F=$x&R}z\(Z+ōֵI&[ͪ%80$3 R{TzPm=j^(n?zG$BSKx~Rԡ/<%H`ToVzJGAA6\;_^4AN7Z;qVq ^lg3mд8#oN]4T+ zFv9>(5W3}rr+(9h +o+Z^2W*r0Am˜S_ptN3Nc/5W 9@Z)#^Gܳ##ۄ5iPz[Uuk'K׺6'3m,OS h\3{Knbk2(I2.+syX1t߳PB0,. bs)W8'c,Cʒd:@@nwR$P$I}0{Q Y7 )b""t‹ڷ0p9b*[7E{LKN <3G~ ^sEe+ڗ _~ gR8t/~Va 'W$W4WPa<#|τǿB@&QN2OAS$,+ IY Б;<ˈBۿm.FՌ=YtZf6k%$ j0džj &n[;/F0{\ [:j]o] F l|\wsfk Bvh>dXF.o)c;Qڏ.) ):e& b_*.}e {"7}\^dTxj._pX]2013rrLl/+xЌ}Lj2j,!_IR3kL RӚ_/{ߴ8)lqa0^k#,Ps4l9ѫ&?_3lЄVpy6WXͅ;yl4"8L\Bl'(9YG?_ ]FC}_1;+?_AŇLD%-#>:"Mpj =qG ,*2d~{ B%h4e0#Hmն٫(% :@k܂,#r&hV Q:9GHǎAj,=;Zz234 %d;H9^Wf.设3dd =ts/?Y.Oh/ -1X*T.Xf'7.heNWa={=ĨY7!d8rfSA%pĎE&ؘM5GmP㤯kGGs$ ) txSĆ+b %z6%!?~ptT2eыdpјw 8"ܣR̀W]OdѾvA1(~3NNw(iz%!2_߆UшnϽ c[M\#gK ]9b_ (*>|nB+%;8ts?DZ1܁PVBexv;?Ac<~Aρ)v>j$l>;8q,Kˏ$`M?$T'`2(vp;71?%TclN>Cܼ۟嚠]1chOԽ= aQ|G0b _FO 1Z, 9k#:B"Ur= gWәwp% ~)6qm݌ YXn.=?Pg5]*QB0ͩ ~Cj^4 'HgY!O6) _;FG:_o<ٴ PbG|OtzT`@q8ɜ?{`Ek{HxbO`>Ԧ"}m7_T9Cj %e}>;$)!8ukfMLvص̺'丈A1'`去d9!8* :~߶_Rrޝr*=N.~IM`ƊϚG ˛L үknxϗ. 
O2boc} ]HG RprZ^uq+"t,OPXv7ZCuIkG<~xhls!υ,9?t.>p&޾{+_21'm4Əw& HW{> xZ2`dteF]borMZOlhO!& 7I85)Ǹ;sJZĜU2Sѽf|m&09vNs$?DJ"!+7o%=C&t/;;)b4Oax\RF8#>B+@y`5Zd;SN\EGfjs1yoph?o 5A#]/Ⱐ%<tUw dTk:nqLs=+{%a7Zc9xOV ]~!ȏxN/GWo Zs7/ѿ٫dOS */*~`KcUVp?1LE@-/^qspcaUA}7/8ݩ?1=Pre&X:9⌊z vE?0i IDATbq d1L:ce>Sbݵz z1# X$C< m;o?a^12;8qmQv¶n]ا/РŔC#ڗ&ã8 uZn(uۻ &xW360lh"XcIT8 Ȇ+ҡل0@0(8 i)4S7sns1Ա%a3| (nC8ˎsosڬMeb ђ? s|bcN枈g5ZnZ=eWmu[W-4P/X2Q Q w;Ar 9]X(i,~Xݖ*q5AXж1ϝVS0P\ ݃%]XaxM)7e?0]p1=Lbr?<]~^C#f t ސFMx<40\;k,6QdMݯ(G,Y̻!I96鏻Pƴ (q)[K Aqf /g%+`m_h= ve3V>uu_\57En v{!mUw*8kxﭒW+.]$=[2&5YHmoRSjm"ZNFKHox!tL"A 9}[SHj) Cnxz`EPȄ\tF f1$6[L# 97A/޲& QM5tJ[eAYaOcQI+O1 ,3o4QܮE&^)f(FԤȶoI*˱;_xsBkHW *3}8GF%ŏn *3Čo^j)oq:L)Ĩ]'$}{G[ݴ31^V"gq`S\w$iNUdХY ,(& !,#C F8 =QuHH݂o+o9L8&: wިƘY6;D;vg߂Lk3Y-ScfW僚7nLW[gZw?xԒ&3P=6+cRI음/8C,jp`pCA,n@@nM=*H` [վh"!A6Ca4 b HTv Mhc,'lyY-ZHX`"=!WF̀B)0Qd{)_!KMx(7vP-=pdP *V"̰\r3Wf]ؒkS8/RPQ:x]+mU2q]{w"CLhT_kw-.p#MKLe,7RS;$WB4[E;嘩L,uLX227r2[b:}es 8W K=Aڴ|j L02}Ў;3Dic7¿ ̟(#"u9 \hXTe/Q1C$.J}+0rWM2h/Z w&֝YG[װ~Op>OilG=*$xrWsgGO9`6?슽iFY:\+Lice$?0-p[Rv;hem=mudK$hJ`:/1ђ6OeZjP6#Hyl^yi^&{` C G%/uòjUaZF@c ҷ$ߺrFfQ4n:y^o[szl>/%꒑_[oG1(Ӏݭ13|t7-+7r 6Psc\:\sN+񜋲{˲ +ҠsӼih OU)*iR r=f+lM>ƄlE)!+nLyKML'1/t43$[ĭ؆+o12оsPZLC+"*kя-mS76R.o>* *V2q 8Vb $* k7B/PEcGļ9 04|pt[ ZE_xf1]j .P1N.~UW4D׆Y8ggƒ'#]7TBxDz+"L<V  D7%Eʂn[KфEH$˒gzFއP\p/ H*oI<+${E>µʹk>o^o2"_WMY|AcE {`E1nO-7w,Mi?6\&2bȊ+$BW382;{ 2Ը!E ' 4dZ2][k6^k# dñd}pHPLH^}T3N-_?Z? 
mI7"75ӒpIPV*8T|9ՀEN UNTH<OiABs[xC9 Qt#nsA&;JzKi?Ѫ(ˀ#ty4!1fv3D HW/ (4Jƣ  w8FER׬(Y9ՋaG/af;'n.C #A c%]c Oh ?nzi?09/kdPFeY\Uw.Q R7ٿ,Zcc'K8}rJZ,M"KO󴣼Le%.av$I3飯1g#g%n4EA̾þ\ʌV h4,gdpwcg/#}*Mq s s3U=?gʱf9VfOG-pNcW=Y:m׌#oW+( B3 sړߨQx0an%$+UR=2.󼤕6 Q,vLEehz 8E!3%Aq|:uiwhL :f΀iɓR-5#B`)'Zm"kwL]2`C680Svd(qj^ǔQ<՟q3> Oc[4 K-bc9xw*'KuV,g_>O'-?5o|`s['45-goθeL,=n:$Ŵh9kv0 -5agw݉< Y3>=c,p(ΰ4mQs+Æ4ȮE<w1l(떳"{@ĊLM% @.fc&9ʏIT%]$NYKHs;)󝒾pbP0Yrky =V6Sc xۧ` h>m"_licI1 3`L;ʵG̞0c  LC-1sR V[$$\r7Цѡ]FDcnskjr bIꌒ%%yM{]oPK#4WJ# |͍8j~,g[GiuKxD V2,YL]I0 "/H$7$ olMmAvV^.>(>̈ {KPt'5]B0+c!ڞqH b5"ZFcjB @6aA0-O_䜐Cvpڿa|ۻr7;/KEGWsH2#+_;NN/Z8>7@lHZ NOw:.yP!#.;MxѼFo<@R80TEj84/U~GD97uK ٶnz_@a=0l:Zu٤Fle˲2Kl[>'Q +(^q45"RnP/#!s1j lɀTۄNkaуx]9w*BJu$ՔKBBRaM7C W-k+>k0CmX. |%e+9zUCe$5*\JvP9!n [nj7uZM×MC=mvM_}gƖ {>А02~*%?2THY&kjzY϶$g*7C>_gfԜB T-|e_cgd i!m3X?(&lq#?\{ڈrK4܎Ѱ^W+VYOna|u쏾+m(1{ >a7'~\ϓs;dL?9ߝ eԲ>W5zrSP_1no&|k` i #<:9;7x6G5 xmKV$}~G=`L`_/T98Nf 1rN{"{7']O"`ijww̋=Pl䌮8O1꓆d3VP][ 2_ÌrXF{ XOR9xɖ'K~]|Ґ)rRHֺJF9ͽr@˾Zd0hnOh¦wBdWh0FsLyG|Ie;a-uoI-Lb:TFì:]]-jng ]sF7!9v=/]"N.׹c.ar#x^M:zu|Ȥx -_quϱ=R,udX:/O|df!xi %|9G-(L&15J/XZ-霧k4`k]dOhڜ{<YGRa hl9xK'OI3FFyսv-pIEhb] IDAT {%w\ U_$5GpL=% <^[6GT[]?H̫}4;86%W8Q%jP!7\ : CaM^lZ&bG`bf߬s2١mwDk4H>huHVSoP3mt]m>\ $$|f9mfir-,)0r-`-wi_g#kѬj\˿9J$0)5~c+VŽB;Wr/* UG.<;GO*r2!j<,}Nc$<*46ۊkEÍ]ǝ;9orfUr0<3%I2G#/ـ 06!}r8mLS؋jz|Z〾ٌRt%?K霁 mZYÀLs 5FJJHɩoG^<[f%GE |2Y`ZtNޘ=Cꜳ}WQe$&Mr .8X~)$"}ҙ p+}q:+p 9m$l~̺b}ҭ] @nKl!گ9Õ+#' dKS[T>>̰D‹6Yk&! MT*n>@0o3Nʵ=ǸV;J͔u EF:J؃>8>lN[Uw}(XcNX9= l'l[$T,&8<;;9 (EM-ͩXU! 
FUw\hQPJ !Fw5bYɸ{~sB0%owB5 ٿpD(CB'֡*#A]$oAY$90&I|i;9?J[a= О^L ꣳe:V:6% -{$Fkg6\`qӠ- $ȑsL~^G0+yCG> _nacȷ8*$L3Ca CX"א-s٣izPAO:\mN)CSxұk-{.^з>XZ6|/ 3a{bbOj$iZ s!.U*ͩ hfئmX||ϖ4l$, js /D噫ȡ`;cf J$t75vGý*/I ` ~ cXcV|6uX211 oRL_}eu)v',fECkvvxGcH .eEE,D~wюnU1{wh}!eQ=+lI/@PiQAO7{BAA}-y,8ZI5DAuo ;I?}l;z; %HTSC&R[|h޶a(Ydu-a(6 ^#4mdbB0asc껲$rRkv!a|xCQ>M]Kh[okdў7bൡs`v4POLdtπ,ʸ!%Z1B9EVCȐRǶ'9SaՔ]l6؅;R"*V O;lOjH9-@ Q $5Đ^l^@kg )-KPK`+Bi̜`H,ۅ|=CJm]6FE(67y+>Ξwݱ[)P a2Ȑ93{Nk.U@FUȮ%ݾܖxs}k15rcyxW4#z*RB(~i}$ĉb=QZjŷ$$ w Vۡ UY ]s{b*MhyF_50AJ~n%=KKݾ7`{K2RXZ YaĿc.cKD:VlSbۊ5°fat}c-\UbD G.\PpC؇2);Yvz %*nρsJZ#Ki@c}|KlmK!Mٴ7Xء'dj^f j\ځ4m"IwV7a֍,v͖?ť&5!e]ԫlЄH{"]ArJi2Žz4=?g/Nq.s6ڑ-!'%grwXWA-g_0 GHݱ:V<2<%w j->īhM153NJBĄa1}3 @f _]DU7 ) bR L =.,rdKhK]yu 40JZ6C80l4zYPۨ\3n+t14k2*CEp2QpF `GxBx3w"o yBٌ[c*t˂dcYd)G\W_S{7Ŝ=msó%+=T"Sr[Lo<}. jXh]uwe1+(1wEz0\Cx#AʕKɸNE9aMDm2\OZ)Ax0|E+L[הD+k@ӞT.?D`ZïTd$T20UN1S4LQRKhx3pyPwy*$ &<7` }n+tDž4i5c"aR-"G_֟CO#~4c$Zw-H@ڄ]jImW 3˃37-6]K{G!tM !!!J 8tє(Y[8]~C"NFUJY1B9pNpL&Eٝ]!bg?F%u1` Hr kڣ#VGcրݖNEz)=EgbƏ]m :nE.C8ڐq4i{vviDs$xCP]oF#JT}S(BcF#(-w  9>珧~'|,0(73\M'$(Nu!:_GrмlH[JIWc>-d;H!8:xA`P@Huqiae< 8#  85B4PL8#U}8b;4NVyw]=XW+ҶxMc07W 9!&8's26nu\4 Rtk#ݡ S)2(L!+q(myYW{c^egf~+!_HxZ$EP׊`+iAӪqt{jEC^7wogv Wq j#a{CTP,g/_z> \pb/kfq;+c5NG]MXԑ!"dGrAaۜBwB}ۭz%rXPUógst7v\@dCxqx[SW8Cȭp8AtX8LׁgQwT,)Jf'00ѠMSXhKCS M%̦0ԟ_\ hyֵNThN2< ]C@$CovU).İg E l.%օUqR$7la<*>n E,^ز*X '1|7E Nfz4|l*v21٭lvKD[T 9zxLr뚲;. 
EhCg_<}Qz4֝MEm:hJ-ݯy_wS֠%'Tc ʦ*]}jMd-G 0-y+j^[ո2R0>M pC׵ aΣzJO[)[ZhߠI4n`֯q >D| IWcFc\ \;(#,dxRax3H?NA+BH8Ó-LJ^HS*\N=38yg u#u i_iE0]ٶ` d)X'լ̶Qs!&Ɛ -4pVILτCe*P`-w8c0V=:iIV{Rz'3%Y"4Gy`|E{9mDR3{2ޱ4., d?z47-ehMtrv!FI2x%- C] M:p`=ĥD#(~!Pw#`R+,9&iIx>2wu8"lB \aCvcpBХ~`I}5{-'z4L R1n$23>5qnv=jG-iQΩyXà~)-_sf {'[[[J6l1.BHx~bgdH8fRm],U[/9<~'l8E#|%<~^qpԒ:O6X/fHKF'ϳW%gOh jUۆ&t0;׀w_7=L!1{8Z"fvBtFI7=#haƆ A"׷8Ų#!{ Wi5hR Rg`3L>$7 #ѱ_E}gHCׯY/ AuY;=d^bW͔N؎%5# cr SL -l{tzIʫ}Ã(Ǵ-<5rtk}lؚ*(ELx5Uı3h;¨l)3kp |LsQ#x zjLKIfX|k\Z,C2VV 8F$lzqD/ N&+ @C$Aaj:d 3FWvZ8 ~dq wcޜs+B!%2o֝^l8;rpP M2>75KE[#_cɏgQNMQi+G'%a2+R0$2`$$\h(9 fnvg>ScL.~}qi JB$]LJQ'ߴ!!olI23jY (w*^s?󆟆 6ZNyg/Fǯh@ ׮%dr/Q^Ӵ_GG3>ȚNč5'蓆 ')y]af٦aph8ePئB6p?Yqd'}!>S]Cz)ܶ\77UY)O!+9=\IXɒ1 Юb+)vL}+K(!d>BAY %AFO THu@bbsH1]}7:-}vy cF]\/K.]US2 c+RwW.BxqE;RO1 I/h&s.lFDHMΦy'#-oBj.ol@Ք$̗/+xv쥼3,X+}|˵"[Sט+n}1.6X2h Ė+B^tCEfϹ;YjiWEg((.PG-{!]xl:yĖJ+5::6m7}tc 1W=h"*c0!Rr:fYƖ}i%k& mE 0Yl,xܜdRL[p˦S8ׂ&d2ƪzza#fV] iYb;V!窹;X`aP;p ^_k#T"9*_!^\uRJ(ejA}خ`Lx%0;𥅈-~C<,y8_tez)/^T\:%miNuLw@e2$"\o3#[؄rqDȎRJӊ;cqє^=o_ IHFPY 8-@2&sc򈃳dY xݬ>m7'g|4(%ų3=mtVVI_0g~m8vaز`ꮍB.{~[O@.@2i:cl_+Hlމ wl[]ﻴzs8 ?ohv^'P[&!R>>Ovq528{ bJ+& nKyTѠEօTnBQas[ qBۖdqvp{)NB[4!aNh<}0*Jvo}4xil؄FmI1vbc2aw'2R1bTls `o;`CŽ\EBz(׉sĆ.&rm[o!Aê$K5:w*& mb|0nzr#hF$6Ȯ#?3"5# Ɨ yc +eܘvEG56ogjZb?joc f JOn=~@0QήsZ5( uJ %B^a]Ʊp*Bh ~ "G!E%`E(!9*tQjA"d$WRnox<ra(<<;7 :qOa?޴IH{B713\.\pDRV#3 4gUWuݕwf~)?yDT5&f@n" WgK=_ eHΥH%K)" PDC{dI$z1]x*%&K[8|aU߬߬q{GLm @/r2ZSGRsu$p ңp IJePEi55( ty؆4oX#\jQ+ Զ= ,-trעFTaٰٽ +vntv&#ƟF&"d+L*zC,_Ac9\p#Qa\Yg̐3ʦLU 2"LIx~23`۬KT a4ySJ<5ǜG%gL<i4DT#\ǘ]B觞>ww[/ӯvP˲44 ec?Q~8zl 2eE94(g̅49BN:%(X_d͇6\܅=SKFX@ίMGx5_bb΍kouIMpB}15 qEa %BR\cCLȷiz3)#17c`+0ڲ&)U%gViEdɷZ\#9K:in-R*#PKec#/Ġj8lhT$Ib[B=6<{ &[͛0ds;;8ol%D(h9".2vLo^e|yOIũq,W'_ *ECJ]ڰfUz 1t/}TWcE A#\=c* ҒA(粕QݪHFO֋,,vT$[!A5ɱ1ҬQ$-v2C`0hIǑ^A#Sg\3DIg\gкE{+Kz_`ۿ@~ˏ)7$A7ƒSr4Y| <5r+on65aSˬ]^map_rѱ[ MZɹ%EVɓ1$2>)МeLmI0HRBz)=QV1MV{S}P65v=C)pcR>822\ V? 
E3j5Fҙʙe̓{!FmH07vTk\sLը@0SG,߁LE wJx(/ \(F6$ X#W Hh]=.Q9=K}M8/Z& ON f .cULUz`-" RK4T8/~qljzy3*L֨4cwy}1cOx^}mՑ$&cXΠ> (O=g\T%G/28~ D:8yur9>T,H[\LuJ EbqzyM}:ヌ80]8kVM옸si VG%kM#W"xrjJ^6x>_8X=x ,u ԗ`Fo[8 oȃR,6U^8]JƆ+Y'c`2Vἐafs+^bWgH0 oq(xj[):2d>ZVA(<ׇ[v.f#4 ({5/ޑڅdDs5\<( ^N"M^|UFZ'8 2~VVb R.y鬡+GWlRPf4&vY7p)rt!)6T_Z7IcaGG9WO=W=&`eecNLTXn8њLt>gq)v7ؤ;p@ 3f"z$} dTLd {-7GaTW?Q+-`bQrӚ"ϰH[喠[xPf2?,0,."m"|u͋W5+ɇE[ՠjڜq '5W']a,pf#]l1"\"JX2i*׾WIpdoPtF,#5 !Hbǰ4Et؞.:Jׂ >2'CAU.ٷ #R-Wǁ%8c,ŋk _ܯ N_qM/Z>qycg!M NxLa2Qf3O5**p"^I(Ғ{ B ݠqLdTNۚ 1\P2XK-&y.6L )̈GYfy6\(0dm=_+yoY5\ԓ5p,!:fӨXWmCr (Sk\PVtYڤoJAuпf1g0;%U=9P3?f!ꊉ4iè0f+{T;9v|dhO {-&4t A.,= N4?V%#2&S"xte*⛔s8<;z<$EV,W [8ޢh?Q[jf4vZDֆy .rG^U\~U~ mP<',Bwӊ/+n(օ:Rʤ\sAd'|rӄ9ds2~EVV~T2* ϊG CSzF;dwשcO_s g!#M6!d$PLWUnw4!E.|TPy-FgledŅ?w?ԝYO4TQ/v9SWF'jMKZfZB$ 2D|EMi!fwS0q3&g!%j$rO >>ET*lpuBfU[|d+W>nىICӣp K̹Qߖ|[m#??rkOt~OfX#"7{uw3 ?~;B^]BwHi24/9Ȅl` &(ϷVŴDᖊHbղ-\6jıZK,[J,j힂%6YQܷ&FfMo {T0@.<5LZDѷ'aU8!솤\Lx"TĜ~2CGD"V =Iҙ3OK[$$?VMI>tfNh7]ѿLX-rɷVicz _ل/ ~H˻y򾽁cKNK7m$ JMm0ZrfUGeC\>kP6%756LÄ.B4~ѓ LIKU2C񖳣 1}dIsyROi=߻cT %7ubvE4-YCaƎccA*co xl?QXamXVs8Xy\"M*+0>FuZr 6#bKZ\{ 1![jI_k% 1!Kd, rVzY)K_` ;-DB2BǭE]fX\OQr5(ĞjZsy֦&0ͫ~n.q#( ڈ%ZHECZ+q{/I 7݌k%u޻ojri^1rQ!8r}%Y`)j+]%o'3jvaJ&qއѩk%E0]&\n$e l*VQI#w]mL@TpzP̕ "le<͛sT`Z.^40,(]i-ٮlYM{3'[c IDATҪKQe LHQ2lSPa$}^5ڒkb9 &g]I;܍:v xɂ_zoUh # Xt-n(ֶl/3l4 6xFGWah;<  Cw?pXl]EZ8Wn<بa(j]>|ChED*QCޮEԬ+Y(Vˀ,ON=dQJIi]ļI!R&#a/4X; 늻=~زf<\ ˙BuUDVW_rocˊAS5ѩ4J~oY2>eأT](%)1n>ڥz>dtD@O b$Ag!ch4XNXh {/k$,j4sq} evF+#Fs~ ox0X·\3.N&4ܜ48*fF,Hꚛ¼ې&ճAhBYăі|UjJMSshO{\,VRm46viCo 9!ZތQ!CB9gHw0KGL ؆aBKf"c.e:Tӭ!:CDP]c6TX R^?b?eq`&z贡o o`dԸ6(psl_q;aլ%Ҷh<ʔ hٔn1EB<0.OR4%, "xi$}܌tq?Vq H2o5;G 0W<"bQyjgAMh"{f^yp弓 QLӒʔv)$i#\"xWrˆu"kO6$0qyIηbXM$y%N3$@$qG)!5*%y৖g=^sj[PKCc<J+ /L8=|H\Eה[JrX?Gk\-Yzzj[@0n/y,^tEix4sF"\vvƛp7 ຕFl,WAQ ty1mKeYE_@82j6*!_%pBPRɔ|^X`  L!( Ύl% N*teJW<}{-ua!oP(PN+*Nn2(>v)E&2<S5c/w-JblL6_ev"I!e5 9NjzN9B @8S(<0{{鴎_QM˨#^!NJX]Ȝx%{!wȩf?֏8(؎6!?В_1ي%؊3&ּ( <{6fc_ґ(Z./Ck/1)"HNӳ5]Z.^fN[*ؕ$+&L^Q*9qB2E4@RNAx)56ܒ\Iic0_s.c?h2`,WI±9ix^[E:yq3woCM]2qxYXa?PbuO>"" 04 
x<ѓ7eHчcPdzϿݲHQwB^uB34 d@㩵fB6dxTZ%7}lbέSb^p(6$a5I}!_2%deJ0Z,9}hAQIaayJ&PY?#{_+O4-Cj,F>9y "E42g%HOyy cdaA`78'׻4@ C%3u- ĖuK`!Fq5OsW* &KpbKHS_7/yf*Rmy^f<1|:\1r)hdl!?¯N2J=946(g.B2.(͉#SI4%,Mt\1~01(5F)6rDZ7Q6snUK1\>mBFgBRPzC3a .>dĚ &%փj1ZZ;/joơ{ oy`jƴ"`i44x Y=32qohii8,yռi5cfܫSv)1 Ҹ#9e=v9-; oy$<ɄF2fyqJ*RS%X FR'bVr|?V+TP\$OY>frPe180CC2 1* ԛl>hubGj6Q:o0WќbCNШm"]d`PЧ&ǐcQ(uQ=ٕif." VDmB jx9dl`V@J0}E{cϸCm)7H-]O:KJ$'?l&Y u4㓚'Lj˝$6#KYT)ͅbkF)T>VJ18/x5sAK\sk>mUHra=eFMw[MحRk 5QĢF[JK߻G{fdxZv~1-.VU`vͺcxG#]s踅0g!tfYh[d3ߣ%.5hpK Ͼɋ1Z^Gpe˦)2' O Bx|2!@6Qy%vU1Q+Vi)+\"LQrsGoۛPs/eIA .ps*g\7 ,nң7տ0e˜YKjN@ c4arA.;zb1[8j#|f\Ȅ˪Kev@}p D4vi151XIjl#eH<O{ Ŭz4xy\]/c5-{u<=l8v kJqח_?ï 坜5inIGqyN۷|SmĠRo>FH55oFqG3&w26FExǁɤe:餥|ceZLfG\2mVo`} lRbR->S1(LC,*:C(Q򉿤0 V OOmPy. ްv[Qj :g%_^r.Yэ*RKV{}[Cs뜢9oA cNUMl\T+ˢ61!K0/~7mtYV̒h+ λf_T joB]c)}E<1UQ0NzSXc`W,iJaTqkù3q/m=Ǿ 'q쑥`(Ȩ13**c.~qnF4ܫ\5vol+ecJ; OrN9B?FeT3l w<9q|䲥nlوQ̵ r|aݽ+oo({'ޠOWBo *٨RRiL{GiFCP Sq$#Ÿ36DtDsŌ*ffqd aOcR.N VGewq~WKL=%1!+.Mdsx;i3n,8-l4ȭ5wc.|yӱQ6nJʼnS+>CUvw\&ӧ5m+!ek0ZKӊq6T;>&`/}:TK\seEV֙ZcFǻ@ 5ߺ pdjΛg~Jc6֮ca>m֨ +%knv@͉ 0y0% ϋK}.b}0}>#}4ɼxL$̥@Y?#'Ϛh10c(fpy{~H]n:/yW_UKivWٴ 7(W0MJ\q" ֳs0~OO8*7*Q s1 G'/5gD9̝U cZ@PdPMc;k9ckC= kh1KelqɄ[sa&^ )|{8)p(( 1iU,^1 c@!kĒoXd'xbLT b#ͭFVH)vy9%jy6kcsQu|>ͽA)ckh% =ݯyB}شO(iha,5(e Z{o0g;3&5&feX30pbVpj)Bê-(59 V M|; 6x55TRhD`b3֭12+&Z1kǜ)f#%5P1jG\C_1PIe0S\L+X+~_OOR}T,q~>?;&}@#A3,/jG$ѽ7.6X}%ˣ sNW .5U3SOk -:\U.'4* Z 6y-8lpȈm4S/y!$VB2WMQs͂e%V:fɆ I٘99Z+i;~zn;^ē 4Bs\^9|Gw3pI WW5zZ!}>D2=kUz{1 Kr BOAԱ.{L\I{\h^:5\MxF c׮,\k.ӗ]%REĚK_v1Zj6D]S `vhb#DU]vl[|)懎ݨm>;p2Բޛ_@ʍ[=lv,w>W5Oga?Oޜ/`{٪c2nS5%bFnA!k؋:avǂAXLJWXҔȢc[VQ 9q|WK7_B1Xg@eR \6lɊvmEN˾){4KoJ6QD%OC;$-< %C;Rפ1 gÛ$4' y.E&C70T*C83'P[B49hqQ .?1Mڼ&߀v:/ 769ĝosrj,Hr)sͿ`>]TDCsx:b$ 9 U7" W?MuK/kMIpjqMu`;qRq1y)q6£`V {JdsNΆa& '3|t J;]:[LZF~>?(+4:\ 1-i 24,xb^b +KH2 :f) !.E2 ~X%`pf 3B*S }V|4DK~# 0UՃNɭ}xC9L1wJKNj>aIM})'&fadh spg\;,0YP'yA+WouW8ZRIRuCee@HjbQMneS%bb`lK}}H&!0\sL Y@P|fɦ&c}e ֜{Q͒^ @FqR4lJ(PWIH]ou/\ȴ!տ&]?oXpBzzGMKUҽJɿl3 X!:|Mxs˘ MED\i0zJ5bJ_|1/s]>vZ7~s>22޲F/] ܒ%PhoPmRp9DahLr~Κ 
WǍ?֪BJL1hAeZ~(kGɑFAi;pi`u w] mpJT$ɭ>}0P1HzD?d0e%hMEa=2 $*se5G_}XH9۷2F&g/YbHÈٍTk:>1G5!n!BtfͷX 4Jlimd}Wƃlo[ )kPؾw~0\I".H[2Z IDAT"`w{ ~ N=d[4 Y]nr9'GcnKˑr;P˖Api3!dҖ5 ͐}sۥ|{lLder=o; Ij ~ Vc\m J ^*_"BĢ&>t/#hKzW(cԳnzx?s݊Гv)o ݱPM]M#<x/jSSGO4%⯌ r]JQpaRҡCM@BAB2X&Γͥ%6A{Z:&ADc1q N-BȶL!{^9g(+3RtF.{5oբt=Eez~RKu"7v!kORp;n<)sbqSz|WV}:|HH R.)uB*`6 ,d )yw&RDL'޾p״?T?+ټi9iHyKueatя! [v֡ζx drP2 TS/,Һ@iSmǏJb#ghEMǽ9)с:ē+܁͵00,N"*% o৷=.KN(qJmkxQh섒 g:jvsnMlJ8.o=^D6 Ѓ]&Veɐ,iz0Mr^u.vs\Pr]z覝?ѕޜ#APV)!$Px-"'sys:";l2%R4\r^poɟPz}~ ,ݗxjk-&{:t8 BaH.kL1Km8( cXӜKBXK*Ca_!maF*eݼD`Z:RNKNl%קNS DmN vYdubJr$1&.ϲ,H {@_ 8^2(SPW9[e3)Ɍ"}nOV^Hԫ yFR8..64~F0eF阉ҩlڎ."eL˵WVT¼,\{ޑ7F겆N2,t]Kra6I|j fX-Jfr[2+˪sRJ1:5yNg_qCkޒuZQDG9D0[Ud4+8"f/Gyй!GUOhLgK6-+2WAv~|ؘF>(f.9{z \[esLJr|b=8<7>-h%݉!ր;kx-Hwqs"xל9gtIi:A]˜Լs5,g#NhyA!N""+*|[$b9ќ!/JDjc߱ HNjo!|##4#j+L$3yUA,UxR9>: ^2(U]['B^(G-˱LH!˒Ppfu_9#~_oA µ$*Y zg+ǑQ~xsZgRduC#֙i[HOh#J >T*T*`sŗ|>b^)P#/mܕ(n"7i42` pR r=c% ,$倡!K ՕqdT Hg! b4.EXpOhh8 G HY!O 洼.Ttttk 21֑qro50ԴbP\oT_df`N퐣3ϣ cGˀFC$=əeuJ''£{o ;.ʲ[[x֘S+Gl 2]:#A/JF8Fd.SH+#n1kȔ;X,?d U]ŖⶽAxΜKN`PL`qT`]2gNG :w%7zc3vN6 X[71[꒸.[˃Â\ԑa\rڰ⚕(*lF?pkh ,/O _i({b. 
GPQbpN6Lb+ Cc+cxapw0JLGnc/yϸ]lM퀞3tpWuy^MgC&`c&m3c9b&sΨm)1}y Τe^[^|]StPDhq{M3lavPtQV 6G >ҧa]՘ץA4pSMKgC}⛞KY8\|ݰgeZ^ ܢaEK,r@4ToPo0Oh;:xMYe 8TvB4p/Hާ-7Fnm_ =mt 33wjJ'1$a=Lʐ&ǜ]5#gxm02Y>Ax,iv0TiWt.*Wf\hl"F=^ETmZĕ[7pvy4p$0q]pkaR9$!2qiY+,3h]yQZZeU5eKf~W:3j Sx+p}!Tv VY\R15LAGD* l?O"3#`o i= Ӎ!^f@p!5O?XQkH]llY#4rIЏ Nz]J"_' _Bњ]Cc ~ևv>A vK]d ?v6 oߘ]w6$b`Ńɝ{7E:"%nA# kSsטB8&)&G-u5QXv< VP}Q|ɺ~__pڝ3CJ&D-C*n-yJ/n#7`LBA77nI1갚s_<?7{1 $f:#>| [/aowɼI#ub%c >"bᑡ<w>d-ukR԰-]1N X5L65ckIX%Flb{qAq ҽ~ãY92L=1̩.f3O&tA]ZE7A"8r3A2W4 &@Prؠ1){w<6t"-a9L S R )3轼D=uG]6~b]ԈYxJ$ V%_.eRatvR2J82mQՂYs <[-XMnkz{v4rjLu=︈ 0bC*uŹ}E!,R1Z8z,..+^'ۧ`4vAW+bR\nJm?xvOksz',pE2P5dR_b*0>/6 BuVј9`3_ww8mˋ@ 7r}Ԅh >ҞKEʷ f͂c 3Ij)q镲1mv9U|I{Ȧ"WG'!CW %MlBVfU0fyŶ%Qv+Xr}or kM3ⳎB HIs_\; 4sg, k`22\^*ɎnH^t}h\:m<_0Y3X"!'皿Ϻ?mcJЎ{ Ts0@dRAwQ}X,1LJ0.(ޚ2̹}“|\/`b#:ZlL`,tFkHՕ W=$vJ2qOĴS:mžA02O=$7<[9X/#=W\xv&I 0t-hHtUJRh+'Qi6VzbZ!Fm`%nL?)ZF`iЋJjgGXca_ b|ԏX %fD"l$e6(_pհmxItI~&#^ѧRqF3\"MF" " }M&a.Y9sPS)m;rdJij9[V#:/A-7M6B_zBi pW8%NX(\@ێf6@B64-iY0oZ:Tl-4I޶,@ &3eh.95#+^YWѮYX;iwlJ^@ aEuS0JtBsKRϥ$J3Q1HfzʱAGc * xs:21d1Xpx!co:\s ya^4Jop&R 4P=@j!TYĆb2 $3C姫<%g}+mC51M\&$AY"Bhp6*V ? ty=dV&>k 2(GC0/ 3ε' 1,m+Vh\E{ܷk~G)Ij^|Q}wwO{:emK]4<ϸgz+&R|0b60Klbaن]qyM`ʰ]b13,SRh{]{MC=] .UɬCsM:ԷN.3mf5a*!!7x4&r"r״tl9;c[Lg39x--p]pS&+(&Q93ǧmYXs*ɫta\?rP*֖lӂ\ى,CDSK ϟultYٮ k2cLWqse+ޗeoebK^YփX-褤տ/0d2'J>#_rp(/9S2o˃Bgs88iłp9-݆.u c3 %-eH:D*S1meo+CC-l./d‬rL5R3)`u:~BަtI sbq>K\'[P&ch,G3INedxNKNNYC:ےtkpxy 2|CR(X ,8OhxSԮ$JoBk½9 Z 5Ylr;:+-#,UA1K~D :RX+1TDZ\JRsCQ6 93^S5S@ aa?΁Z)0\. nɄǺFq$ښM:VDˤ_x>Uz핟aaq &ؕz[-}=XpXdaUf]z.".rRQMK=N\ǡԼSsg<9eI7HCdBTc,r%ğq_SYq&\ W4oF#( fdБ%gmWx\IQC^yp֚O@䈛󽏟Ω@kLGq9Gтk%{azb̐Ly!2uq]`rEp|(>%tdH9Sn]B` %j[SgFL(ĨYNXbE_ML1. 4LJL<5CN̥Rp@q꡻7OhG4Q*|N['ԽD().O?"9VT+KЬ nha`2B9XOiP ?2ڀ QSA)DLœwv+$ԄF8|yeWoMv̐m`"LnR_v_0(jޗ)}N' ysȜ=cSd+„?eFrDHiޭ]|~ΜKv(&*&]1թRaQoPrҨVBDFv@2\bn_ C~P_e&ޤЂ1-X1BF̨8F=.}E A4?Q-s;:k(n no! 8ogyg<#޶[5W!X`yLYpoa]L"|㳇5Eiٿ]`4uXT|;> <  i͢+xu?ㆯ Rp℻nuYG1s`xl(I%t!Ev^r=z_N iÀR\M:t!V]eMb2P2A.G٫#1)K)a!tCDNG"iH Za[! 
AGAU{2" ZNWz@(!}nیYcN˱^WauS/i-OԁV F|b+Ԛ>: &\GWe4 N݅vBWQ3D4怓~H.]EA{YP\Cͫb6:so/Zf*JC"\G'̭1 sKR8N5C3`YU,HC0[D '`amÞcp} G_łA$֔/lLδWO<M|Zb_9yr|iCr>'^QPU:-#N;Bg&?5oy24v)#&z)X;去k&6UCpC*;(e剤,x(iێu^ds?gu}.>%?'fqxr-:I(%9MxTP ;0*,47Ct1㳿?:bnhl_ MQtg5.tH;˯:xu`4MaeO7&dRۯ/>Fр݉ǜu)kӊuq&N9 ?!Y5$cI3O[씇|~A`QV Bgtp`=/).R>rwa||kE*V&Zf'8w'5 F a aSG#_]koDaS\s#ųQUL}B}Q0{Ycuki,Y'G5u|n6zTwUŷťYk!:.jn*k]KDF?[k $XWdxvgM6OƼ cMU(`d~dI3qPJު+[NwO 0Ђ Hvf|>$5 Lcz}u!=25G\uv浊8;wy ΁ÕOS"Tq9קTPDC~>Otx[}JZ8u;a1BW 885=6)'~ņW.,9v&{ Ru*N N-%hH _WܬĊ Nvey8cb!]!Dˁ=b*eyu7sK TX=<'Ș&dhj'N3 m<Ӝ54fNDw% j0Ldf`ڵ%pfAN*2Vb,sj Bi$ Rp} 4$#{u+f,W]K QJNYK$8MâaƾX+>~U~}c_7 +o.8؋bZ$ϥTsǫuwW|nQ0ݯY 1k}ф)(&#Uӏ&N fBftk5VA8;a65?nˀ"-N[pGѻK ~8 ᄞR$Gvck0g!LI8zKILrHgoT/`"G8LX%hTEro`tb.9{ةcx hX|0F(ALR 70{L s@K9]`MlQ~h'gƁ;h憂ǠW=:*{5Հ|WLh ԽbBoer2Q lkxTMKw͌zj`3^!'\ S)nݽ &|2t` ldU[ *ޯxE|L j75c ˝#nY:_+ׯdUΔӳO@ qa)T|<, J|Ž-=;˯?iɂ`=B&_lXgygK3%Ar)U/`u%$>"l57 ygK1/ӝq''p=ڤmup}9`8f$esI=I}5TrٮyTI4 5E(gJj/ۿR@nvjVc^Ú|fD")ck<sonz,gdOE13w=suC qDh)4e7XuGʚN+8'sL:Iy~2H'­ޗ\(86m96*42g }5-oؓcXaudv樖a&r{k211xq:{~ݔiDҎ"X~~ O$Bx4K8RL*O_\Zڔ\ ux1Б$HMr:B >34ZT)"IS(D֜uXrM`;[* x,Ĕ  !S 3;@M4-U ҜPĂ5#Ůz7sJJZϖMp}j2`T JӚ}SR)0IPsڥa LXNO& K0yMrf-pZX^f\s~U]C`}B찒N8{FѴ h@b|a3cSA vcXvФtDzl }hG  (.] 
{Pmص_T÷wjoJn{}Ad-P`|CufA˔2xh2X5W"@" E(zē`9)I~B !{O h'?oN|Xwjq: Ҙ2TѠ8pȢQ qmQ,liߞ{" :Y:IP@)G$6jvHa90'؂`0 $G$ ibb,Cw5:uTfc:" 4:;A%Ahp:5+l7D-+lP#R,yvnJ"h} z Q^]a $2*;{%2;K@,`x@F筟̒\`0Úݍ's͊KJ䳴5< 7(;fVP i"z&l~&-S M's a= yثAB#h"x'r _aHcg[ђԋYQz4({0i4gg#Dv@mzL)bZRT v\o?lC'>E#N=S5dcC f[id % ܹ'9"4Ҭ 'Ŭ7L>w9p(.g0=cȜH2k(#oYtt 42DEPzZm (Gdz8CZg6u+d܆Id24D<)nﰳY{u >r1xLTʬ:UPceC ,9r:,"Lbڞ9[yÝTD6 G½;|XBX(q ISr="@P3A X7imGԓ!R+4b[vu9L925&[[WV//(QC\t$=: v\3^"'g#_7K5 G ;';pA fcP8|t]O\1P\nF5Md e*~B΄<]7ɴ@Q>rP#RooV[;*z>-C0qCs9nj+H@>YO>"h=ڿg{ iwkL`Z7w/P}'X\T0̛B2!v6b\aDF47rD7ٴ vg`˚$lM{EQ$ƬKÌR*pNWz,G%g=-x2UrӒVe̽%so6HLAGI!9d<`y!_-a5Txa2hTf3E?4 f a@`SbcC05W7/+3eI̭E1 %== {xSnASfiTXs^Yr cjGàbh8IԬLorւQO_0ƲHĀ(]ى %ʄf< ~" kы# 4Q8ؼh;0leM9`&/OM$BhMN'됹5˜Qfh?zߚEAh—vKE&DJ"_k T{<\pAm$dkhYsBL sބ47"\pAGCl$9+&^+y Li%$xm tCsDAHjaI+ҧi0.{i|I$#+p\3mӽ'(9|[r-9{M-ʨZWc+Aذ]g(hԲ~L T90Sm[``- 46h@V}bX3 O!QoA G`cT2l^#%hCTHMnȦ _L#\ ID/7簺152{u˳XAop1eyB9a ۂ^h/#zj1ttMu7yqlM㛿EB, ,! (!NRhpIKbPXkbn||%JO#8TǺOYO&kn䷿Gu{ÝC_ps'']!EGkh*0QM@Mc+W/")YZPkչ_nx$Ub֚[7$4hgjk^/=_}a~90:=GWs<s2}DY`bʽ.Hp[}@*~i 9N|A8p_lM8ûKbs@4l3>K*fa)hLHz¬nk JAfDKlG48{gX xc)Hro Cɏ 13@T5[sj *cF<:R >wkTVgzdS􀶼&(1U8n3epc='psS 0=jydTK,33ٶ$>:6% =.ي%x>#D*4D~uP<⮻%zC'J 'Ldt&,|㊻w :n(*WT Oud^0jKdKV#BpuKC"HA _^Y&l*W_Fu=cC-q5#GCJr*J, lު)Ng9CmԺJz=,?9JX>C1; Pt@Etq)u?Gbo~M_4dZӈ(tK@ktfoW?27 4`f`+!Rpb=yJ"\wxߟ@ LUa94 UMNXz峁{%|xZ`/<1&7abT-bBˊz]! 1Oa±((Cܡ4e6:.%QnJFDrRن@O!NZagM%M>ꮄ>FII`DudڕoSz &`A#4cHS4P=tv6Dgܵo}%Z:< ]C6nn犞ZB{JfiROY`c4r-s1C^_ SyPa'`z)QP3HFYR|pQaJdTpmf%PAh@= {Yr_>;֓b(8~iU`Mw?ҞJJ~!IfvT7w>߱ gHs\z QvLq݅b.QPxДW~4 HƸtTQ;_(n*Uh33A|\U0,ӕ2Rab;5UxC|~ٔ3}X-rƝzLfh vY%Caha(}NFN֨L>ó ~S}Tw3B;= K=wm K_]77a N ,9n8f_u,>8e#pw n^6_};8gmVVgae*6a1{,cUɡʨHa58{D+hY(mT-sx{cXDFn9;M<*D(0 vG%KEF5**X-8{2$-fyۚ<1>k-\w8f~ 9mnsVy{@60VHC`|CV8_v0k- i\5_0? 
0_?`0K9oK.<~w:o+ks()O!:c.Ǡ`ry}O){Û}}1- 2P^_{d`:"IM٨l 憂͘'аzyB)0T107B<{o~zP( p}FPQPx ?.ppƅq(@sfb#:@-c2([ɱ8#T{2yQ0;.TX1 )wCVKnx{KSI1 1 3·&ރn&rk禯`]b 6It4洿\cou,+|L#3c3&7-bWpY`^M?IӴ;T Mb7pD>lHBLwFkxM~P*Y".C@T:wl>(>x EC|y]!E)Z3e')HW_Ξ}x,8 D/Dq9lD5*56~;k?ݖ%?犤q\-.݈l sMd>Sws$Ȗ >tE.E'16f4>&W ^dzOm͌!0`w Q`J>3@ n`xGeA;4t&trM2zVRig`tǂ(4\N`Cvº\FЗ`CM)G(j yPښ6$` 1yޔqc*xX%D%V[Mt&1 |%k@{w+y۳ihn5* g1ad$׭scXPr4bx6fN#7idd C@-U(TaNYGy_DA9`DacO)f@MLd ұ,wLo5 2+Kx PrZtY5ؑ-7AESU cclosiEYW7z*7T2l*zK`A3Ew R NA,eQc xJj(/\ mć'!S#XRt"p&uPePD\\"!qW؎^$:xS؋\T^ e'?p@eH(*#?4{C[VXD|GG%A[f Qdp7{|pt2#a ostxx{K>zrcUlY0vt[|4-4DQ?H٢/kVb؅$yٸ!FC}󹏲Ms-b8foeÊ wc^*B,%Ot`맴QM.Cf[9P\Fqcwd}b x'X8("{%܏Ad͆%4\Rnj1-KV|dܓ(V@AL/:|IT)Elpj(y|_bۜw)t@ _- j=dG59n'v u$H?*"%FR'k>9܉'X2z~Prf[^AxUO3ٻrX&7wRN("g" zӷ E0U .b뱨\ _n6x%."VOAז[}թЩzmxqPre5>k.]QQ#1{z:IۀU 33 &*A F1L씉IiVR-HnkT0d TZ站8 C%1*l?מ1ISkTJz ?X_1پ Q\`CEE TJdz7F T Ri&e6)*6`bMD*3 S(I\>F LBIKvL(\+:z:4(·vRզ`#k *)<4TP)x6FJvBR-黂O0$CȭT5\G^ XcqM7Ish[ ` fv:vy}D<2,qorG??չbV)T@SpftAܸ|%;^MQj.e H _cb˭xV h/5/ ~"k3,`G^=!|K? 
ݷgog T=>Vwwn6έ `A/qkMmG4}KN+P77{o$Ir=μʬ`%$ge>Q4`g0t7Ȭ<#pw3S>yd0%QyYDx~G_/Xm9Mؒ<8BW.)$yg'5=ZϚp`DͰm˟ߧ3aa.3\.B"MjsKHSW3 l!.1Rʍ;cvKJ?kA'L]^_I\Vx ѣ\zD 1g'39`4z_[()KYK%M3 Ԯ4x3 >^c^ 'AK[wofUI']r̡b22C{7dV(Q,H.s?NjCcܠŒGx3=ϐ%2>yBxhw)c'yԲBTh"Ԫ$"qœ2M%>2nXzzWh~̶p)Ml?N:ccKZjjMz3,?Jz!,)L%k;vg${"=wm稪,pq1}JW`V(MVU8[2d$DllЋ'4 U%S35\<== (SdZ&d+|r RAX3+ _&r[5\Dw}·x#aQEqc p-.e[0rd97L:tldXtgaӼ HaWD@<s p]~ײlH@˂W%/+m!n ƾ=Df=\EiTb]6)(~&9L͑Ed20˿Fࢣ'C*,VܵȅLɭPHI>^wצG[JeL[h; q7 f_[ʺFSt)+%<Cnt)ռKc-`bҦR SlzBx0l"^!1r8^a"J53h~L.֠S(#t@{)&-Z"4:E5(M\9Zbew8)]yk'#H"-9CjA߬\ꕰHd @T*N-_2!$.?jusbXڹg.xb˒YM[:5+dMTuɵ N ؘE2 =C_៷4_Mi>?%'|^s-\_?PyƸ.RӕNA{`T[k!o?ڂHA~ ^Z#66b9%[ @"=&1fY,֔yqUm/.#;%[rRQ#7 vzℛ.Ir@Ő3X"klg&̗݃)t7}]1Cr4Pjɮo)M7Ę6[nQXwԾfhLJ}c# >CBU[=>[+-ᅬ_Xe%xtToS%v 05fҟ^=Tc9f1&_TR`6CW wh迿 ơ'JLٌڷ/SyE˗\p45P a@kHZ(xڌӊئɫBҍ.d Ze)L6̬DSqHiV ZX @ŽY7#2]zdk C wKϥJ]6+O٥ѽ^ԕAG]9rTyh.jVVLԶ,qj+˼2 Z~Z&p<3!gϩ_ 8#]hX3h Vu6]lG1KH9#y?{ 5d]={/pal z}I[DbuvĐ@EzRNu~&I$HH_,2MOE9MJ8 )#O5v: \Z^ y8z{e)9睟|=G~_~Y4j#WhDQGo1j;-zQpY@ l]?~i>X}h Dhbsi|#MU &DRkfG@6NC.i9bgX |ݜ/3nû t0t8-y(iJS8T,Y 9L^W 7GAevqbB.Ịy-Ρ(f݂ ;+c/I2鿝b%sōҎ*14шFAGښK$V۟`;Bec4hiQn#dt֨Ӫm'hfG,Wk-1, čq;n»;[æ6Bo(~gop,mvEIcjP ߧ8mhOx_pD隸kodvZF_][k\zp&f{O Nl/~ĦlpXX<3!68R^h [~10Y zIe[h]Kaܒ.CB01[v6xrcu lӓZAdWA hJLDfLՌX c , _0T/^BT;f5yOV_7pjt{&gwb&9L @<6~\DS0)z %L-KhG4=&eI]Fw TncnݦVF;DS Yu&5䲥 ǐ0M$ƈQhqG`[I &Vw9MN\sZ" X{4VxJ画—g|^Xݻ;-Bf˹٨!*|s,لҥ9inm,8{8}1F+/3;o8LOz,-RXzYv0 /L8XbMsy6߹= eQS1b?f}O$[m m>>i#ɻ;t]V0՘FVP}rʐ !#ZSBaОS!q!F^ovaRNGsKkMz ;XP0;o`]aAlt*Iyž~Vq,>X$M8V,D^Kl v>i>9F {f%S#fAxyz([5mJaU lγ岭A:j0bl`.^)jΒe2p,kW}ᖯm阁cXՂ ׎ bތ}Lh/pۣztBV)o÷O!Zjb}7?}hYQ $qD2%vy&ޟ }wlj'7" zſ@6D(E6M9Ub捀eKK!I\K Vթ`uvyQkBtm] b֨ HYjf?Uoo-P q{ jL%R/~r eɐh"^jҹ*KHH1 cSp)Ic}}t'%xgHǵRKz[]݇T~%v9욯~xW+㍣-r$in6`f,oPk'hI/Ǖ9\/ ̰ɤ$QfJ0i8 Dlw{ ' Kk5%/)fU-kl;(}Hk\%3zqȾ{&W;l͜) /X&4̙넉Sǚ G)# h4Nuz4,xB--^ JnrUO^#O!vN4C> >) O*zׄ!K|B*74KE(Pk:y]NȰ5\~~˂NA]r'fD(6K=Ui@ EŖIYeSౠ,(H<cئ'#"#b΃FxYp ;f *2!!fHJY%`F3Es;@jHpdMP0]|6䆚kҵ]!CǂFiirڥ5aL6Ih b>эլz%̥D$iNUNh~9=$E]KD vί۟s1--xI1 H7gԡϘ^]+M2xpEg`+Vd+S|:8:0rP {Hp9NM7#*6u±9*f/# c/V 
|HXP`mSQ%صv$GDg8VF;5̈́ ΙQ2d}#Lc.kf5mxFpK~ʞ٥ 6~3ȟg=7&ک*}Vig&36T˰-E <66Q1XMѧ "rJqi)Bh'ky-JŁ:$8x96\PP"z#uz.Xgj) >0K ;bLV=qfi_kcd.LZEG%p23M5֤BDbd])xgN`F,(d-^bcV4(87EFĺ %9|)Mz}5_ъ 09 Q@a`$.''fT@quLNfė,47Nb hQ*h݇"R4 EYB( gyǘ0Czԯ<:{i{ W(Kg#,[ٞҞ><cG<Ķzۏ:1_G\`nW AvbčGP [>F뎳阶$ڸxzUiИH 4pv鈭rVi?q81@eìqc1%^~9drMV|?0:WeDQmؖ}5|(~iN5E]CNxFKK)]EsF rIP YbĀ^VnJ{&l%Q=Nѣ&PT)zԿ yS7JYb70ئ¬ԔR)J9ikk+Pqw"bIBbF:CuZe X'Ɣ/4k4ϟ_0!kuE:ŸHQLix%7H5BM*gNiCp3bmT\[iq8OUnCEwIxBS*l| By2?my@U]6Cȑ<`7|.i;xþ%IItbv3܍ʙ1LB8y dSG,EiD*M)(CU`_oQ?9 b z*$!\<}{)P?^WRsP5KgeIi_(L'Ii. ".0yE8V𘇎;S~ˏ#DXRa__^RUw.,тZ]K?Pk=^7DpRyQ;m2iDė_XnBS〝_p8s1GoZF5Qu;x3<-0a2`GP0,;clQѣb ]Ɓ"$%  O<.퐊1kR_E-]u2mZg@Ì p hɍQDr/hҧgp-XDԿ|L}q=*nFRmp4}GG33\Mܠ) *IQ؇ç,.05qc #Ѹ6 1F|T5C{xl Gij h394H;Ob+xI\pT7)olRSS:;;{xv2 f줢-D Cw xHdJ#-ncY!̆SXA"E<8  qH2>4(} JU*S3fqP|MJk0kO1clDs:g6[w+JAfXBSPCtKSr&vEL,x 5.$.^X~1ޕl`XL=˕Xgp5=\ 4&(kIxN2@SP.Z̈ CGlsB8y)um<oKXψgYn顦(`Yud)Q,s'B3 N{oVA|z@w y /eȤHV9cezDaD%fLk^Pl)pקψVPKF xXm 6r0a |cF=aٚM)| 6Y j >>axMÂ) *$h7 e(y%Y5NfRY oUJ:!=F[nchPHi\bG{л%v}XQriC[([nSQ*<}ztzKYh\D`Hu__¼pȧUv(%"V] gnofKFei2%< ۀ݊[o @}XCu ulMUvqIOdG눌jيۨ|"5f!:( 7Gf)N/ A¶e&8IsE&̠avDQGb=K&uhWHdgȥrxwcYE{eQ({梛h"*0].^u KeRI{WRmSl0@hTwmr_?znbK"m)ƕDvl)3&/ǯ*a+4Mm@ۿ+·dV$Z Լax cv 0u]҄DBG8]+Bhg}Ph ((lĤbUuCSo(\3(lG|,_,EiI,8zb89KW1inwTrq g_]$y NAlvW \#A'Rދ(#d=}Ln2Ng݂Ė0ƛjFǹ墹>Fb V)iXIR]!rbU 4v ]#%EJ4F46on Bk)';_>fQ(@^„2FcPp7KjZ(vC(6iHl\E pv԰8$PEnT+3fh>Eʆ8&C~~iH҂:&)H[nٮTf ]9F˖eNUj#J=4~MSNX3&VlmTsץN+Չ * 8.Uʊ 8hx.(("M*]AA<2?}E۴#,Jπ8f=_4oӌAUrnfBϭYL%mG,lɽ5 8·}5;CbvYQWJf8d\1:.BaaQI-)5NLxEhISEA?m,X\3qΓFQiq>CрʪƍӓؿQ0:-WGԋ#q9U՛2>]\Rɰ\p_ܠEJY^R e`>mS]= ur(%BHYOt:jn6z LlxEASe,k]MN:]I, ňhFBZ]GX"kʛ;FT1o3W%Rz5؍D ؈ݰp7PP;fQ`fPy6~XriVR({b Vq} xo9sTrTge!VfS;IBWb$w%Qhd%1`)إAVʄ+Ѻl.džlO!)hZOߧ9Oj:e($xiVKʿ@WYnI8Xa:|*^ 7.;Ѹ7"tHPʔXl䞳1ĤnL,Lg4J@4,Š-RAMP6չɼ%GIGOnZj8XZOdS߳_t" \io8/fˈaLO֜GZg\dt^œ*pxjeVHU]$} ֆIv'5a>I,,OqI_"2qPyW8VM[zpևܜ%n.^YºhRwW-%CtoM5ҵd["kW@\-?o~cXfQzdGc2#mq?'2s TBA(tҙn7 #0r^{";Yn~X0X ͕]uaJ+-Wp)a dRК^h1Z4&GW!HD.GɁ&oJŌuMҼ Wj`zjm}7„na_PGK O4ӳCRel9*D3iEՑ2C @gݜ7pO~팔l@DT41MBflrԛ rSr:DMS[ 
,k"8XhISh.ObfAson~K,̖qe'#S1lQQ8R@-STKO#cSih2Ғ{{5 ׈-oc3G P6ض[qX%ZKO 8p3NK+t4FhْOGqJMrK]A$q賝.;:W[f~N32bC%A97(' ?֧szM03Z]b~i֎F^Z7_XU1i7IA/Q$A/j7 xjrVUV=6x\Q؅cjc}E8?=ěLjƶ a _>F1?FP2ԧ+rU}lh6 ܔ{K2aMETh,MJBSUjTV{՗O:HtHU F GEQ;k@Y!G/ $Y&K"\#1tc^AT !f`.sAh8Ff eVr%՛7ح tD8:-،l}|KЦ" S:G0 |/֮5MVb䷠tHC^&W^db:'sI))ܐx{k} P0N0'H\L\m5L_{惰 ;RWr2<$w3 μFR0q̙ڝr @0qP ay⭣K., dP;+ ԺrmUd&(=XѪR0SpybUHTmiգ05sMp S}a t19BH`dBJX˯?M.`3WٸɣpB6x0l6WH9[ZsP TK(N3 E69~@p`&PwZ,k/ՊhUX,0j> o|I[b6 dvl\K-Вy\9-+y.[su`p׫Wb_| F-,Э0z"l3fǸAm5 0d`H3}Mu3 !ZoYUDQqK#.b+S2ZC#psO>Q L`z됫s3ψ-=P}y IDAT?ǖ_[`˧-FF8bG2dk+PK>2)j빉eYG.Z;T^P|x5";C'Jf~Ělpϵ*/F87[o[Iak-w^4!,=\m6]^~sZHgX_ZP_B f>:+^ vۘv`,1#`aR`h2L"66b%D ;yEq1'#2$WJGu5'Ϡ( @.NZyrT'_߼wNt&A $r)Y-5XǨ#x }.+K+dt ){ -%#AK+k"j?<}QC< T*kD^)>lՙ{[/mdDbMD^фTƹ!=]QC(ΞC7KX5ҡ/[w%6z6G̞Z8p.LI%ū0:vIWV t?B0hj媹߀aN<"^ jE7mz׈Sܛ?L -ʈ텇(b+Kt|?}LK.`wN-EFI6WI;pnf^Je|]7kU+k9: TŐtwl=:YtGb'SXh\ ŅRdx2WB==W@.`|N VYnO8F:LЖT|@-lRڊp3qQX..`ٳ{fnv=2Pڡ˖\P|ae4qFņDqռ D!Y%M b!ahRZ}zSBo&uHhLnA=dE|KBqiwhhip9 &NV]_Gn!Lf)KܨYYH;&(6tSpfNY" Ԍ#ܔyb>ѐ#2"qী$[nMWm)9$MҒ@΀Yů)69!TQ 7fu 'OKB[kH2[ޜ:ZJe2&CXߐX $DZ T^:Wy/Tp5VŶ  M !Y1PΈVaxDM@6 C ѤiaVʢ#~Ɛ0= ҈Z)1jHuQ7kUX`+=͡Ajzdk g\d+-J U I,nWzhkWfvLa H|;\g7ƯW 3vkDG䥼L8p#rm['s 1E7^3:w*&RbY 9y0]|uEp[ovu.Ѻ{@ M&}riFijL7' 4_Kb5nOxuhLpvM|V!ꋊtNN@`P9| bXFX8E`9?.1>Y\ *b*9>e,Cmi02ߡ;hƊ*fp@y t Cn(a<(&H'I\C? EnimnҥrMcl#]3Ma!  3f V $XMfƴ?gowH<\>P +Eh&&L?9'ia g(xLΐ$`mk5eǞweݻ!m;BP՜cSra+O1'w{>G5'zu"u6&HlQAGC' %b,65.Rq/{3 b6mBr+(tz^ :Rnfw!AVb8LHdS.ؐzaŶY=(hxsauݟb~;|Fut)tRG(U|'R>9bVߡ}JR VR|!z* N% !MHo ߻-|NT%5T1I<Rp1)/h!ţ7oH v ~ RBTqB@~ZW-vK:2>ft6Pu'|Q|Τ%%vڷ:Iv8D Sc1SRԄb|nB?iQnЯz2Z!Ru MpR,&s.^=Qr sC-6oOn}|c0ݛto39uۨ@fT&C~[B5(%V zFSB6}}lTf˵J/gdۏE@ M,4McYpXaA_Mb@)G^el>-l>h% ?1?yaYFy6y(BEN\:dY% -;D)}ZeGQPh\Djvm):Ԑ]1}RE1^M tP% "m 6Eg`2d? 
IkQEEꟽ{}lo Z/5849P4 Q/h)MO izѥXUE)<2Bؕ8׾mWm&`1lefx̸֥{cVƐzh*d(kSNcf/"HmR~En:cAWeX0q)R,B8;.)je0i}(, ɸZ ѫ7hU.L2d3Cmp0u-s1(/_ C!h* V,Ez" ț_6Z1l?H]O^ͼ@yըx̿[.KS!NIolGkT%)5ŠCnE 2ېZw-\Lq9TVY ^O=a2%sxu_~uJpg =Ӯ٭k_m%fLK&usܙ!h.ven԰5hK]Ǫrn&%9;`˱^m$1ҦbDkFmCRįUMH1 1W )#8Y$؈G8S6DQPjN,A &c4/% ލQ-kBFX|u_=GD,jۑe{3M{uRmRa.'u [VYc֒t׎069B4s(Ts˟?-*+4'H`{@$PZwʖ$ ml;l췗# mqʮuYi[QF i:%Sݒ͟T鷟+?#&4X_T@*Q *R2R" %5N.qF6W3SL+\. o F^XBT{ ܺCtF֮ۊtCGUY2iEH^Xr&?롚P:Kym1F`q!K0sSX>k}}$~7m830!&,qU,9XMF,dPIP08ԤbK=E&,2˩^kc7o]7š!]ZK9ZI)1+.jP|7j˥Ӫl+PȔD"Ks_JT*eSʼnRcFgj*BLԨΑLpǘ)0clѕ^qC?`̘mё(X VDDa8`U6Jg4T˘p@WW?^*1Q(= dmc|R?^9rpB`7CcüU9gosʮK~mFGkOLK4?_UjX:t)y,\W-,ŜRjxC-~\o*H1tqDCALzDZ([n'cvs v鮏܀91dAeQ?HI8gk@>qH1nDwoxZB5+>(%n Զ򒍯#P39;I5Id,ŝ*P RBa3B7ۘy76kK7m =&xFH]z@hN50}\jYLOvwیď0Qi+2פD_pam':T T <|UĔBhJ&`:#4XFoJN!?Cg+lln`/q#6f!&N0r5xLwHRҽ5Bӏ;QzO < y.-V Ts>%絺wY*4")%le'.>3`O Yɿ/[0;ue9[ 3PTD2my͌k'%d7%S/ꄂ cF:$X5QKq8*f52J ozxvBJJZw.R3BR-J? :3*uu`Ș񽑬GrTvWK[zrԑ1=<]2Kp[d@/,_5, b6IP2iU2`S$siP8"=;TE7r=ǭ ~tk5Ԉ'xEKX pɘkM>s}l* u¨8:e벱 )!M+lϊ?W/,J}\r<x8? ոJ"j)ʫiƺfX-b{,*qeapajZjHABu)"; 0?ﭺ _,X(d2"̎K_>ip~Xc+FĥTA4T1WS|5IU0o% d/NN6i/cxR5hg{[覍Q%e =eA$ĒS0 "CqZpp}Ik.E`,o^|gB)X#SoW n$Xԧ"GPŲQQ񚊂7YgJwE` AK[1dX鄼J8 nhs7Kgaʈ2. 
eQNru(uJU1j>Nrpc7sagGlH4\Où)KJVV2D8%a\?pԧpMFBzc sPaՑ̏N`2*FMVIUE|NHz}dHl`+OZcKoøud>UH#-fT'OPf+>ZBeq4R®t mE"WA{Xih 制#rߢ`WpXH-srpTϑuzi$Ľx1Wm.꿛w 85@7_T|w>diZ?,ʪ A3}3c|kmS3P aHhsz8M eѰ WMT1_e^ђ IDAT8pLJiбI9IS#"AK[[;wK%b`>F$z)a/^cUA= iCׄ Y;gfC)vßS舩|\~Y:Q2i N7qscnd7'4ɂ5ZuiۛLdc}[$ ˤjv]vy1ofIq }]y'A= 1_!dQPjo4`{e:JUhjR|9żGA*]  w0 ӋOmP-fp@V;j%b+gMFba{yNt 6wq{|I9=a k_)g(,;[3*՜,6d<[3R|]-ܼgjM3>dNYvuK5y.Im-`4BxWܗf%0$fsDWkqTUme_pCq|4/9wsBB|TceT{l}#h-m!!mGNy\BH09?Mlവ`LG(`?zVc+7IwE:[8۪C2sS/~!@fR\+)_`x0[(ߡ#R꜋#}⍒g%8f<'R>J[c+wuBJQaر0xD X^@poe}Sys*0 |0NHMFE L^#(\a`iR,1ke'LbKH#H Ӕww)0K!~i9f^z D e1!0F g0 0#1d S~ݤ*[uޔ[j~9<RV辳O{WVfx3Dži$SUe1XBm BMQ]~~Z՛ zo%eV纺V:}ϠůC4FkOM<}R"HQkEr<yҴIf_g &|,0+`:_N:#Y]d\a+he)=~xḺGtF{ nb;wplƆ0bw lBR:q+|XĻ lz&CQ oo!\xto288ID *{CkT2?@P~;dB/{kkx#k`<~5۸?Ň CNK$]_BWLqw!M{y ߃zbجV:P;~.?PN;ppUm|]R4@"ҪbU9L0 yt$rbio2hu@H-YI{TU_%8'hUy]󑗝]B!u(?ߵϹ7_lJݜ+&dJF\ uQ#U9<*]|e{)Ly9/^xjmkF!sXTf㈭M k+UJ27kdtC d) T9;*1aґBeI,WjbW%sl;z_i@({i]D+kbȬ*$'[}K6(mOn=4GZۄ ( E`V8k Wo ᦘ&OnQ~*x̒'у {y]j le ZX1v$`JKLuB1Q^~5R.L@ȼхamCc(#uՇY7vI|+(HpVP%Q[?12cO[c%i֫u~h~nRyÖhͤ6X&WǨ[|ˆ.b~ž9 \3IeO1=V$3r.-]^e~$I4gyYYUY]==c1c 'IĂ/$bNOOWuOWgVVޙq{*|P3s2ۀǡn'|x [5\pqfFP@=UfC Nf:]*'ޠ^oP_1"YHZEZQG.SJPi?4]# S[ Tk*쀫߃/?%]`Mf$5 >|Ou˩rRz@nXd<ȲAb(>&A0|5X_1MZ<*9:WԂ &A}҉t{ҕn>i55jИ/0;Q0&$8h ƻoFr˿ ak,̈́hG2,7w͊Fa-3M{hFHo9b5JZ1) 2?ńt`3kYecHl|팅T$EDpa*8OO 5-c̓(@> 5!A1Ze#w(q94CMnݡsnF,fJk.J}ΠmI.=XB̫Oٕsx. 
,ljB/,:SS5uH9JѸh/ a0~`Hc{u-ZM cmpAjuҕt蔥K#w7Kly]qgEP%R9.@J%.ʧXw d+H'jS8"F{3gW/%sV`f\Ba{=~:a0h+a70HRtLf`(81gMc(*e^+Uix բnZm,q0PXD61134-iyg nmVY\4]NLjEXk]v`O&8xlXM0!%'ÈA&0 f@2Fugs2z|NF%k\3iN{H_ ۱x ̙k6f5RGa$>c[yGijj5Ym!;s|1{^t*xH,QIY{&v8mzvD&q n A\c;@ *!*m\2 Lsı`ܳ:5O}K~s}XXt#`g@ -eM!qCW~7dCĶA MKni9'kq$p*#1=| ij3$hB][ WAl_6wG8rj [1%0!Fߥq}/ LņKH$MKkityam^S0IעPNA i#OJF1B%eb8"]Dж?o~mUm”wtD2HcڬӘ5 H@3YMI4i *aIK-b@g1NCk\@KK-/ət20xb:F˘Lg {Ln1"[zݹd3^\1Ko<ڗIgϹbЧx?z3=%sƄ>qL8Exή\`~D2İCh@ON0IV F^zE_4 fᮼW#h-1Y)R(PNp ϒ&я MکauCItqHEF\vYډ cAEBcӀe5ĺZSʊZ*~(Xp!Qj]/qUⵏ0it+A\@o|抄Hz ȷ./SxğP݆Q@3@e ,,URD:;@#Č ^3 &(և7xÍpF)A]JxmBnZh :ث{SURoGJ2HHb\UjU$GE |[8S9~f - M.HQ 8;z 3xt5 C48'c5S]p=-{]4tyo1׿eS!Քw]LXxrl ~)<c4mXJr5csW<5]M.bW$Ff9W6 d*YJ NSM`l3 uO:(y~} AGk = 2qŧAo/Q76 _x%-jjAG[8gx˭8Orz5BSւT<'Ggg3w#/MɂOV}1ٵvCJF% rARAR Ц jg+h!ZTS?wׄ:]P<"K$@3)g]HIPƷ׎\5ԍ, ?b\Fֿݪ<RlaHzMD. ScFw Ԗ{jdw 2aϽQuftiHc$ o:ÜdU \`L wF0V(;i7j|jg[g8,C$! \R(DA~WXΎ6ym̍e9*lA05Hr#N}sMmUuFEEET'8V"YFJ f,ZkY en G<9D:6fMQ37tAWe1xj<sB#8!#!'Aq(Z%.3bZΛ[x.ܢϵC>KE<9NA)%Oo9=O8|b:-ƨ;ECS<Wa^w^#WC>34;Y##1n !Q*$l"$kb4ժK$rNLSNN I*SM,?SqpQg?>'Ig؎ǵYKBc>ORLQ):}^(VS ԝ'hK@xꦠE[LxM BISt=5a (3[y3@*{;H\yxÍ d6z/?%? 
ʔYlhZcT }ž-?@0td~^x>@ۦEF;PԽ7Q|x1Ÿ oa 56 O @MHU"V9jUSg *t,¦Y粽Dp)yg,u& c| æs%̀>c|*Q|]Lj>LO)E3_j{{lA`f Rk !㦞)I'}1r?DX>E><݇SRW䋚t>'?~ T>5ZI*Gr{9RS=COZONV!'Qdz WYKnٿcHF\\(1gfZw%٠E]m U$emkMP$BAS=SS6zy - @)bkcɇ  9䠎FVͪgz!M^ ԕrL2TcsZab dbGP0 tYP;F7D9tDߎ*؎=ۋ;)Ykta) UcGkr7?_K8< : 4^@[)*VbRt=G0&UE) {i ~#ť5*DƦN~W0"c4ao*~cq"4piC Ei~ϰ?dο"#wduyƟfؿ?f^W\vWHCr/`2˔ 3Agiww1_`Α2TS:nc|>apȚ:Ky'}dgS ƌ mbCƎ՟PQ0Glr,iVy?oQjΪC6d5< IDATC凃<#,M=xv?.X b73{(z 17Auyߡ)T5'S .|@R˧?|;;%'O'ջ*ӽ(iY%X Ìsvp̸!߳7_Da'9SMEACA%|> ܹrJE&89: Ro!k"}N'_j pnG]jJ, 9o:%ˢDv;YK)r?)/M~SKqpvym~,hsSM`L5< a pToJp*>WWkjGHv\(!Jr iLf# ΝG%YzL8/69Ji“&cj;>BÛ>txņf~B;aqtTg/э+loFai6`?J\~$V.ظC,lazB30 ncYI]Mq+8|x7]7充%3p<)Meq(TyEZËǬ]6`|'i)씰( Y?xN(JB4#񖫲nC?_Ϳ|1ma˭3LԾ Ox02Ih24I$QaíOsʺt+oWa^c]cnW#mz=ޢ'HGS3\ƭ !/O^zVzFԖ+effƒp[$~7KxW#9Tz$e{;YSvp*;ڐ Nq/7!Yfi6/[ʂWҲQ>)e[Y"ޥ"%hcXX_$F' 53vwX~[䔽Å$FFy0y9B ;1bf?~>^E-pRcM4O㧖Ͽ0_k9I)HWY%\:bZն| SeM0ol.6;>neΓ&mk!)wq$'+P8^ _{u"ZF-gO !;!K౦B_}FxY !\;Gϓ%|iF8[mUܠzRx8d_@˜7Z7ágy1G1%BBQ9zRq_cBWF.>Jh:-PY qpBiZ@ p€`G4kFʝ>kO?>'`%e\N~?{~碽 UzCOS>KR /g&VS<ı!h{,tNbOHNm\U(tk3j.n~W;.%G|(qY{WWCV$bB_q)w,ܜ౒`t5#lmmap^ Xu黯9nSnHX7Qm+j_R쓤kW1Y$9I6@R{WC(-RT/_#;c㜥(2dBR_#: ^7Rn6꿡N [5*%6R٣# >&Nդ<90|!z^`yv@h$x f4^ԭhۏxe+| #,[ev5HV!jG/ax.m*4e N~;2 lbz]!0-RB NZ0.mEv_h4 $ V젆 C|Y ELD%m:?J:*0[ص+8FÊxVn>Ad@u ##[  FA7]n% W_i >/O J|:ܟ< 6%59l*f ǫθ J~942*vcZ4P/ SAke~^KN{L 1/αCE1/_AKs\0Aةnj!&1SQcXyg6vn8uC« 'p1]3 C8DA%gLA$" !fΌ!}lKݲ#`X/F;gX蔞I!k?+M԰VۻçS/MJmDOd)`B=" M5;.c(z؎8rYVk;H~q4JX"kEKQgZ#֜SW3yFgU~3LOVȎTFP!˗sK2 OH"hlŀZwE`޹qBSvpy$^IoX* ?KG5Ũ`q:Wz\]K=Qlj:[7?L^V]6Ya[.we L,;X(H8pqrR럠6T&>Onviwu@ҫ-fA2ĄLuB(A]Ni4g(H7,|0i"Ő lƶOՐ%?eQ/k>?\/fhDVݢèZ&XNS96)}PATR5/k\e=T|MO a+):2 `#r[K^P'hI$,~F,=$/S.>oW\ a/I6Hvk .G@ IZ֕/%/nN*3hLj Ř?pپc} ΧhYƧ\x*Jn\0*/fʓݿa؈ &|餙Q6i j{#]3W}G$|({O:MXh+wz'tAMWv&*l!IC:9T7%ioܔa.M.l>woE]tif~J,!ؘD;)#zej447K ι+[:!4K+z#[oc1>ؑe1_]<eh8 w`\ A@݈]%]CRQ40g e˱]R8M7fa\FƊ qz:#Grյ5E)G! 
!#hZ/ոIEYn0є\Df = =a tsXj(#qZBeht`vZuZVS<+nOd$#p|dF:@*$I 8vJYBv4F܆ +M_mjb(ժ8{̨%׈4h4)vpZ CfՂ1HdE$BW3ӊ=*cMFUV U|:r#5W&k @R[ |6'(SFqffAٖ0i_mOER03 \Ɔ.^AER03uvJ+n%7I]_Y6Pdxs+Eҟ{WϪNy^p cUq) fk Ѩ#K9 \f[-X$o{[S }ْ! kDuG9OS\A8^CiQ :;$#hbrWl/_&3կILj>a%ˏ_m"`Y;ϱ٨}/P݂kKiJ"o,~/! +AETs)DJLI0%JEkL#d#4BFQ˜BJ|ee?N%kW}l"VtF8~RScq|:@4YGm/P]GdǾ+T{  "Flq^ku5l0󏵣0kU >p# gĄFrnb]$I1bs<OmD %ϱ1R`i,?yyi-mĬc'0;y1^m üDRĔ1G;' n='83fb+~U=c/?gGw׍o ԦLNIH$\[c%6CsobS@xKn6^D לr{sX#9ϡ=LtBi; wl^ox8R,2nw ,{#t_R_X1M=U8ZQw8QΙH%h$0 _OysJm6l 1Txi=߱79tk,$0DqBz9}{rA_5ӊ{3gǤs}#um/\ SHmrI)h Q.2&ǸI&3Jf1ѮDɈ+%<X0^' Y̱k0RvLkcZ!\ECP<*Ϡ jN945/Sꬠ -=)0[gC!.:pi9 zTnc Rt[F%pt.y%6?>Eh]@F i|*~5:f:rYk&XWP|um:7xAHz H8[C]P(a!{hH6,.d6Ʋn-پ'L wDS}Q թ\L05y D rvZβȉ@C/h&l)I/1L${lyCZAN)( .˅hXiP ~L(P` $NVXR))Q4aGde5 QQPtԥ,uSLϨ8 gO_ B`gT8gaOjD3X,@WZq80#XG:׮ywOBNq ԧϢ&ڰ5$i5j"=f!e̱ 07R K1sH<ج>#s+$yM4^`)=Zy|vd#Ej"Dyxf( ce2#ŅK66M]9nAL&d>tϺybmkek(B8= \chU_71d`_Z&-, $B$81Gt0951\iT3&t"buhN1RtPb,դ5AϚoȺmPaJT*䥷ɽ(4NWFLSş#mCǔdb M aQ=zE}\-Ez8"]Q#4LYf!M@bVP:70T{%'C _"\ s8;gSLލFC4d?3PMnx?G'Sp qJ8yisbHa;ݠp8*J^Wݠ_h1.rmQ9bA~}H/>fQ;&[-*%sA4|ンlSoScA;b1(b>;n36 o[OY8ĠsF(?7i7k09G,zsOVH\U:?}?bʔpL_0 VmXF%#=b^MOԣv &bECȘXqivQ0Ry< teI ק'>I@T00<٥lkFK\~)CuZz5 N݌} E{ƍ.mULd"߹ 32wMKnq&|3%%ٺPϠ6&vo[5T=k)ﺢ_`]>L̟5Qr딶5IpV+D@5,8zXqy {psnHGrQz%{~Lk^&Ԉt1??{)ǜJ&IHh3&X借= 5cM zEO{lu1C9`(W#&zu{l _sƔrL0+5g2lM+UIW+Θq e]9SoٕsKi9 􀑌M͵EV oߠ#6g$臔^)mRns% zz)loxA o8Oy3ʼaVR[q.\R"wl[̘kRn_O2t!k2 ?2[?s)&4A=N-SK;Qi¯~e+w,bYR9^{L2,D(ΰ !l6}yM(,gG|BDF3~CpL,ɟ']2ꩁC`J'PE7lc-gX,E$`)RREUh}B 99BH!d+rql:C8EHA!z_@S>?E.pMFN IDAT+1`^ECQ㫚?@x7wY%<CGdb멮N @innG xW\.b[ ]q2˨rpRTCǺEez>Ík( Bt?sTdL9Υ$m:(u1mΉ,U ӆ,6JVvb)5Ҁ|Z6oĿ0|2M~e PNjso I&t#ðM߼~i̱Zc DÈP$41 b`M;BEc-Bup|6M 4٢7ig`hceCqS =MzR }5mբ5ʋ7nbꐇ; 'S9"%-h>~[ѶOXJ|䘕?Z-) CeQ:%]]5$KSbr'T?`v|hLa0)15D  5\.uL#i?hT8z ʤA+5k!եDW04&YZ2k>sLz wm>4LVޓpOek=Q\loDUh4% ١žrS^9f}AiQ^_ߤ5YHsȱdM GZǀA3 8' ! 
Æ?ɛqrg'xk~́-[ؔFZE;>C6Z"yNmT9放༤iRP-ِ7o㧿Sz1=f|ȝT3)'6džM,^7Ƴ!oϨdXkoC^r8nt<+>3 l#pq†"E\yuͼכw(W 89-t"[:G< oY]DV/{[P~Mhm+ Z4I̵k|tOIhDЂ_"V*7a}3cژ.jO;ʬ(uo=#*  mr7_F6`OT`9F@_"49H7vbm$lƅ5Y5Xju .|Pb4cnD9(LWoeos]T!rȻ{:-3k ';jƟQ|9#_L{}i%,: ?4*j $Yg.u+Ƀ㺹XzK-- A?%ۃ9mK0ɕ_,wX;d! ğ}ܺpvڙl 3S~v? JԋvK8|XsqM^͗m]짭#sFk4twoQo7=A1'G u=&4J e񜺌V7tO% *m. [ݤLhC#d"Q88…u$EV%AYnl L:YzvB6E0X.a34m&GP ?ǃKm;giWX'A9 H4ȥ c@6T( B!0[˥[9 AE^qS+žv0XwX:] h2A>aGR.XQ6/oxfLycch! =!Fg:(׍`}uB)8oX3ckgQxXu#z˄;.,Wb0AȰe " f8 FcMb3G,ÐuDٲÐ`)D]S|pޱf()1Q?XcD.&~%8v 3ї 9])2bv!5F$e;7-W. Tb ]+R'dPs=\04yGO%Hye5Nm {}}Zs R:%T^xjCGI&8 /'%,=hwd#&I?e—hњB5x{05r dP5`z#D]"v l|'*c#紏Y:0Z,ڒ0 `Ӛ$ם;I|x]ms:-Ҥsl-{NqI˗ x5t^E1RSJa=U_d"D$Mb*ڊ` ^\3YRpP,Lbi㵗7j;J.7n[^͕Qh Cԟ<… sH#AU桤'CrN=Kjxy 2, xlMkPDTK*WMMc=5 qXI>]IUT"Aț>Gyn@P TTnN0UGdxC@mCJ0d`iYbUQlNm=4(f-MFYi5lT0&$ Gx>" ]0bр NMzӿ30S2[ U 6$Q "`k•cOM-پCihN.˗-Tސ=@I^jY•s?`^ȋ &BѤ&_2qyזy]RIXӣxJ7_L\F֙BxE&sA 'XJ b)*z jFzP w9Rek,B)G {BNЂ@ֵt PQ>*P59&6Y4 h>VI r1^1'aIO)Xjc]}X/3*}`be;ugJca|]]?4 OtӨD'5I_Lw ^5 IUcq3Ctfѹ'BoSǫ.hdǀi_)7XM?L^$bMޞ-Q> ti 0q)%-zL (29Lu,GznE.W4dl`CK]Zb gZ:xY%5JHMܲwu /5r[UT<>9xj@M7h|ıپanB:BpظlvY%NU(EX E=$W+{hыUp. !#sI*dӼ-v=Œ)E&XK{Um 8 C1A N@$xf8>A5ry&e5B<$Q0yh-^8 0&=op:'@Ot= Gau$!]oEpjY&9/_vRIQPR 9 AуW.Dd2ќ#aj9|pmh V'u71Qsi??`3. 
/9W+aP\gC-*]EjQynxʄ8%q8( BNkzT)s\!I[Ab70̀i8B8(/bǜT9Řި!G4L2 cFa|>Z2!1KIR0&iY҉B2;fk*JrFkb@ c2桤1u' cmyQ@KHBq\)qpN98C)Z7Sk_KlK6!KcM}(Pt.|,k<,Z#>jo|[5֠0O,~djBʠDRn"lQ` 2&Log ͒`_ɽ)w(Wѧt5c9'eǟpE+bJhܷUM]zw`rmڴy1nBIRfÒ{_VR%6~!ry_I!4'au͛)LQ|_{&CMG3U|# gNX0gsB\'h,gű\ƆSKv߂ e댂zy tF'+ B,%aJX<e0kiMEBI]?́-篕=K<;]xloџ>€L!O>ީC &EiDY6m K"MBuPp䔵1Tu@?R#(O.u`5WlYU:O2POF50Ӿ(:sF>R@ZsflpHɽ]yWUÉs3Ƭs?d8o t]dM 68fU:'41ː#T[Sy-IsH rTvؕq``v..o8޽i`,2ky2^l1\a<Sp헋4Kg68M5.ng}t pk/\wdM!jlMw|}~3}č6 Z\av6OyXiEі`d{31Y88VBv&7=Açc{  'n5|ⲇ,@Tu # HXL0M6{R6gdd8K2G_oקQYF2/7x?89s3#cv2 (O0`@z|y \܄7^qR^ )8C~]` \h|9}STx?~XOW31u'wN/ IDAT/8~1^bB|m5O-?لn\lڦ˪_W5l T*fH>^3ܯiTxo BhI'TJu{36*O >L/_S+EEQWl1[ N ꧌&L8-v}q%#;@Ckm VDžc&GrXߤlU|'f0/qX)'k"l`oBq/ot4xO){"$h 3N.TVCPiN}C&Ot^h;~Xbov‘?6g5~ZBm34xU2r:/Ru)rNW9s(}r]g;,ǘPa uufw] Kʕ0 $,~!GoacVl(e @'WJW2fKySXňAe15][epc%j:Kvۨ%viTAJI 󺏗un}-qM?jxi ]-Z{2ѶT-/=i:ÃWx[ H&'*sP>njdLB\9p2-X`0ldeeIrGaJc6.{ Wz| rWIacAq^}`cDVXfĚxFmۙDZ>{P`/|+W39cq (h¬A%MDup㒝ݚ&xU2[Cܻ|N\d4)m=Js pZj$lO0e8d7'b߅E}NPEd V(dQ9R[}!N<3 SۥS_א3MKhO1Xْm$Sz'G< pRZ"嚶Bo#FلBqǃk2EcU( Qbgp7nD]#<Ҁ3 pi;ӊ =Ť0ً >1 L!K+XS!uΧRƎ !U(wA 8됱 fBS +ζ󇈑n/hez2Pc,\5UA8`Jtc 54ƩƜA הZ;j͘)78m <HGj2Dz׌C/" a@cQXh9\d,zB_cZzI6aS-U\q>` CJ7v3^nɊ1 .66 Sh{} 7HF/&9gv~jQL'ؚ7s |asyOO1,")VaY#wl^ cɁ*F{'_L\VyMΡJR}v] yBt^4_|t BLARPHA86lb:N2MI5Vj5PfV =xO2F=j^dYHí{}0R% f(ACM'5]vrZY"~@ `5œ\ 9F97^'Х+Pw" ݑ01 *uDRc #`M,CL"w7nӺ` bzPW0AL!MjhJ )F.^sT cIJ xO }\F .F7)I=7Pztd+B2T48.pVhLT0F,/46l}7鹗_ ׶GF0厯)!N-E͸%S+t֗!g@&}l]K]{.У"ErbV+"ТCWX`M!0X&fI[%acj{/<}ϸlD1bfatjNL1ky!qyCS觭ш/!$< 2GRMk7V?-Q3׮K9f;Ä PU3 We2BY$d-@F 'F\ބmD" Q"4 f9:C UNe;)=V,dgՄ"YϷmN35nN-Ɉr#[ Q{!CgK}' }&SHHT1) ,n*iԓ:܏]nrQϫb&]ѱE+|=&O iB0#]/T)t_^.ܼZ4&T8gH4[*Pp: E$t蠋H UGYBFа?lYy 0ws|'*ƤRM/~v GWpx RCOg ~칭bΨJtώ %<\ i9EoAQ8w|9}N2>|N)UU1|_I"Te}?7D>*:\XDmI}I9|9p9ߺy/j_w f뼈h+76n@{~?OZ{>mwa>7V_^]{~&4$I6M}|Xk5 + +8_=6"H9oۘ +oK5yi5k:IdI8˲}P}VXa^>C_E5h]_ďw"<Ͽ {+^aVXH8v:B^cczᅱ'oE½^Ox sY + /4Ze֣Wq4:ϟ>EQhV_%̚f Un}VXa%hl+. 
ynl6{[}k|- Cd23]E+ +@_^&iesiUU:Lye|H*Jaw}VXaVgE.E!pѭA#9=99$ {=jU:zVXa lwx[Bn#Նϗc,ӣgy|$Q}- cNJG +%ߐzE#ZAVUU,rv8s损ǫv3;Uhkj5ױ + ;q>CˑKM"*Ӫ(twwǯߺEiqZ/X/{p + neS12^C|y?Eŋ[[ny 2_~__۷`<kYP:{|>pض#i]ךyZ!n{oC6mUmo&`>VUi}ؓ +‹3}ܺXϝY5Xnk>7"p{礨["ֲ,5˲F95d2`@jѝ>Xwvv{޽{/w!Wa4) PgY f3V,ScLhdYBw sMd"j9cr rU5KH&"NU8 !X `#")t;VXae_u%+F,UhTc

kegzL&9>W\90D铓\X;9Y*v٤Բ$"ɩ»YNMѯY19# +8x@ H)Rb#"!haO?!E>Z) jgYD$xC]ڦ{p8pp8Խ=mUU|>W\z=i&!Z!s&ml!xҗZZ~-qc1]\#\8}4I?bG+ + hZr4D݂ ˜Bh CocR󉗼vDF!AUCa> ι.m'''UU_vM޽ <a^w&%m9hHLrnߖ|[f\8;)m =YPkVXaϐ0 na?.@"bbԺD`'H8Шj[> !tQpXTt]g>׵'Wp[w5~2[NSZR$lTpzG/""iFsMcۦl\f!jku.͊WXa0rFXz%')YL$>W &q5!F)^nUR\!9f`0Hyww ^u<d2>-j‰b;),42M=TCBKjZ#!1qVYR= If3:`k_3,ы_x%ZaVxa,Zdc+6RTS:HI$)u"6%ݴJ( /"=)˲lu]3z:*`HbG?m]xwwWn߾-ryܔɲL&39z=igujCB>nlo[bkUZk#t(3HUC¬.s" +8篾!ڴ/t96 Z%Hg~E} ϧ\#"~>w4Mh& Pu*lnnrJxnnnݻwyя~KQ7FH+BH,Da!Hm(/|m+TntGlVAIAl,x~:z.}VXa=IG4,]єAl(Y"V m #(1&KE&nor2[tjU@<<9== u]K !s*,{{FBm4#D-D]FƘ.;m. d:Ⱥ7;\Zn›bE0,v%"-CSؓL;0"; 0Cה+ISR*&ᤁ7t``Vò튶![ֺ1hc|4|>l<f3Ԥ+N*m]{Zϟ(EQxJZ8Z(,CrrKܼRJ,u,E ;۳ 8Z2.i/ [<]c†<:Z7ləit:uZh4J) ¯V$/NW||~Ӡ+h0l6繟N\vUU( psTy*H6SN~;sJ)EAq?_z% |1+g!$auRʻ/o6i!GnزDdY2 Yk}Q.4cbS ^8}=r Ӱ6wϢuS3_&<}X7ڍF#oq$s\.ݝRt|⩻^E;E@*Sw\&@[nb~ seqb9I-1FC9ZPy+eYt'`JRe+rqSXYv"dAO*ަiWU?֚vaiey,Q mDMֺ,˼1Ƈf`rJ)MS_e~_7Oiv_i".˒=??i6Q1UUF5MCEQ~WEQq$4@Ip t%|,r+͍l6s'`24C:Ki8Nħ MUUb Hq]qI%z5. XNNCVH8:Ia2,\>C1_X8-`w:b+b༌y82p2ʙh} T}X/ݟe˰tc/|c˗yÓMA 2p2ʙKZA%,òeX!K{.o(az2cL*dS>ǐAE_HEˤI;4 } WH8/ c1n!a M~.Mx/p+%e 2p23$s$nA2$sl.px/C BNIK\+rAߧ/ЗH2$۔sKM,20𲈙̼$k8%qADᒵ//o ΃]IR>[ [A\iBk ^)o*׼~-  Tԛʗ)>z}g'^ 1 )?.x*dIENDB`mumax3-3.10/doc/static/web2.png000066400000000000000000002031471371432437400162670ustar00rootroot00000000000000PNG  IHDR.)mDbKGD pHYs  tIME   @F IDATx}y`{]$7$!D E5,"TQ6\ӺT+]p VXFEE-( HȞegsl!q2w#dj,FWy5bakO 0V @ ķPt=MjYƔ SZݺb:L-QJ\D!"_<SCt'Yj x+7+ǔͿVYQ@e,>Ro@mu2Q?G#?v|[V)%ƽh\jLI,}MX!bSoə'}/|RF5DEYa0`@#ŝ0`G#t?d #NJ%lAmuvU 4F$)'6yXd$PL_>*8'H4,Z3u"]?G#SMUQ}iIY Th e Kfv87O$ӓ65kK2]k)ǘzQJfzf,Z¯ Y<ՄXv 2 ![PP |O$BG#ciDN03ܺe[/%ik).>&7;„3ڦʯhyi1s7-F`mUjV +dq_w,b&( l1ۦO<`bĀ$CP&Ll@(0@"WΖ0zrw+MQ*-G# AnVHmE0:Z'ygf =0iDnVZMCv\*@JU4wnvZe؋1J=iG0Dx@A dƃ,҃bb~5 3>D,~^BI1ƨ Θz1T_DsJ0@WB#a09gIIpy#q!( 0j@(0>>g?G#o?0!rh4DLy0(0`5MMt7W4 }5ڦHvZRM~/d uI1.B{*q~1^1G)%,L^)kӥB2A˜{9z ڌ J&L(aQӣ<#?ir `L1pkSKpkrZi+pS3zң ;P[cޭ 5 9/?-u{2󇸓@"JKʎkwAP>J 5UPb$QA҉*żAZСd܅KvԮ #?G>@cOJA#73g05 M8+*ƿn?rJYjVAKݞ=w{w}ǹ%#`_v6VM(JB1f)Ye 
!vAq-uCz';Nx"G#1FP3ƀJAjm8e9/1Pb (s!2w= T~`W0ʑ|e4dI林o? OR)*81%s'ٕDMGaOs ܃IE P"DeZ0FLe#|X&*w*EG-wol //DB֫C#7thIƶ=&`AdǗgR0Jy>(X<1hS\1ǟ2.lbB7Sr% K(xM(P .2*JR`` N'2g Kk1h*8꣚VID),G#{?Z:[̙U6(ialwɟs"FmpۗٗID1ڒُM_GxMI"!?|jQ;_L,HJg͌K'|I3+<1)h-%hmB-Hkiv$ciF1 o;ޝm{aMw;)ݛwR]ISw Dy~lUN|)ǫ΀B|k`@ȴ&h_^x3# '5#w\ %8 7y O5eTW`$9#ooYZp/-9/Ӧ\ńRF FE]d_Ò2ƮEf9R&RX~c(*UPceG" U'8wI#N<&"Z?~ZQ/$ w}_BVq5/($#}Z@AY<9tJ4@Cd"c jC8 7ON>6`,tRLpsPTG  MCȿi֩U_냆C.M+X\'N| Yljr3')dp#s6gZv9?{7]mַ{hhj'ݝGA' ٻAV&= `'jb8g > )f×Ie̲ֆ9 [ 9c҉C\&QU5=(7u:cѢE+gϞZc.ǽ4|)`1\p7xt2R?RV3d^1n8? 7ܫWT]1 #{# D}#; C-u3/ݩ5 ,x.2#s\.4/1c$tyJ<.T_ϊX.т~ ljy،zo#TQ}fN9*LoԦE&EK8Q/LLWA ΢:4go#{cRIE*(V-icxdʵ@_x aQUP0׽k/?a֝Kv{t+~fU7?w- }{S^e|q=.x,fM9wi?;Ż9mϘ3r؀#MǷ~yj0vQMRe{)]F7QU-!螫V^1 1 vao?ӏΛW=w~:4iΚxRފQj3'`OlAX mN4͜@flڌv"5~dQ"ފsJJ7/&2'U{3UQVRFE@>`TfȖ?Dh?G Uz#6V:P QmK  ѸꪫJU<~Ɨo5c;ƛn݌Y߻+?!M"{}9?j;}Mrlj7ƝOK6wu?հc5좛_7̪?{dLS8邟7^ݹauoWkzdȫ'3E;aأ!3I`s;s "K]OT=.K 8hL+,_4 9Kӫv8M8pV͚u1cF4OmVIc d<IjM6p$O7M{[bQ{T(LRЯN_~rͺ6]UX+>Xf3cUWDy@aisx("WFL;=D2ʙZ]Ȝ*wLRPwtW0M{%+W4iҔ)SL2iҤ+WLs HZa:V]pw*+ï#!FfFierɻyCʴL ]G4_7x̼v^Ud( 'T#ot&(ew*f1i ]vDTL ԅ]y t?G-[z=Nc]hhx;8JXI7z%J X^q߇Kv ݓ/W,oM ?NG̴x2đj\6@׮avgeeuY8ӦM+***++k4b楓G,_ԗC>B _s:ߜ'^d νk9p IDAT!g_Y;)"DAHw#SWI%~Jв((a|/ W?Gȿ?XT¶mò,۶#eYeH$xܹM Ԋ5Mc{?sst8GkEP4<OB1@ 3wuinv4o!0 4 )1;id4SO(?!xxDP" yqQx\"bTc9_Gr[J'0\6uDF<d'6MZr2d!MED"sB3ëvw4ۜt࿿ד'f:bzS#nZQ7Oy=]]ۂa q7G#{=%H_6L0B Dͣƒz({:W W:.I"ǻU'Z>SQb8FqEW1;߱iNw9Fq'}i]3 aOZvgCN&}.o]{0;ҍwLj/_Abk~!?ѕbV9|yI>cZqEn'wtX# @3ǩ.L~P^AVX)TuZS5GiIxPI}\.#=5t@N *OJ{WƧ#;B͡CL p?[{5( :At:wXT B;΄Lfs*^SݵLNѴb=*!VmC#߃8 J7I Ǐ-*0^UVr1TUW.9x"GQy;L=qlSA}Wo^}crʘiG4Btl Yڴ>@ %LbvXQj"mc=y"L3J2*ĩ)*DRDE@93_hd* rFUT.xCt\$-M;1 G 藽 [@ #L=5vxRZLo㩧hЀ1;%zh-x (*g^ DUv#QF"K1'4J͡9GCd;yVN  -DO+⮵Fӎi\ WZĶ$2Aׄ4ux=+O.%K,uj|TYI3U44 MCд^lC6c~VumI&.QblŧN9j^3QNI֓MxYM!fN'?P:hdg Я2O\AS-? 
eF!c@44 MC44 Mfe{gDˀÐb5IE|)cPxLj o^G%P!*O S&+Q)4 `LP7$T pihihд1Y QjX1j1ffyYtsTV/bgItJHV*AupbbΕջz0zP_892=n30 CNWE@[+KmE(*ơW6j#V b'ihihZ7L ,#(71E)2j3-K( Q9uׁȘ"Fy`ѹsS&j$pE$T/a"{@:N^fgV{w8QɎ5i4H`Ę&19tXWLcݸjhihEe"@s?Jjq[GcVҺ!Vy8ULN0`.1Q(0ti(UEBmL0b @  PO>8zd6z6jzN" x@ITOP XCRAmR3㙪XT#82}ZP![VVPWC#iWL+) #E(P^j=2Z5*QThh)uCe+*:Fxd*`B.|D9KEx(B{L @ xx<e#>RLCcSC*|BTyx'~8BG ƗI,<Ų%ILW#h0zG]9mpfwX3RihjHt'k-'C4,NSo5!24ON YޝvA>!һLc6@ 0^y-[֬KM]C#R{*#jp UשNĜ`LԺԩD)s*D ~9o+|e\ӯ~}y}i/]:i1m'8 X|,keء:WRMyCE%yI'ڵO?[^|/z{x^'j3(35;bi89NZo>㌱|`PthOQD+,777wXjC@,ܾ}c=_x^BHyyy(:t_?pq<5&N7MH4kڃ>{mڴims \-84>" 1]6[ikk[p᫯6eJj,/rN7Ѷޱ'cw-<5Ƨ-|u #ea?]5-a\v!n!dy}ŎVDRJ~4qTl["I}?^7Cƕ5s43@tEEE@`С[lHYӯjsi^ySLq"'V#x&0gU*[K[oOfffX¶",g N"zcT>N`]ng Ԣj_++ x[n O[o;>^o8ĦX~}aa?0`@ 99Daa洁q#"666~mv][j?۲?#N86qoر'|rNN΁>UV@ `MƢLPxfi=кuHJJZn#{q9#AvDl/lg/f*0eWq^3Jϻx\}ig8?Ȍy Ϙvv/xdgx"AE naϞӧO>}$'(e@ix5ŸYf\?@Cԩ85>3;n e˗?S999IIII>o5;;R=*8N)r.fʖ˜۷ovg222@J `줤+1'[Rẕ 8p;wLIID"a04UX)?,cC=n>}|>0|>_>}֭[q{nh4~$"[1Gy/?/lvʿ?^[W=~ܕw>|ʝ?~s_qw;P|W?3.ֺ{U@2KF_x~ØlW{KӦg{U'͙&@hk= lo f {ޖ=wCs͵]`|z/J]uvP7d.}p/q瞘lO঵{ X@ :?s˲(C cǎ}oD"۶?{,^f?<3F8egΜ귧F,ۖH-SD4g`ǎELRF=88^mU5’d4ݷry|>Jex$!b}Օ_efe=zȑ C?LSi\+i>b[o :b~cFkZMl* =8^pn;9߮bϷg\q[QWӁ}S1OloϏK: 7Ohn;oBd+m~)iGoWm]a^2_+JH~Ͽ7feN|5o}0I.4lXs1֪51UK?x-947lX_g`~њ;v'`#Gw߯vU\\9) F<-"6SYDI6̦uFzW,ʛvbwߝNijj#>BRRC=TRRo ڵKw*B!ine  @ǛG9s(JGd{qB|II>hN]2`cG ߮))vZ6|Sg\#4.JM4<2Op.>M 0ah#iduH ?˾1sV0'vߚ/2󀺳s$m. 
E3G͸G(n|uܫ.پvkm3@۶Z*vؿ%?=ҟLp\Gw݆?lE r[?+o߾ݶ8ydXB5SbDIM'U>ceeeń.1ƚ֮];`sL5Hiniv\ى]v$86MJYKsqڄMLӴ,-&NxFI BJ*D/"38 <׭[%AFsh֭{:Gk4tT ~mgƺL}(b|?=׾)Z>^?'^m<J v3 Xi$ނ SzMs0uܨ<*C'Wxg MkX3l<'2Xp[rzTɾEµ3<7 c[ƪn<'_bw}(ѐ G gΜY^^m۶ɓ'%%%%_zĉ 45"*AjIdco,]lF{?e-_|ʔ)фjc=lq*Q8Syx㐛{i)c@n O455v\<30!aC9JMK-N2P8ÇA)믩;pE R)V2]2ۗ.]r B#F?1bD80\.ҥKovn4/*e"ߞK.J1iA;+, wpn"OY%kU?XWf Oq\C4۬rRڞʞݦ\,x̙STT?HkkmQ- &D^(&W}@ndо@vv6r\rذa)g.Ϯ嚌L5smmmmmj6ֲ;/##3|#n wOJe@޾ UЁ:0}Ri浻Y# w@ „(--C IOH#۲p( ?P( P0 CP8P}WL CĶFms=v[[[·z(7/!P 3E7P+sd{T.EG'''Tx.>YѸR:s%_0m:3򪫫kڶ}1?&:TQ2/ض}TeQ=4x_)0[Ȯx7?K:湹4Oz;"Av=骬=*Go~`Lܾǎ*?<'?8(WC:$`=s Ɩc~Ǐ,z=19ޤa%խ~bފtn_6lz{UmnIQnfz+\Ww6'fi[};~4ݸ_ @Ir{WL=cPo p챎ˮٸndV?}??Dƈ#?裁 1 Ո6 3 .]#4.lLlӞRL[jɝHxĈseddضϼd&hFqapStdyncjdĖur˻+7|чe3)g[T7M ^e={ZƜGkڅ>%oV_2ד=I'^K;.z)t/wț̹N+'˽Ut>|m=Kqț/y_vmNduS@qm_[RǓ{KimX>MpS©}[|e/S|B8aLT:щLD nRqƙ3gg|z~]tEɣFӟ> s̡pd{zRfҾ2QU_½{~_ǎ;̟?0`*g#۶SΘqqG*JS JrYNH0p( ~DU,($`[M o3]r &3hРC6Ԥj4;i]PWWMWu`IˌSZ^zm%+@ o?o{nǎG;v,Itz뭶:+^yCdN d#|#$#ݹR$W%qQ@4lb`Զ)a"gZ:s4ιAfZT :o׋{!++[nNѱ{ 7+KDP#B^JD=}>%ZsgޖL69)q#OSuLM%|@D=W#pa􈹺e.QybaN5Y~[)~*0NK T{ A8(¢=8Hzi.e6~9^!(0FQFБ6K(o$He9S 儶G3u.yw$sZ}bnIKTE@ 6RP~{iegoټ@ :l-eGa&dSE"+3 ĉoVi^@'z|39TMe&$bZRHr3j7D57nhS*"Jѕ479%CL>.AM 6ڠӫihi3pii99`XL>GvobL*eδ+x*JZ<@-haBo%KeG 1oJG!L ,-mnkkSedM'{ ۫tm;UWqqz}9 kihihavٽ!V RBcʀ,w11ZfT4iZU5Su-/1k+oFQwT+FeӋj1c"@ihihkZo9D]˖Da{YTlɜJLܟ*1J$Z EYغz3&A44 MC44 M @  OJ(K3ge@ ]o>ӧ9[a#@DaѢEg>mO7@ o5{~@ DOyC@ DoJ1@ b@ -B)@ qt7P!@tQ!@9 1,@ =RĵcW @ @ J1@ 0V @ " \1b@ `ҤI+[ZZbּ{,D 8j@)@ R @ V_b֝ !p?4 @ +֫uXEol: };־Ģ xn= +zWz1jV-Af/ @ϮUꇏ B wN,{/@ѹ_lZx.<^VXU `1@ P!~=P6n#{jV v/ة\UlͶ}-So] @ (]D>タn];!{œR{j%a\`@ P!:ʹ%+ eӮ(Z`njU=ƪTuwL+F@ >ŷC1|o֚ooaqIIIIqq'Xi:%`~ao'mz韟r̹bY {Zlpe6~pUiv6(>~aUȝ0:5Uj#ւ k:ClZbզ}-67hpٸ }͛{y[f2D5O<|@ƨg-Z>U ~يM*?6{տYU5`mKk= ̹fx*7\fS%((=^R=[W,]6$8$XZR|!~3.8<`ByQ+}נ+nYlvJ5jEkm+nw(^o9@9 OJ9s6Uq7nx@ oW4}lXīOQaܴ? 
}eU֑ݰglmwVmZ<ŵ;jCݪ8MKx~ڐ:O,Sծڪ%/oԳٕU{D}?/qw4"-b>yd7p.X)n)RfVUZGVO sܫOKWE_i'uKVTYм|O=wy&pg>#Ͼ#=z\I%pp6`Bt5WoW Z*+[H).Ihްm-)æp=̽vl^tk0TNrŵ^=q2˖UV.q7_;l!^vPï{Tg}휹s1 UG6lX'sL0nxW2sܛgD:R_.YQl4j7pxZ6s1֭ӈ@ wGkţ'HPiJHPe; y 0`)|cNJ*ŃW2~I\}|ْ;m 3G?ٷ:{PnP'Mq cSKS8ŬM-9,j7U\&ԬY=`Ҁ +8>޶*֣*`yyf^P$Z\W3JyhZKMURS}> ݥjg{G^VV|ΠT3r7Y_Z䕌6㚊nsC B۟AY8\b?m\"b|V+w3KbN/`U1m4l^ɔie(}5vfخ]B'lhBfk y%}޵C՛b!yy&4WUpwK|8Toj'-DH-.ͅnnvb҆2% Tg'ݥ;$.\`&11Wz坥qC ;#ż4CЧLӔ֑ݣw-i+\\sG!ݑb=,Y~Ѽ7vVٹT ;k=++#zkU\4pp69ok-/4vֶTT55+yDSIq2V'GTMTqC Vf `uOV[U+W{VŢj5Im[R٥*RKSޭmʪjaZX-*+}5ଇT*@@o,qoּ_A_}wR7ؕr\lAc}=/)nXLzAVX5bT9V'κvB|!y.5k*C œUkz|=gUӢ8gNJUsD+L.ݥSW9nKEJV{G}E n@)UbmXtCU}s}K6؜*2*6UU8W[Z Ɨ]pv`dáf{b-]*yj,J('|ɪ`s?Gy8d|U5[W,^G(z)&'kׯZ]S]]=vqK7UWWZdp-7 @ ]=9uW<Z6:3r{p9-;럟 gu} Jw_S6VA\6G,f_X_ь{m}zVͿ(**jEM=mڴi_D1D1D1@"4b_^D1)Q Q Q [xsQǦ +r((bb((v*ɓ'q< JD1D1@@D1D1@@D1D1@@D1D1@@@D{UN>q8z=bw*o'./Q]8!`7Sv ߏ}Rь-{jɥϏghy.ŋ,Q g"ƢtW0,6);/׼LFbYeaaÿz?ЌyP\z63 ڮg-9w{v?Lw()QӕN߿3tBI~.N5K9meӟ p1EDupǰs,)ʯuQ W_IRND"ezh/"ߊJ H'X"܋>w!HDT4od9i""r:H="i}?au/a/a۰v |8=9-.YRس7.'ކK+=oHt1!]YbaF].6Y*Zҗ2_\1y)$AgąkrrB!pHbY+HRn->𶥳$1ED4"̦KQ:qA%l2ьE,I-tR'.3*Z`[TLDE6 KDż%?%uōAMs.ADٳ 9GHb)yϩr>s (fk\ʷV^n)KVb~`rҲgPHt1g]0qr\[4̘;{h9LlRnKfĂ$(6xG,2_0 hGjõ7QQZ%YǛ/wK%gvѬ%Kf M.fliI|gs)!*^dw.!(sY2")9K_uWEDTrzY.Z`]i={ K(y`({:oxw "ʜ}ЯrXaɬO+?:mqz(s*"̉Hb@mK }/~?:r%3Flog]?ҕ˙,=5[K.yo}I?\RdDD3L07>/bj.zzଘK]Du2M~f/]wߝ_RDDYU;ۼܢi3˿˙nϭ_155fte2Y2>mp.3X㯌3z2Wr^=F t.҈$p/Xֳl࿪!"bjy ?iYaSXV`-Ң?^4Q]ڸthnCmE7SCbwz6/zֺHUfNkeB ruh-^մD"kʪ} Anаձ[Rd*} IDATqכrg4;l.˶ES*Uh,nߐVxhn:VRr^嘻բ\@G 5qkn ql5*؛n@Z Ȭ\ҲA}h6?atdS#2~u;2k?VۄƝvߺrusgww.e!qns!5QWl1;{;b3Vm^jm85Sh"Uf3ϰ_Q싀X/Nz*x^Ϙ*=-Nc^kV@%;|Q"۽ķm?8sEbSz*|LxeccoˣwFv N=:Vf++ }b ,KpeYfἨxf1Sm8 k*9X!"\Xu;xvjHDYUg`sL=}Wd"U7>Ļ;Ek(zͤ14{қayha}l @mE*XT6M2VT +ʡ`20#f}3-ctnx1R*n]Ĵ;RkbH9/+Zδ8S;Ǽ˗ bEZ͂ܽ{3ǁC)JdڰS1 g۰Ŷɓ1l eY-)R ֗1M͒:VTuYCsW;jJ6i컯PRuUJ4w]vwosPPS βҽ#x2F3J%\ 1'ztJ:3ɎW*\ܫx~l}ӱ2oPCCFLTrva&W&9mH &ՁO0FI'}TW'&8nv_=Pk1Vw M\IQ`+gv8e3٣kUSCŦ*fboI(ZumwWdU[KRz+_nL*D$mi D5-|eCd{MJ5UMj{{tA,³QӶ*"QT"S uu,ZC k۸9TilTZytr]$l8hjY*z4Tu#o[~ܹs$C] q 
WOJj"EeYJUR9wQlw;ѳU{6h&,ֱks̾[^5ݕ沙zfN_]ٵjlܟζE?Пi2MU+֌~H9孍-J3Lg͙ckM>c2o{gͻ=]&9;wufA 75ޘF hbJ TpVc9[3Eg^ sͳFOSZ\.s\./^_ÿKw_B#*,](k#/,^۷oŊ7nȑ#G=Q"xNN>}Z6H6mZip~ӗm*)֝I*r}cl^38)_ Z{݋bw.j@@DIIYޛP_>R# VgPDȡ~G9rTك[7(h,A1mh35ո أj Vh4VpxRŰ60?&1l$|~EQNpXŦFX$Hbb.8"PJm{>.b^6O1Wd{w6sgHeYjkAD?8i&lwQ?vhEu-ރ[Uclx4B$y܈ (vT`#WV:b!\J9dV\qU8cDUGX$&Quy pouV1LD",n{^##~x)ˊqI-,E&닡&peQiN {yL-WTz\F])9θ#" w:ŽvW;;"kwER^3j ݽ1;'u'bQw\"ƹ}:?yѤ39Y56706^%XcR Q%R5$c}y܎*R4LN),,):1?1>3$qvqUc`Bآo{(pKò,|Hő%["X06YSb8%b8'H!@zA[QfX\^XV x͐"+DXh"m!v/h4yO`,hcǻ.xk1a1m}rfȈ&I&bo(c|r9,7BG`8 +9\3^1`wdwHB( D$;ws,Ƃ^$"1a. kFIe)2MMa_js;p@qFbP111qW"v63ऐMxΰĀ+D+CR,"q\,C mõbDt|J|; ȝG}vV 9߳[t' ٭$VU!1qVq] AXcJ!g?J QeOs=1~]oz]8]J Yq g v8S%I!b=ym.Wf7byE?7CFTY*xWEe&q́8Z$uxO88X1߅*"°,ː2݃GR\"1J)Xqu {9ydw, F<u3*LΗq ~Om m˹t,7P2:UtIaHe0q@L%ID%N Q6a,n "./22{=!Q%Rİ"GJ X9;,*Dynofg&B~D[Ȳ,KȒ,5p{B(=wx¢,-u1,%B7"ʲJ#bķ\pBe1S&/,9vE@)rE-<h"c'!{oǮCt]?uwwwuu}嗝.]eŋ$={V~i[[Ǐ=Op۩fNqݟLHgZG{¨]OxLgO_ts}QI}xmϻ ͼ2]tyKњ2 +M%DDgt`7\g({gv kk̆|]uV;F?f.)OvhX ;[c2?IftL?=7|6^*t>7Ygzteow_\ACCo|c[-[vǏ}駿EQ<{$I/^eҥK_~eWWWww\.w5͵~===y\ի\._\z5sトD p稲Dg]&le/^&Bn˲s?D%lL^%.'[bōûrȑmg%^jӧO>}zVͿ(**jEM=m4-8$p7b"Q<)x.6ŚlF5і*8|2kWm$ջ|u>Pys895Y:)&Y.P[=S?*vPez[-e u}Enx%OHZX>tJ!RI6=pl!b5B J#; S+^heX&#g1,ÕyZdiy(xf1Smm*9X!"\XŏqYUg`sL=}WdLZ~pKRRZh &fOz ,-k;gd2@nՖ-N3/m(˞Z+ln TW<ǰ&{}@~xuϱ _oMF6U k@PvOg[Ѣ´YT_L~eku`UH9孍MhF&e7GX=S]NYO*aμ _*ܴŒ6|2gnX"k!Z˜y-ߙ3/ۄ9vH=jd>xaaޜy}?Fy4y]49eaeL kzzzr\OOիWs\.˿zj/}i m3L5,ѐ 3h*Tia{7 jb_.I][+yhݻJA8*j ߹*ѾQ vW<32m{Ⱦ}VXq𮮮9rdۻ%iWZӧOj/ZmQiӦO6M=01?#]M&={τ0  roCa1-sU1*jZ՚6FZ; ׊  ]s]+[`@         }D;Wɓ8rGyQ G@%         ݧ/J"4Qw޳l6J|EP+(p W.;qb&KDE3fR2rg]=DT4ðb V;w4_&ҙ1N\%B<3Ya)+ ܸ%Ñ2#?<5Wcb9)Ň]f2 %ڮ_{~ie ŗ0MDEmXRN>|型\gK 4u=7 '.\mN>i""r:H="i}?au/aX+5/>5B@!SVLh.ǣge%單߰zJ(&R&.X̒OFZn2#N\P+,1eij QN>!ьEK8mN9-vͲ{-~""J'A9 &iZsKmnؓI?aE@Ar7!Q n|lJMUpzI{c=WnCGm 5퓣V빊PR%"R9XF? 
tZ`8侖GÎ,[.<殪DLyHuizWH]1SClUT!RSM}˪#XaSq,Õ9} :v6]Ml}!lZp*|6e7Y;**؛n!.l9ܲ͞iDr77=&C @` e<2|5T)5;ȥN"UI_Z@ĭ>i6rF e3*hZ\[0y1m]9G3d)&*!"M %=Vrc/!Zc"JWp[]$*1xXb« ;R*#DgaHWb'10c;yaYw**c~$de{ソAbByU2vPqhax )Y,˯2}*.S.#)FI #>}zook&V1Ԛbh8*!!#&˩699p"f[lgTm6aYFoyL9lUBt)Ү0,272fw}d7mۨG?p"w"VdozyPST{774[}QGW o2 &`2(bO@&O 0u|1uF#G0MK2fڅuBIHk,} eTHz#"IΖ8w($6lWB[ɍzS8)&ϳ{ԧ_q+,R:)YL ⫵c)Ynnz.QӬe!بjLlΖ"S uu,ZC M Yƣ_xڑKDgvCg7S}zVͿ(**jEM=m4\?-~woo 4_'ϖj1oVL:T`j H3}bgD"qBhitt;%y؝+X}_W ^R-ٟyT삽D[hO%Am05 %ʞ?9ieUkt9O*3_ϿW;u}7yA7O4ѵk 3?o޼9?vzsNfYx3j8:UmZr':Ueh[[nhh؛9kyὴ"mMpFZhLv;f,[\++BԊ],]}r] `JAF>^#YGzu/UNuDkz{{xgq2|\yN 9۫[wy52w9dnJԷR|V`LM:Q֙9eXKh3Fex/:qy1m^9GkzLY9&%-eX+XʣbFK/({^%z`}ڵkJ:q.Z$5\<|;w31{Zo/eOn}f g o#;͕lr*w\MօK6DEU%ޠ@M3M}^ywgow4(Z1@9r9R˴u""5%=0OChH#Q]k CS'?wyu.b 8#kUl=(lr*vص* (:t eUE?SCU(QfQ(2r2!+X,Q!O<*Tkt{*(Çc'˖-N@ صkF嫗zlkQ?{xq|}mK_Go͟v"ȅ,\Ko~??6[wsjiXebt0=.)b|{շ1}b.+_]o<}S}/6oߟ|NmO[z=kݮx,NCYeHm6Z'p<_VXSU-_sG꥝)YhYǮ17So7VP`*@o# |ퟭXbC~>ziZw,!+ŀ7ZD(QV;'1Ge-oFvٷoxQ+6b8pbY4L8]5`4ױӎ:F0UAc4 <сy>Fc J&ߒVsO7 Ql(6Ir1R ISVS5sG$-$u 8OJN+[pQgu xGǹ ;F>nb.8"P2MJ83 gZyfJ75tZ8FaX+DI&K$#G=6N0B)h|{czUHY536Isüe@$~ Vٌ0mո/9Ax+,J'G&Quy pouV1L*E<6h4 8*Q&+qC`5!,o&_wǥ U ͯq=$O=*JG;06DBDիW'O؟'Ϩ۹Cq}٫!?w>ZS[QZܯbb}'?>+3i/ o=VnxJ*1WǤ:Zbwoog̭Hm87YM>ӗkI$̬3yػ;#n_\%%,M"0D h 01D_]!ǡ3,[DSSSQQW&===˗qD>=C1"䵲DdqHL n?ycncYobȣTGv"VNt[)pAyJn_0XX%G3 1r ,B>1c&+B S!;7MfwD<곳b0o˻_$Q&"9!+1,C]7e&'g!Dɍ9"SPJ1MIQQX%RX$PeRX gx!#Y%oBMVEe&q́τS&yy,VĩJ1l>#&'a!7 ]-.}Y}VJ}bY@uzHG:6{ܗ8xHB_gPo1r/g}6J 0!_oѷXvoX4 .Avvv"CYH(6JW~,E Q(9r9 i `a,0E(FD4rbXWWHߞᮩLR" $U7]6024vJ,G3ºՔ=M]WFq̈́u 'umD5 31ևpoAXV7߮r_Y~ɔ/=rݔ?n)W˿uW_Ѡ(Wcn糩.?8*$IJ4;*qAr$=Sɏ;'cI_-wg KtQlXTu~?yDT<,iQ7֧weTN29ދz49}7-gy%{t"ku`|6IHĤp<xiW-Xqň^tuDT,R(ݙŮbPU<¯0h/~5tĸbx~ळ,FDb4nH U=tͤJ8T4ry,9[Ӵk*.v½#fAD&QqŊDgӈHDŮuJgݴz:θW}3KwF(Ƣ6/F?3@Ler)R̡y//K\X]OMTSMc7#- -&7ң*CdfʃRgʒfFE!AYxMzlEX]ٳoj&7_o-===jl@ق07{ fd|"غBkjZRpaiED3jZ0Sj;-OGDYXzߞ/ׯGEbǓנSPy-Z3?w}\YkÏiZ|KeX>x-nʚ' DD o ]=XIq$tw6Rʈ%uMI86%o}_ao{{uz"QL+{n {^~Yj IDAT׺dn}%E:{#GoH\(JBb 
"%w^]OFDDҌGNTo`D$]ҨZ|K"ؠ?p]mu^u),uoo#V[xf|-o`ЊXde~rEm˶򢧒*5k䇞htZZ(8:+VƗ_~YJKeFyЇ0p6T0 ԉtj4$>pi|Eô'ָ؁kn*]B7_Vi̯B36[_A[Y9fyF&/7JXQ&3f$8-+LcMs@DA5TW7 DkZ_7s"ٳ>P@"@JW,#իWr-? ةN:ƍn喍7^@)Sղ%.g}&Iy(pRACVG9[o_c}ooik*xup*1A`F  Ҙo_4B9X3Ԩ%FPlĵgb1')v};(0Gy護vz6"_Oh͟GG>5/KψQͅ]D|޿\zuͽ~YϾwohmnX;vߢ0YaىfI,ms6TFDtg  ڰ\uuх (6i3o߬w'~䥩L~ȟݯ9e7^ϺoR/[o^S3zٝtnꥦks6gP29rUBuɐX2~ЧGbzUV9Ad D(6~[x5mKoW];{ENŞ_nX`jX,q9*bƾPX^M}?NY]:pbb݌1' rLpjñbOW9}쭡Z}򧮮_.ZrO@5()tY_!*|Ց=191Ƀ}P zFJjC!D:Eח>-Tb5_9й罺b _ݳZ3=#TvV I*9;aq/1%2}B9h{c w-Wl1w ¢Y~$9ގTl8iM÷~6U L w,. B$>DYϤ?8B†.2^- (V|O[]i{+N~q AU,rKW/]jym~/b ϿϺvok[U,q׿^C9 1Mfẅ́ɵDXl{64 <5ó"}1ާw|3kbfꦣf3氙i-fK;s)9!/iXAN=pbqo.NM^hZݻK9#pޞ6:qSf Ql)uO\w4EQ._;<1APhyjASٴlF' Bu8#&Hp6ώGd& tF7Lg j;O"r9_(pR"zT-e}uݕ,  )q3U$1QfX`W<(˲ԫr,Nz&0"iMFzR)Od,ar=;&' KÿyM֍ĈD5+yL:2,Ij,f\ηLI)MEƤHqxDD)MoKijOCeJaDwۚ(FMr-N֣sKznoؾ;X2/ jB6lT-ẌpRs.&cћx!ȧ4EbLzfQ"d#ssYuaI&G}w8|!9{f&s onm45 \jOw|0bfӖҎxΌdKy<FV^,li6 tF,rʷ͵2"&kIeӹh~0( MEO jQ :RkCéT#O<2^pX"&3"QSnx"*<ϟrv(hXri1]Tqpo!ta0I)9IԐo#_=ݐFNŎ木>G$"z"32g yg2"G5M  d#nBDD;X'1IPOdlHZpB@{0洛o҂ Yg2:(s$Z@Wilumh}c( ÈffcСj;cnζ-ˌ]< ?2߲arn-g Į!QlEJP^>b/|L?MFdFT=OM؞ಹcHD=`݋;Z1NFogsCLs}"~@8fyG<<hcq(v:p@CbqocݖmSDxoF\tY$*=2-mF|o'܀$X~|&x{!9pJr'29̬HDw|e1p/6ӥb&T!a8wg/8Ɋg:%*ZOR/Q>t <\sfJ{|z;>fFD.CkǩgCf3:OB:UJdƋ9s^~}EEŨιs~,\M,7#p2"twqvtXIf6Su?$Q ,EIrTDtڐHDf:+Ǝz)ocLaes4Q[biXȝnɩɌ'u7J< 1Mfv\eyHܵ-W4U,;|P<%#Zߺ:::{ݳgvDDwyEU{>;TZv?-GC3],;]н8sۼB+wՇ߻>T;Chr믿~ ;N髬iX|w`)7lmaX|rg]_--VZj +Z[̙e˖~{۶m~;u?cv={_߿/z/Y-/Qʬa}y݂^eoocV%w vjGoTm+NK['~7]LOdpmDDɿ?tܻ N?[ܻj+}3eeV4ND89ba5g>Q>6Di uWZ>/<3C[ox|k!/޵mݽSG{I5{޵Vo:*+к}+-g "ڿ7/XO놦]m5_W[u2?v=5H+ٙcϬ'YS]pdst}7[.HМ P.19jܢN㯶RJ=/od_ÂUJ?2KJoc$ԉ<.ϛ|Υh.]yo*sv~"kDc+m[_ɇoE{ſ6B[Vu~``?9;"uowT>'m wN+>Z?ʹޙ۷hXA;DXA^R15rD"^x&B"MpTDDϭh5G~xx;/UuNvBYY9 mQDxĞ%(2DN}~mcO?aUzz;1VU Cf1p7|JiV6lk~{]D:N82',K Qvtt.Ӟs":s_R&3Ʈ#2Ve`4Q+XQ{ɬ4LbC'z|'>oi-3  }~i2#"MYqs"P:e!dNX';vQM<|+RrĩDDm:s ;|gu& 3}?~=ybŊg LUU@8ퟹwp"UJ FVٵG~?O^1WڵG;M< XW)M߷oͿ>>S}k'WDi"5Z_Ms/9O_sEy?mx|Ż"緶ugME$ľ3w*;*N~̈*-jCDmtsu]vN} 
e{𒁱7ߙ;+:>:{eDDm:XѦ/ύߙPwf޺k׮ֶʚs7^$pztvvnK::::;;8QsҝT*%]G/ѺVoC|d|0LV^`_j|D4!e*++Jw*++Kw****+++zL0t;a\`  ?'U$^1D1D1@@D1D1@@D1D1@@D1D1U9s|pjb 58W Q `D 74cΜkjb0+Ţ xk_uXpvi]]]D4.Q؍BoũG*~ߜ]-TOlh4V5C!S.kR# FQ +VRs""ּeۯvlٛ!E(h9X,vuu PS3Ϩ>K"؁DD'"j>8_EZlz,ؚyjML9NG EZ#wϠאڸƘ- լa~iH:k诎ge]o~xS IDAT&#ADmz-?_2VϨm#o>oR޶~SFT3#\GDf\P(o=8SwQjX\C[8Dp;X1鴉bKʪӈY>,YRbsOgg'D1/4oo_~?zF-C{c̛ΖJ=Z봯U$*}+bWWw!^ve]v]zb)+v;F-xODJ Qfg /XHCDm[xnٹ3ZRr>2nǹ+N>ϧG[4tefhK,~04h(6&1 j=K F-~+bs#Wν5Bf}x[״ĒK>ʕ kՍ=i m^ڈXjp/2+b'1%D/,~L}h;/l~]/lLdDbt]^K'̔3^~{}K㶶Vs_iݪ3^g`}Wb]?!yі޿凿uF}̈ZVDT[/2"}bx*gqwdl鵛$QcBM?t8;[Dw _;S fYQ9t}iDѺED\$}?fƼ%e֣.Jct+bnk0beB,}  M2[{ה۶Zt. FD5 6pJ<'"?#4b2 ~PhYTK)cjE)$8dQl)~򽝽{v.ӹjm~ 0lwTl=ɮmڥem+=8wC]<#&'&Dc{J,=bcosLKTKh,qDkxDĝnɩ62Iԓ8yXP ʉb"c.S z$ '&J%"{J$wDrkM40ڮ JoxNĭiWٵzTa t6dNBψ.bqqCWVc T_EՂ 3maǐA&if8O$R8GtٖS%Z7\zѥ>DEZ}1@Y-@7ysV&)ŢTG͇R^uhͽYR*nJ4c15?Z80IzcHb(vҮ45O8rAcM{s%jDuuGNߜ^,ڰƚuڃQ=| 2TFd-l:u&i4Tɚz6IݹϦR2+H 'ryKP!Uל)qJkriJ#Lq|Ї}iw4GW> bmJi,@cD!̯"=VI.^#?4L| a pLvG gpLaD5˽dnTYE%bX.G͇RꈚiKI+};y5q]dE2۶:xfo.'I}oi%ꍧꭏEbG-8căMQ=HdㆣYNqVǦlγD?hZRɧTckOWyi&9Mr]~4h9Qm ͥ~#,ذ]Z߽mi}; x[g91LM9dQ`F^7,X(0"ID\윏{Ϛ\ r5/$be+%Zs@2 Z6YXT ^֗0v~H" Zʉ1*4y8 GTR5b," kHܵM;;A5e4gNb4px̌YZ<&OD~.vXTF͏RX] DLL9XJ)7EQ_>, f}pڅAr$zp Fʅ2/2d;9) 64;M8GT2ʦe3:I1DD"?{[^nki})Bp%ȺeA93WP!ԼW}:l1dAI%qգt[yҝR)>w4$Z`tNڄ$M6͚55GQщ8oXz2/uф˖-*ݩ,ݩ1a„ +0jMܥu\ +54bpشib0 A90p3vPbH;+(0 J8qppuuPs|<Q ֱhQs2@17LNqMDЇ(G sXމ\A`rX8&E39P0}ʑt'QR杻w޽s;fۢ cH?Q{q9fCbj8,$*1A`ʤDz1C˝"eLL 0Qe7CkgQEdo'>?CNPQkg;?nмWI= b'I%Id%%P;pvLATcҚDgKkYD1jg&|Z]Y͢~׿2%n&$2&1f)2AM{.$Ig2siYLl<" h6#L!;(k=/TT{Kkұ"wZ׳A䓔֌ÊTvߊi)nnJ Cz4Iq'(lϤxoC@:^F,)v7%p$(#$OI Nڕk6[WŁ~&vۋŽ9#HFcO$fZ)%95ۼ&4os;F%+41YjX1Vh-1r$">Lۈ=6nrqJl)HDoʥf -mTYR"gDƖ;ik)CeDL֒:˦sQ'`LC*yϻ{z6IJɔ0"1-M`(55gJܲҚ\x5(|yTO͍fL](-/Nj2#U=L. 
"0i$aɩgrP8a6َ?DT^6XGLxDDbL&S#3cLJ/ ImtwQ(K6U ^w@ K݋]?b8G#U3b80" r Éx.SeIa~jWQs"1r#i~Qk#"os֐c& 5T๭LD"0洛oC> )=g1I)y7Vi;϶tg&﵉jdh5=e_U楙4umT3l*"Qkޣ Ԉ6҆d@1"Y؇Pz:b\?=f7 ~]6عsΝ=;&43nJG΄Y\nN~&v2Q( #Ccc,Nah"xvnF gM.lfb2 G9 A8jmnҹĮ 0lwTl=qlGg HR%6TrpUUdntYVPXn=lԶn;rB* DRU.1ϧѴvrҾd}7DQe}kaȱ<~!|z/\oMej>`Q |}=7oTsw7S )7]N*ьGDIꖜa#M=p"b ^ R9VnmEhkX&߭]ej,LT&vb&lj') S&3fZ",wmNd --erM͍lyP3C33gEghL>tڕcQy RX] DLL9X͏]yޔ)SL9oZ0s'aYٵzTa t6dNBψ.bqg7wO@)*Tt ~Yː} e2w.w&f1h F3R"13tX FR-4KNe>SٴlF' Bu8#&HguoMs-/0hKuˊq s"f6>󪲳Og ;,=&QU'Gggg鶤;(IRU}?|hzi x|}rfXNFPy!gnJQ?Ɠ ѦMp\F55^WVUu0S/N>r"`ٲeUUUUUU;;=&LP0az+j:ePCbqg7,wNbíV$Ni"EgÝw7DDžK`ր( ȷKyBsZ׳A䓔֌,ذ>%V|k<*5,LnLmmiTdIbZ8Q%w=R]Cge[0N2؈nX~}N<ƆtbmyZ>Q"syS#7gW.PcObik)CeDL֒:˦sb#-hQмeiw;߾ywyD$iW{ZwkKS% 1(6aL:F$ k: "ySIi5COu(ڱ]7=u&e:]|ՑX#ԫIhH Ob1oJ߱Z;p¸b7W1|Λ"yH$7Zj>>;a4*#o~c6:_VotF}"O0Hb> b#[7'Te1v=VwQl7t/cIk`_?u=aaȚWa+{Lc]#gbBD5m[KÖ-w?}o:|ѡc W˗sXOꥧFOFQz^ȏ>Y?ޛ&r2;k֎i)y' n!Sz&ޯfUW >{/*|f5@l ʷ~Jy+ 5ٝٽg-} k=d{wIsɏ?JEI '?">Y88>=-r%S)$vIRG,ϳ|Ii?:^|6O .܁aFĝ^yFݳ{ܿhAL…\.'?OHL֭A ߏ2n ؎^iZ#k:s'{Z7#H[[[[[[D…H$Z>U `  <6 (ꠃQ `q <6fh;bطnD8sA ( 1dJlA)!kyD-N8iE$IX$jD!glx9)KK§ݢ1T\a^_ϕ}6 Ӳwuo?{ݻ (f IDAT&3Ib\sv9bZʈe'XHcR+eN@$B L 9jk OfCS4#rn,WnxFp[dI5MC9ID)/gM;UXaIK/*RAG{cT,W2Bb$u<[qBQshW4zzaEY,m ݪ(J=X 6Ɗ)YXӛ]v&v2_E5dIP5;vfK߼_ FYuj""[b%M+(``lA5G`?@Ps:(vA)0w2<,AU Q `qnͱPs5@X3\:gj-G{N0Ŷ ΈOnD͑+dh]M^ k+'Q"NFON}"b}rk'6ID+l$~rG?{?qJRgK:'G~@o|zwyg6-R#bMY1jSjoCc.ȥh`=O.3Y.bu-H_1׎QM8@"с۝Ui$޳oQFc'}z_ֻn[>s`,~p񷒝7_VS(K Xс>FMq72cѾ[:7үӾdS'ǜfȱʞ)1E6ַ5Gͷ!wPdh{,V|q{z.OQF$ku?꓉H#o~hrA}+EuQl=1ډrߕ7D[EcQG )rwe|Now|v 7xVW>ӚIڥ͵Ο|9%Zn#76S}Jүλ*d}"s&cD;`]*Ps|Ux$SGU"~"a|e.G;+owF7fa|bGya`:O)%[Ν6\r[rFD/~ Vfkra[rVu5g$IBbK7jr; oh's͠7Q9Kؕdie|`V11t6q3ז=!j{R*ѨΏO]Wk+=] !"p'HؓZׄ;:^GT}n8ڌh.=*mv n<*YӸ_"8v~9jk.\~ޝg. akط-ZS0@U+d5{dn9c3(֓:й1Aؗ-KhovrMHo[{2KO. 
8l_앋e7o؍LDSG2X)w"2gb@QsXCxndGܙ1x|ova[?y}<sГ b/ZS|gcp;Xebo]vgԱ[x2FۣG_]OO9%"AwԱ[;7TCEeC#k\nD?F~^GupJRH쒤XgK8P9j.}O\PQ/5 nLL|3{p?AL…\.'?Ou_z؆sCF/鑡f]t nj1ozi8OƟ"jqD"mmmmmmB$ Z[[#HkEKKK҂+(5Q>.w.Kѣq\ u 55{;ECr;7=ݽoS*&Y45#w#񃙏G2xkhppE@QsD1[s.j9l"@yv\%bb;+@@@GJD1(A)!ky@|I+jVJ+$82vґ6uYp0.I$DI=] 64b $I⪞)(*غd,V9˭ Uډr5#( MfĸV̿P-߽կ{k,[h{~HdXξ{WnEPN ;Q,z+,l0à Ӌn)QGbvZR0;8Y2T a鸢(\r _15S9۵qUSFh MMϤ{']{;(PعDR .WB&,ky,$iTc$1z.1"$+edu?p,W5NĈM,ٴ+jnŋV:c "ɔ0""̼ӕ0T|Ś !5cNrل™ZV>]7cZStDAo ZB%O%d"5M7ݜ*I{8GŚk죲a/jK\ Cp jj(F @ݵl$&+R~jJv0&/lRSowRq D╈QX}/Tb |B]zreUSTMS<\3 hYbsMc\Q]74&M7i/Wƀ 15N&v&̵ ȷ-O5b 7j){) O-i{gI'P6St+d, .-2X)w"2gbnVue9dƲu& rg#"%d[2n=bE Gn˕>wړ(Ys唺{=yvݻaB,W2Bb$u<[ˆ&R3ϦlR>[(VVN?g<~[ʪ(.@H*ffft055.r9'ЈAe^iZ#k:s'{Z7#H[[[[[[D…H$Z>U `  U%c\ o?~Sw1̓ (P ^8buPȪLfѓi =Sl2غrXQ1jz]>Žg\v=.Y[?~R2""1V9}D&ga3ݙGTc1yΔ"\_a=t=}پeD\7>=ƛţi7O?"Xs}害?n,s삃  6ߺ#xַa|唣]vVO`kaWwU_:| D╈詷+opSΕ#g$棗G՛h|sB{s]<ڭE EQO@X64r~aD=9)F &Q5e2ÂWG n1t6q\+x'?lub =Mh@[f׬{/$D\yr'".ݘA,}}jUn-eR6D-d B0YՕK_G :3s:k^ċ(ϝE*+JkN62Q~M۽gϞ×Ι{ϞaB,W2Bb$u<[cDxD_+~tl,yw\JkN{4J}t-ۍ4bff&| MOOLMMMOOOOO SSSB.?_|c _m[q;G788?ʸ5&0`;zi8OƟ"jqD"mmmmmmB$ Z[[#HkEKKK҂V1M((`^1P(l 6JD1guPb :*( J8ɢYp(D1d$ZxT@PD$"IR/׭xyJgld1D'q?6lgx*!W{n? &sNXxƈ}یˌOƻYfzД*謜'˕1ؖ&dԨKbBh$IˉU08ErAXHx?dDCIW\,],1gBXd*.IysZ'9h~_ޭb333chzzzfffjjjzzzzz:\ rw?zQGh n2lqW^459=@D-ΛH-\D"Bkkk$ihii [ZZ*ivJippE@MuA (J́pOxdh@yp%l[0EsߏkQY<:Ql rvd-bOL@@X Mfĸ젱rN׸$IS⩢I@{|7szyG{B/WO_e^hu &Nw'.V^a>gi='KldS`D|v[7'[Iࡖ(gFZww_=qέ[M^ k+'c/߿hoO6eZBϥ4F=d%Sf8SF#@9i'`}9'h]W݀T`}z_ְֻfDԾ(#Ҟ?3.Hnh@Dn [0%W^de\px3D/R#]z/tPϲ'zd3yJNo "v>'E""bc$@4BP$^'tqӣ9Q|"]`AW{ `3JAs a떝O(% =;]aX\;A.h""YӸ_\(Ը %lL bM$I-ըrKD 髢ƽ&;58(?i)y' n!Sz&IVreWJ)ݔ弮u,WKDڭ,NfB' ̝o\.׮=tuzx˩#n˕L%I<ϖqN䗲?ΔVcp1FR 泺WEi;6r{JP `IDATDD>Qő̂qF`uYeDS'YX +$Jlٺ3A񆱪6BkiuSX9eՆUXJ5mdž1Uj2Ғ٩.GEhQe헋7&T~99S ;;ž%[u]<@%~_V1a巓3-Wx`;y饗nܸ˭|'e=PVU[2~ΆA/6["ľ'Ͽmm_nm_Raܰk3hjOr$[Z 7dK/VI{\{]_2q-;lq V1h_'7o~dIϳ*F>-wdUɈ68o]_T頬tb@&m]<T_NOO~ō1Q{bRZ[[k[[[#Hu9Tr˫Ym+(g+vc؄ybE)6-Vf8U&~1aaోb0~ITU dWU_8;-bi,T]0Zt;VaվjXD1xV; O썏-(\.6֎<]+

{{.Name}}{{.Ins}} {{with .Doc}}

{{.}}

{{end}} {{with .Methods}}

methods: {{range .}} {{.}}  {{end}}

{{end}} {{with .Examples}}

examples: {{range .}} [{{.}}] {{end}}

{{end}}

{{end}} {{.Include "head.html"}} {{.Include "header.html"}}

Syntax

The mumax3 input syntax is a subset of Go's syntax, somewhat similar to C. It is case-independent however, so msat is the same as Msat or MSAT.

Defining variables

New variables are declared using :=. Variables have a fixed type, inferred from the declaration's right-hand-side. Assigning to existing variables is done using =. E.g.:
i := 7         // defines a new variable i, type automatically detected to be int
print(i)       // now we can use i
i = 5          // assign new value, don't use ':=' (attempt to re-declare)

str := "hello" // defines str, type automatically is string
//str = 1      // would fail, cannot assign int to string

Arithmetic

Most common arithmetic operations are possible. Also Go's math library and some common constants are available. For raise-to-the-power, pow(x,y) should be used.
x := pi*(3+4)/5
x = pow(x, 3)
x++
y := abs(cbrt(cosh(erf(erfc(gamma(J0(Y0(2))))))))

Control structures

Loops are possible as well:
for i:=0; i<10; i++{
	 print(i)
}

Implicit functions

Some of the API features accept a function as argument (e.g.: RunWhile(func()bool), or all input parameters). In that case, and only in this case, the argument is implicitly converted to a function, which is re-evaluated each time it's needed. E.g.:
value := sin(pi*t)  // value is a float64, RHS evaluated only once
Msat = value        // time-independent Msat
versus:
Msat = sin(pi*t)    // RHS converted to function, re-evaluated every time

Methods

Some of the API instances have methods defined on them. You can call methods on an instance by using '.' as in most object oriented programming languages. E.g.: a material parameter such as Msat has the method SetRegion(int, float) to set the value of the material parameter in a certain region:
Msat.SetRegion(1, 800e3) // Set Msat=800e3 in region 1

Mesh size and geometry

The simulation mesh defines the size of the box around your magnet. It should be set at the beginning of the script. The number of cells should preferably be powers of two, or at least have small prime factors (2,3,5,7). E.g.:
Nx := 128
Ny := 64
Nz := 2
sizeX := 500e-9
sizeY := 250e-9
sizeZ := 10e-9
SetGridSize(Nx, Ny, Nz)
SetCellSize(sizeX/Nx, sizeY/Ny, sizeZ/Nz)

Periodic boundary conditions

Optionally, periodic boundary conditions can be enabled:
SetPBC(5, 0, 0)        // 5 extra images on left and right sides.
SetGridSize(128, 64, 1)
SetCellSize(5e-9, 5e-9, 5e-9)
Setting a nonzero PBC value in a direction enables wrap-around in that direction. The precise value passed determines how many repetitions are seen by the demag field. E.g., in the above example the demag field behaves as if 5 repetitions are present to the left and to the right side. Choosing a large number may cause long initialization time.

Resizing the mesh

The mesh can be changed at any later time in the simulation. This will cause the magnetization to be stretched onto the new mesh if needed, and the geometry and regions to be re-calculated. After a resize, some cells which had zero magnetization may now fall inside the magnet geometry; these cells will be initialized to random magnetization.

Setting the geometry

Optionally a magnet Shape other than the full simulation box can be specified. In order to set the geometry, you first need to define a shape.
 geometryShape := cylinder(400e-9, 20e-9).RotX(45*pi/180).Transl(1e-6,0,0)
SetGeom(geometryShape)

{{range .FilterName "setgeom" "setgridsize" "setcellsize" "setpbc" "setmesh"}} {{template "entry" .}} {{end}} {{range .FilterName "edgesmooth"}} {{template "entry" .}} {{end}}

Shapes

A shape is an abstract object which outlines an area in a 3D universe. Shapes are useful for different tasks, e.g.: to define the geometry of a magnet, to define material regions, or to set locally a specific initial magnetization configuration. One can specify primitive shapes, constructed at the origin (box center), and translate/rotate them if needed. All positions are specified in meters and the origin lies in the center of the simulation box. E.g.:
myShape := cylinder(400e-9, 20e-9).RotX(45*pi/180).Transl(1e-6,0,0)
anotherShape := Circle(400e-9).sub(Circle(200e-9))

{{range .FilterReturn "Shape"}} {{template "entry" .}} {{end}}

Material regions

Optionally, up to 256 material regions can be defined. Since each cell is made from one material, it is associated with exactly one region. So regions can not overlap. Each cell is assigned material region 0 by default. It's a good idea to output regions to verify whether each cell is assigned to the intended region. Each region can have its own material parameters, and we can output averages over each region. E.g.:
DefRegion(1, circle(1e-6))
DefRegion(0, circle(1e-6).Inverse()) // redundant
save(regions)
Msat.SetRegion(1, 800e3)
tableAdd(m.Region(1))    // add average m over region 1 to table

{{range .FilterName "DefRegion" "DefRegionCell" "regions"}} {{template "entry" .}} {{end}}

Initial magnetization

The initial magnetization is set by assigning a Config to m, setting it in separate regions, or by loading a file directly.
m = uniform(1, 0, 0)
m.SetRegion(1, vortex(1, 1))
m.LoadFile("config.ovf")
m.SetInShape(circle(50e-9), uniform(0,0,1))

{{range .FilterName "m"}} {{template "entry" .}} {{end}} {{range .FilterReturn "Config"}} {{template "entry" .}} {{end}}

Material parameters

Assigning to a material parameter sets a value in all regions. E.g.:
Msat  = 800e3
AnisU = vector(1, 0, 0)
When regions are defined, they can also be set region-wise:
Msat.SetRegion(0, 800e3)
Msat.SetRegion(1, 540e3)
Material parameters can be functions of time as well. E.g.:
f := 500e6
Ku1 = 500 * sin(2*pi*f*t)

{{range .FilterType "*engine.RegionwiseScalar" "*engine.RegionwiseVector"}} {{template "entry" .}} {{end}}

Excitation

Field or current excitations can be set in the same way as material parameters:
B_ext = vector(0.01, 1e-6*sin(2*pi*f*t), 0)
B_ext.SetRegion(1, vector(0, 0, 0.1))
Additionally, an arbitrary number of time- and space-dependent vector fields of the form g(x,y,z) * f(t) may be added. (E.g., to simulate the field of an antenna or an arbitrary current running through the magnet)
B_ext.Add(LoadFile("antenna.ovf"), sin(2*pi*f*t))
J.Add(LoadFile("current.ovf"), 1)

{{range .FilterType "*engine.Excitation"}} {{template "entry" .}} {{end}}

Spin currents

The effect of spin-polarized currents on the magnetization dynamics can be modelled in different ways. In Mumax3 you can use the Zhang-Li model or the Slonczewski model. For both models, a spin-polarized current field needs to be defined. This is done by setting the current density field J and the polarization Pol.

Zhang-Li model

When using the Zhang-Li model, it is possible to set the non-adiabaticity through the material parameter xi:
J = vector(1e12, 0, 0)
Pol = 1
xi = 0.1

Slonczewski model

To use the Slonczewski model, you need to define the magnetization configuration of the fixed layer. This fixed layer can be placed above or below the sample. The Slonczewski parameter and the prefactor of the secondary spin transfer torque term of the Slonczewski model can be set through the material parameters Lambda and EpsilonPrime respectively:
DisableZhangLiTorque = true
J = vector(1e12, 0, 0)
Pol = 0.6
FixedLayer = vector(1,0,0)
FixedLayerPosition = FIXEDLAYER_TOP
EpsilonPrime = 0.02
Lambda = 1

{{range .FilterName "epsilonprime" "Lambda" "Pol" "xi" "J" "FreeLayerThickness" "fixedlayer" "fixedlayerposition" "fixedlayer_top" "fixedlayer_bottom" "DisableSlonczewskiTorque" "DisableZhangLiTorque" }} {{template "entry" .}} {{end}}

Magnetic Force Microscopy

Mumax3 has built-in generation of MFM images from a 2D magnetization. The MFM tip lift can be freely chosen. By default the tip magnetization is modeled as a point monopole at the apex. This is sufficient for most situations. Nevertheless, it is also possible to model partially magnetized tips by setting MFMDipole to the magnetized portion of the tip, in meters. E.g., if only the first 20nm of the tip is (vertically) magnetized, set MFMDipole=20e-9.

{{range .FilterPrefix "MFM"}} {{template "entry" .}} {{end}}

Output quantities

The quantities listed below can be output. Also, derived quantities can be produced: the quantity restricted to a certain region or a single component. E.g.:
m           // magnetization quantity
m.Comp(0)   // x-component
m.Region(1) // magnetization in region 1 (0 elsewhere)

{{range .FilterType "engine.ScalarField" "engine.VectorField" "*engine.geom" "*engine.thermField" "*engine.ScalarValue" "*engine.VectorValue"}} {{template "entry" .}} {{end}}

Slicing and dicing output

To save storage space, it's possible to save only the part of the output we're interested in. This works on all output quantities (not only m):
save(m)                         // save full magnetization
save(m.Comp(0))                 // save only x-component
save(CropLayer(m, 13))          // save only layer 13
save(CropLayer(m.Comp(0), 13))  // save only x-component of layer 13
Or even:
mx   := m.Comp(0)
mx13 := CropLayer(mx, 13) 
save(mx13)
tableAdd(mx13)

{{range .FilterName "Crop" "CropX" "CropY" "CropZ" "CropLayer" "CropRegion"}} {{template "entry" .}} {{end}}

Scheduling output

All input and output quantities (as described above) can be saved in a space-dependent way (".ovf" file), or as spatial averages (table output). The data table ("table.txt") contains by default the time and average magnetization. More columns can be added with TableAdd().
save(B_ext)

tableadd(B_ext)
tablesave()
Optionally, the output/averaging can be done over a single region:
save(m.Region(1))
TableAdd(m.Region(1)) 
User-defined variables can be added to the table with TableAddVar().
myField := 0.42
TableAddVar(myField, "B_extra", "T")
myField = ...

{{range .FilterName "dump" "tableadd" "tableaddvar" "tablesave" "tableautosave" "save" "saveas" "autosave" "snapshot" "snapshotas" "snapshotformat" "autosnapshot" "filenameformat" "outputformat" "ovf1_text" "ovf1_binary" "ovf2_text" "ovf2_binary" "TablePrint" "FPrintln" "Sprint" "Sprintf" "Print" "Flush"}} {{template "entry" .}} {{end}}

Running

Run(time) runs the simulation for a given time in seconds, using sensible error settings.
Run(1e-9)
More fine-grained control is provided by RunWhile(condition), which runs as long as an arbitrary condition is met. E.g.:
mx := m.comp(0)
RunWhile(mx.average() < 0)   // search for switching field during reversal
Optionally, the solver accuracy may be fine-tuned. E.g.:
MaxDt = 1e-12
MinDt = 1e-15
MaxErr = 1e-6
Optionally, a different solver may be chosen (at any point) with SetSolver(int). Currently available solver types:
  • 6: RK56 (Fehlberg) solver. This is the highest order solver available, but it is typically not faster than the RK45 solver.
  • 5: RK45 (Dormand-Prince) solver (the default). An accurate solver, very fast for magnetization dynamics at the cost of some memory usage.
  • 4: Classical 4th-order Runge-Kutta method. Intended for simulations where a fixed, relatively large time step is desired.
  • 3: RK23 (Bogacki-Shampine) solver. A robust and reasonably fast solver with low memory requirements. Typically outperforms RK45 when relaxing the magnetization with little dynamics, so it is used internally by Relax().
  • 2: Adaptive Heun solver. Robust and uses very little memory but takes smaller time steps than the higher-order solvers. Also suited when a fixed, relatively small time step is desired.
  • 1: Euler solver (requires FixDt = ..., ignores other settings). Only useful in exceptional situations or for debugging.
E.g.:
SetSolver(2) // Heun
FixDt = 1e-15

Relax

Relax() tries to evolve the magnetization as closely as possible to the minimum energy state. This function assumes all excitations have been turned off (temperature, electrical current, time-dependent magnetic fields). During relax precession is disabled and the time t does not increase. There is no need to set high damping.

In general it is difficult to be sure the minimum energy state has been truly reached. Hence, relax may occasionally return after the energy has reached a local minimum, a saddle point, or a rather flat valley in the energy landscape.

Minimize

Minimize() is like Relax, but uses the conjugate gradient method to find the energy minimum. It is usually much faster than Relax, but is a bit less robust against divergence. E.g., a random starting configuration can be Relaxed, but may fail with Minimize. Minimize is very well suited for hysteresis calculations, where we are never far away from the ground state.


{{range .FilterName "run" "steps" "runwhile" "relax" "minimize"}} {{template "entry" .}} {{end}} {{range .FilterName "t" "dt" "MinDt" "MaxDt" "FixDt" "HeadRoom" "MaxErr" "step" "NEval" "peakErr" "lastErr" "minimizerstop" "minimizersamples" "relaxtorquethreshold"}} {{template "entry" .}} {{end}} {{range .FilterName "SetSolver"}} {{template "entry" . }} {{end}}

Moving simulation window

Mumax3 can automatically shift the magnetization so that the simulation "window" stays centered on a region of interest. Shifting is done to keep a freely chosen magnetization component nearly zero. E.g.:
ext_centerwall(0)
ext_rmSurfaceCharge(0, -1, 1)
TableAdd(TotalShift)
will try to keep mx (component 0, counting from 0) close to zero. If desired, one can override which "new" magnetization is inserted from the sides by setting ShiftMagL and ShiftMagR, though the default behaviour is usually OK.
{{range .FilterName "ext_centerwall" "ext_rmSurfaceCharge" "shift" "shiftgeom" "shiftm" "shiftregions" "shiftmagl" "shiftmagr" "shiftmagd" "shiftmagu" "totalshift"}} {{template "entry" .}} {{end}}

Extensions

Extensions are extra functionalities that are not officially supported. They are aimed at rather specific problems and may not work as expected for your particular situation. Their API and functionality may change in future releases.
{{range .FilterPrefix "ext_"}} {{template "entry" .}} {{end}}

Custom quantities

Using existing quantities, it is possible to define new custom quantities. E.g.: instead of using the pre-defined ext_topologicalchargedensity quantity, it is possible to define this quantity yourself inside an input script:
cs := 1e-9
setcellsize(cs,cs,cs)
setgridsize(64,64,1)

// Use central finite differences to approximate the spatial derivatives of m
mL := Shifted(m,-1,0,0) // shift left
mR := Shifted(m,1,0,0)  // shift right
mD := Shifted(m,0,-1,0) // shift down
mU := Shifted(m,0,1,0)  // shift up
dmdx := Mul( Const(1/(2*cs)), Madd(mR,mL,1,-1) )
dmdy := Mul( Const(1/(2*cs)), Madd(mU,mD,1,-1) ) 

// Define the topological charge density
chargeDensity := Mul( Const(1/(4*pi)), Dot(m, Cross(dmdx,dmdy)))

// Save the topological charge density of a skyrmion
m = neelskyrmion(1,-1)
saveas(chargeDensity, "chargeDensity.ovf")

{{range .FilterName "Add" "Const" "ConstVector" "Cross" "Div" "Dot" "MAdd" "Masked" "Mul" "MulMV" "Shifted"}} {{template "entry" .}} {{end}}

Custom effective field terms

It is possible to define additional effective field terms by promoting a custom quantity to an effective field term. The corresponding energy density term can also be added by promoting a custom quantity. E.g.: instead of using the existing anisotropy field in mumax3, you could define the uniaxial anisotropy field (and the corresponding energy density) yourself:

Ms := 1100e3
K  := 0.5e6
u  := ConstVector(1, 0, 0)
anisField := Mul( Const(2*K/Ms)  , Mul( Dot(u, m), u))
anisEdens := Mul( Const(-0.5*Ms) , Dot( anisField, m))

AddFieldTerm(anisField) // promote anisField to an effective field term
AddEdensTerm(anisEdens) // promote anisEdens to an energy density term

tableAdd(E_custom)  // Add a column with the energy related to the custom field

{{range .FilterName "AddFieldTerm" "AddEdensTerm" "RemoveCustomFields" "B_custom" "E_custom" "Edens_custom" }} {{template "entry" .}} {{end}}

Misc

Other available functions.
{{range .FilterLeftovers}} {{template "entry" .}} {{end}}
{{range .All }} {{template "entry" .}} {{end}}
mumax3-3.10/doc/templates/download-template.html000066400000000000000000000107631371432437400217370ustar00rootroot00000000000000 {{.Include "head.html"}} {{.Include "header.html"}}

Prerequisites

To run mumax3.10 you need
  • An NVIDIA GPU with compute capability 3.0 or higher
  • An up to date NVIDIA driver (compatible versions given below)
  • Optional: gnuplot for plots in the web GUI

Download and installation

Select the platform and the NVIDIA driver for which you want to download mumax3.

After downloading and unpacking the archive, you will have a mumax3 executable which is ready to be used. Note that mumax3 is a command line application, so it is a good idea to add the directory containing the mumax3 executable to the PATH environment variable.

Building mumax3.10 from the source

The source code of mumax3.10 as well as the build instructions for linux can be found on github. mumax3-3.10/doc/templates/examples-template.html000066400000000000000000000436741371432437400217550ustar00rootroot00000000000000 {{.Include "head.html"}} {{.Include "header.html"}}

mumax 3.10 examples

These are example input scripts; the full API can be found here.

mumax3 input files are run with the command
mumax3 myfile.mx3
Output is automatically stored in the "myfile.out" directory. Additionally, a web interface provides live output. Default is http://localhost:35367.
For more details, run mumax3 -help which will show the available command-line flags (e.g. to select a certain GPU).

Getting started with Standard Problem #4

Let's start with the classic mumag standard problem 4, as defined here. {{.Example ` SetGridsize(128, 32, 1) SetCellsize(500e-9/128, 125e-9/32, 3e-9) Msat = 800e3 Aex = 13e-12 alpha = 0.02 m = uniform(1, .1, 0) relax() save(m) // relaxed state autosave(m, 200e-12) tableautosave(10e-12) B_ext = vector(-24.6E-3, 4.3E-3, 0) run(1e-9) `}}

This example should be pretty straight-forward to follow. Space-dependent output is stored in OVF format, which is compatible with OOMMF and can be converted with mumax3-convert. Below is the output converted to PNG.

The data table is stored in a simple text format compatible with gnuplot, like used for the plot below.

{{.Output }}

Standard Problem #2

Using the scripting language explained above, relatively complex input files can be easily defined. E.g. micromagnetic standard problem #2 specifies the simulation size in exchange lengths. The script below calculates the exchange length and chooses cells not larger than 0.75 exchange lengths so that the number of cells is a power of two (for best performance). {{.Example ` Msat = 1000e3 Aex = 10e-12 // define exchange length lex := sqrt(10e-12 / (0.5 * mu0 * pow(1000e3 ,2))) d := 30 * lex // we test for d/lex = 30 Sizex := 5*d // magnet size x Sizey := 1*d Sizez := 0.1*d nx := pow(2, ilogb(Sizex / (0.75*lex))) // power-of-two number of cells ny := pow(2, ilogb(Sizey / (0.75*lex))) // not larger than 0.75 exchange lengths SetGridSize(nx, ny, 1) SetCellSize(Sizex/nx, Sizey/ny, Sizez) m = Uniform(1, 0.1, 0) // initial mag relax() save(m) // remanent magnetization print(" for d/lex=30: ", m.average()) `}} {{.Output}} This example saves and prints the remanent magnetization state so we can verify it against known values.

Hysteresis

Below is an example of a hysteresis loop where we step the applied field in small increments and find the magnetization ground state after each step. Minimize() finds the ground state using the conjugate gradient method, which is very fast. However, this method might fail on very high energy initial states like a random magnetization. In that case, Relax() is more robust (albeit much slower). {{.Example ` SetGridsize(128, 32, 1) SetCellsize(4e-9, 4e-9, 30e-9) Msat = 800e3 Aex = 13e-12 m = randomMag() relax() // high-energy states best minimized by relax() Bmax := 100.0e-3 Bstep := 1.0e-3 MinimizerStop = 1e-6 TableAdd(B_ext) for B:=0.0; B<=Bmax; B+=Bstep{ B_ext = vector(B, 0, 0) minimize() // small changes best minimized by minimize() tablesave() } for B:=Bmax; B>=-Bmax; B-=Bstep{ B_ext = vector(B, 0, 0) minimize() // small changes best minimized by minimize() tablesave() } for B:=-Bmax; B<=Bmax; B+=Bstep{ B_ext = vector(B, 0, 0) minimize() // small changes best minimized by minimize() tablesave() } `}} {{.OutputHysteresis}}

Geometry

mumax3 has a powerful API to programmatically define geometries. A number of primitive shapes are defined, like ellipses, rectangles, etc. They can be transformed (rotated, translated) and combined using boolean logic (add, sub, inverse). All positions are specified in meters and the origin lies in the center of the simulation box. See the full API. Edges can be smoothed to reduce staircase effects. EdgeSmooth=n means n³ samples per cell are used to determine its volume. EdgeSmooth=0 implies a staircase approximation, while EdgeSmooth=8 results in quite accurately resolved edges. {{.Example ` SetGridsize(100, 100, 50) SetCellsize(1e-6/100, 1e-6/100, 1e-6/50) EdgeSmooth = 8 setgeom( rect(800e-9, 500e-9) ) saveas(geom, "rect") setgeom( cylinder(800e-9, inf) ) saveas(geom, "cylinder") setgeom( circle(200e-9).repeat(300e-9, 400e-9, 0) ) saveas(geom, "circle_repeat") setgeom( cylinder(800e-9, inf).inverse() ) saveas(geom, "cylinder_inverse") setgeom( cylinder(800e-9, 600e-9).transl(200e-9, 100e-9, 0) ) saveas(geom, "cylinder_transl") setgeom( ellipsoid(800e-9, 600e-9, 500e-9) ) saveas(geom, "ellipsoid") setgeom( cuboid(800e-9, 600e-9, 500e-9) ) saveas(geom, "cuboid") setgeom( cuboid(800e-9, 600e-9, 500e-9).rotz(-10*pi/180) ) saveas(geom, "cuboid_rotZ") setgeom( layers(0, 25) ) saveas(geom, "layers") setgeom( cell(50, 20, 0) ) saveas(geom, "cell") setgeom( xrange(0, inf) ) saveas(geom, "xrange") a := cylinder(600e-9, 600e-9).transl(-150e-9, 50e-9, 0 ) b := rect(600e-9, 600e-9).transl(150e-9, -50e-9, 0) setgeom( a.add(b) ) saveas(geom, "logicAdd") setgeom( a.sub(b) ) saveas(geom, "logicSub") setgeom( a.intersect(b) ) saveas(geom, "logicAnd") setgeom( a.xor(b) ) saveas(geom, "logicXor") setgeom( imageShape("mask.png") ) saveas(geom, "imageShape") `}} {{.Output}} Note: these are 3D geometries seen from above. The displayed cell filling is averaged along the thickness (notable in ellipse and layers example). Black means empty space, white is filled.

Initial Magnetization

Some initial magnetization functions are provided, as well as transformations similar to those on Shapes. See the Config API. {{.Example ` setgridsize(256, 128, 1) setcellsize(5e-9, 5e-9, 5e-9) m = Uniform(1, 1, 0) // no need to normalize length saveas(m, "uniform") m = Vortex(1, -1) // circulation, polarization saveas(m, "vortex") m = TwoDomain(1,0,0, 0,1,0, -1,0,0) // Néel wall saveas(m, "twodomain") m = RandomMag() saveas(m, "randommag") m = TwoDomain(1,0,0, 0,1,0, -1,0,0).rotz(-pi/4) saveas(m, "twodomain_rot") m = VortexWall(1, -1, 1, 1) saveas(m, "vortexwall") m = VortexWall(1, -1, 1, 1).scale(1/2, 1, 1) saveas(m, "vortexwall_scale") m = Vortex(1,-1).transl(100e-9, 50e-9, 0) saveas(m, "vortex_transl") m = Vortex(1,-1).Add(0.1, randomMag()) saveas(m, "vortex_add_random") m = BlochSkyrmion(1, -1).scale(3,3,1) saveas(m, "Bloch_skyrmion") m = NeelSkyrmion(1,-1).scale(3,3,1) saveas(m, "Néel_skyrmion") // set m in only a part of space, or a single cell: m = uniform(1, 1, 1) m.setInShape(cylinder(400e-9, 100e-9), vortex(1, -1)) m.setCell(20, 10, 0, vector(0.1, 0.1, -0.9)) // set in cell index [20,10,0] saveas(m, "setInShape_setCell") //Read m from .ovf file. m.loadfile("myfile.ovf") saveas(m, "loadfile") `}} {{.Output}} These initial states are approximate; after setting them it is a good idea to relax the magnetization to the actual ground state. The magnetization can also be set in separate regions, see below.

Interlude: Rotating Cheese

In this example we define a geometry that looks like a slice of cheese and have it rotate in time. {{.Example ` setgridsize(128, 128, 1) setcellsize(2e-9, 2e-9, 2e-9) d := 200e-9 sq := rect(d, d) // square with side d h := 50e-9 hole := cylinder(h, h) // circle with diameter h hole1 := hole.transl(100e-9, 0, 0) // translated circle #1 hole2 := hole.transl(0, -50e-9, 0) // translated circle #2 cheese:= sq.sub(hole1).sub(hole2)// subtract the circles from the square (makes holes). setgeom(cheese) msat = 600e3 aex = 12e-13 alpha = 3 // rotate the cheese. for i:=0; i<=90; i=i+30{ angle := i*pi/180 setgeom(cheese.rotz(angle)) m = uniform(cos(angle), sin(angle), 0) minimize() save(m) } `}} {{.Output}}

Regions: Space-dependent Parameters

Space-dependent parameters are defined using material regions. Regions are numbered 0-255 and represent different materials. Each cell can belong to only one region. At the start of a simulation all cells have region number 0.

Regions are defined with defregion(number, shape), where shape is explained in the geometry example.

When you're not using regions, like in the above examples, you'll probably set parameters with a simple assign:

Aex = 12e-13
Behind the scenes, this sets Aex in all regions.

It's always a good idea to output the regions quantity, as well as all your material parameters.

{{.Example ` N := 128 setgridsize(N, N, 1) c := 4e-9 setcellsize(c, c, c) // disk with different anisotropy in left and right half setgeom(circle(N*c)) defregion(1, xrange(0, inf)) // right half defregion(2, xrange(-inf, 0)) // left half save(regions) Ku1.setregion(1, .1e6) anisU.setRegion(1, vector(1, 0, 0)) Ku1.setregion(2, .2e6) anisU.setRegion(2, vector(0, 1, 0)) save(Ku1) save(anisU) Msat = 800e3 // sets it everywhere save(Msat) Aex = 12e-13 alpha = 1 m.setRegion(1, uniform(1, 1, 0)) m.setRegion(2, uniform(-1, 1, 0)) saveas(m, "m_inital") run(.1e-9) saveas(m, "m_final") `}} {{.Output}}

Slicing and dicing output

The example below illustrates how to save only the part of the output you're interested in. {{.Example ` Nx := 256 Ny := 256 Nz := 1 setgridsize(Ny, Nx, Nz) c := 4e-9 setcellsize(c, c, c) setgeom(circle(Nx*c)) Msat = 800e3 Aex = 12e-13 alpha = 1 m = vortex(1, 1) save(m) save(m.Comp(0)) save(Crop(m, 0, Nx/2, 0, Ny/2, 0, Nz)) mx := m.Comp(0) mx_center := CropY(mx, Ny/4, 3*Ny/4) save(mx_center) `}} {{.Output}}

Magnetic Force Microscopy

Mumax3 has built-in generation of MFM images from the magnetization. The MFM tip lift can be freely chosen. By default the tip magnetization is modeled as a point monopole at the apex. This is sufficient for most situations. Nevertheless, it is also possible to model partially magnetized tips by setting MFMDipole to the magnetized portion of the tip, in meters. E.g., if only the first 20nm of the tip is (vertically) magnetized, set MFMDipole=20e-9.

{{.Example ` setgridsize(256, 256, 1) setcellsize(2e-9, 2e-9, 1e-9) setgeom(rect(400e-9, 400e-9)) msat = 600e3 aex = 10e-12 m = vortex(1, 1) relax() save(m) MFMLift = 10e-9 saveas(MFM, "lift_10nm") MFMLift = 40e-9 saveas(MFM, "lift_40nm") MFMLift = 90e-9 saveas(MFM, "lift_90nm") `}} {{.Output}}

PMA Racetrack

In this example we drive a domain wall in PMA material by spin-transfer torque. We set up a post-step function that makes the simulation box "follow" the domain wall. This way, only a small number of cells is needed to simulate an infinitely long magnetic wire. {{.Example ` setGridSize(128, 128, 1) setCellSize(2e-9, 2e-9, 1e-9) Msat = 600e3 Aex = 10e-12 anisU = vector(0, 0, 1) Ku1 = 0.59e6 alpha = 0.02 Xi = 0.2 m = twoDomain(0, 0, 1, 1, 1, 0, 0, 0, -1) // up-down domains with wall between Bloch and Néel type relax() // Set post-step function that centers simulation window on domain wall. ext_centerWall(2) // keep m[2] (= m_z) close to zero // Schedule output autosave(m, 100e-12) // Run for 0.5ns with current through the sample j = vector(1.5e13, 0, 0) pol = 1 run(.5e-9) `}} {{.Output}} Since we center on the domain wall we cannot see that it is actually moving, but the domain wall breakdown is visible.

Py Racetrack

In this example we drive a vortex wall in Permalloy by spin-transfer torque. The simulation box "follows" the domain wall. By removing surface charges at the left and right ends, we mimic an infinitely long wire. {{.Example ` setGridSize(256, 64, 1) setCellSize(3e-9, 3e-9, 10e-9) Msat = 860e3 Aex = 13e-12 Xi = 0.1 alpha = 0.02 m = twodomain(1,0,0, 0,1,0, -1,0,0) notches := rect(15e-9, 15e-9).RotZ(45*pi/180).Repeat(200e-9, 64*3e-9, 0).Transl(0, 32*3e-9, 0) setGeom(notches.inverse()) // Remove surface charges from left (mx=1) and right (mx=-1) sides to mimic infinitely long wire. We have to specify the region (0) at the boundaries. BoundaryRegion := 0 MagLeft := 1 MagRight := -1 ext_rmSurfaceCharge(BoundaryRegion, MagLeft, MagRight) relax() ext_centerWall(0) // keep m[0] (m_x) close to zero // Schedule output autosave(m, 50e-12) tableadd(ext_dwpos) // domain wall position tableautosave(10e-12) // Run the simulation with current through the sample pol = 0.56 J = vector(-10e12, 0, 0) Run(0.5e-9) `}} {{.Output}} Since we center on the domain wall we cannot really see the motion, despite the vortex wall moving pretty fast. Note the absence of closure domains at the edges due to the surface charges being removed there.

Voronoi tessellation

In this example we use regions to specify grains in a material. The built-in extension ext_makegrains is used to define grain-like regions using Voronoi tessellation. We vary the material parameters in each grain. {{.Example ` N := 256 c := 4e-9 d := 40e-9 setgridsize(N, N, 1) setcellsize(c, c, d) setGeom(circle(N*c)) // define grains with region number 0-255 grainSize := 40e-9 // m randomSeed := 1234567 maxRegion := 255 ext_makegrains(grainSize, maxRegion, randomSeed) defregion(256, circle(N*c).inverse()) // region 256 is outside, not really needed alpha = 3 Kc1 = 1000 Aex = 13e-12 Msat = 860e3 // set random parameters per region for i:=0; i

RKKY

Scaling the exchange coupling between regions can be used to obtain antiferromagnetic coupling like the RKKY interaction. In that case we only model the magnetic layers and do not explicitly add a spacer layer (which is negligibly thin). We scale the exchange coupling to get the desired RKKY strength: scale = (RKKY * cellsize_z) / (2 * Aex). {{.Example ` N := 10 setgridsize(N, N, 2) c := 1e-9 setcellsize(c, c, c) defRegion(0, layer(0)) defRegion(1, layer(1)) Msat = 1e6 Aex = 10e-12 RKKY := -1e-3 // 1mJ/m2 scale := (RKKY * c) / (2 * Aex.Average()) ext_scaleExchange(0, 1, scale) tableAdd(E_total) m.setRegion(0, uniform(1, 0, 0)) for ang:=0; ang<360; ang++{ m.setRegion(1, uniform(cos(ang*pi/180), sin(ang*pi/180), 0)) t = ang * 1e-9 // output "time" is really angle tablesave() } `}} {{.Output}}

Slonczewski STT

Example of a spin-torque MRAM stack consisting of a fixed layer, spacer and free layer. Only the free layer magnetization is explicitly modeled, so we use a 2D grid. The fixed layer polarization is set with FixedLayer = ..., which can be space-dependent. The spacer layer properties are modeled by setting the parameters Lambda and EpsilonPrime. Finally Pol sets the current polarization and J the current density, which should be along z in this case. Below we switch an MRAM bit. {{.Example ` // geometry sizeX := 160e-9 sizeY := 80e-9 sizeZ := 5e-9 Nx := 64 Ny := 32 setgridsize(Nx, Ny, 1) setcellsize(sizeX/Nx, sizeY/Ny, sizeZ) setGeom(ellipse(sizeX, sizeY)) // set up free layer Msat = 800e3 Aex = 13e-12 alpha = 0.01 m = uniform(1, 0, 0) // set up spacer layer parameters lambda = 1 Pol = 0.5669 epsilonprime = 0 // set up fixed layer polarization angle := 20 px := cos(angle * pi/180) py := sin(angle * pi/180) fixedlayer = vector(px, py, 0) // send current Jtot := -0.008 // total current in A area := sizeX*sizeY*pi/4 jc := Jtot / area // current density in A/m2 J = vector(0, 0, jc) // schedule output & run autosave(m, 100e-12) tableautosave(10e-12) run(1e-9) `}} {{.Output}}

Spinning hard disk

Using the Shift function, we can shift the system (magnetization, regions and geometry) by a given number of cells. Here we use this feature to simulate a moving hard disk platter. A time-dependent gaussian field profile mimics the write field. {{.Example ` Nx := 512 Ny := 128 c := 5e-9 setgridsize(Nx, Ny, 1) setcellsize(c, c, c) ext_makegrains(30e-9, 256, 0) // PMA material Ku1 = 0.4e6 Aex = 10e-12 Msat = 600e3 alpha = 1 delta := 0.2 // anisotropy variation for i:=0; i<256; i++{ // random cubic anisotropy direction AnisU.SetRegion(i, vector(delta*(rand()-0.5), delta*(rand()-0.5), 1)) // strongly reduce exchange coupling between grains for j:=i+1; j<256; j++{ ext_scaleExchange(i, j, 0.1) } } m = uniform(0, 0, 1) // Gaussian external field profile mask := newVectorMask(Nx, Ny, 1) for i:=0; i mumax3-3.10/doc/templates/head.html000066400000000000000000000012341371432437400172110ustar00rootroot00000000000000 mumax3 mumax3-3.10/doc/templates/header.html000066400000000000000000000012031371432437400175340ustar00rootroot00000000000000
mumax3
GPU-accelerated micromagnetism

Home Download Examples API Forum


mumax3-3.10/doc/templates/index-template.html000066400000000000000000000077151371432437400212420ustar00rootroot00000000000000 {{.Include "head.html"}} {{.Include "header.html"}}

mumax3 is a GPU-accelerated micromagnetic simulation program developed at the DyNaMat group of Prof. Van Waeyenberge at Ghent University. The code is written and maintained by Arne Vansteenkiste.

A speed-up of the order of 100x compared to CPU-based simulations can easily be reached, even with relatively inexpensive gaming GPUs. Additionally, the software is optimized for low memory use and can handle about 16 million FD cells with 2GB of GPU RAM.

Citations and licence

If you use mumax in any work or publication, we kindly ask you to cite:
"The design and verification of mumax3", AIP Advances 4, 107133 (2014).

mumax3 is open-source software. You are free to modify and distribute the source code under the GPLv3 licence.

Web interface showing the spatial magnetization.

Features

  • Landau-Lifshitz micromagnetic formalism
  • Magnetostatic field
  • Heisenberg exchange
  • Arbitrary inter-region exchange like RKKY coupling
  • Dzyaloshinskii-Moriya interaction
  • Spin-transfer torque (Zhang-Li and Slonczewski)
  • Uniaxial and cubic magnetocrystalline anisotropy
  • Thermal fluctuations (Brown)
  • Voronoi tessellation
  • Time- and space dependent material parameters
  • Arbitrary complex excitation (field, current)
  • Simulation window can automatically follow a moving domain wall
  • Edge charges can be removed to simulate an infinitely long geometry
  • Optional 1D, 2D or 3D periodic boundary conditions

web GUI

mumax3 includes a browser-based user interface that lets you follow a running simulation or modify it on-the-fly, be it on your local machine or remotely.

simple scripting

mumax3 provides simple yet powerful input scripting. E.g., the example applies a time-dependent external field to a uniform magnet (FMR experiment).


setgridsize(128, 32, 4)
setcellsize(5e-9, 5e-9, 5e-9)
Msat = 860e3
Aex  = 13e-12
alpha= 0.2
m=uniform(1, 1, 0)

f := 1e9  // 1GHz
A := 0.01 // 10mT
B_ext = vector(0.1, A*sin(2*pi*f*t), 0)

run(10e-9)
Web interface can view and set parameters on-the-fly.

GPU/driver requirements

Mumax3 is cross-platform and runs on Linux, Windows and Mac platforms. You need an NVIDIA GPU with compute capability 3.0 or higher, as listed here. You also need to use NVIDIA's proprietary graphics driver, which may already be installed on your system. The benchmark below may guide your GPU choice.

Mumax3 GPU performance for 2D simulations containing 4 million cells.

mumax3-3.10/doc/tex/000077500000000000000000000000001371432437400142245ustar00rootroot00000000000000mumax3-3.10/doc/tex/.gitignore000066400000000000000000000000141371432437400162070ustar00rootroot00000000000000*.aux *.pdf mumax3-3.10/doc/tex/Makefile000066400000000000000000000002761371432437400156710ustar00rootroot00000000000000mumax3.pdf: mumax3.tex pdflatex -halt-on-error mumax3.tex pdflatex -halt-on-error mumax3.tex .PHONY: clean clean: rm -f *.aux *.bbl *.blg *.ind *.ilg *.log *.toc *.out mumax3.pdf *.idx mumax3-3.10/doc/tex/mumax3.tex000066400000000000000000000067511371432437400161710ustar00rootroot00000000000000\documentclass[12pt]{article} \usepackage{a4wide} \usepackage{amsmath} \newcommand{\vc}[1]{\ensuremath{\vec{\textbf{#1}}}} \newcommand{\ofrt}{\ensuremath{\left(\vc{r},t \right)}} \newcommand{\m}{\vc{m}} \newcommand{\M}{\vc{M}} \newcommand{\Ms}{M_\mathrm{s}} \newcommand{\B}[1]{\vc{B}_\mathrm{#1}} \newcommand{\Beff}{\B{eff}} \newcommand{\tq}[1]{\vc{\ensuremath{\tau}}\ensuremath{_\mathrm{#1}}} \newcommand{\damp}{\ensuremath{\alpha}} \newcommand{\Kern}{\vec{\vec{\textbf{K}}}} \newcommand{\FFT}{\mathcal{F}} \newcommand{\hspin}{(\vc{u}\cdot\nabla)\vc{m}} \begin{document} \pagestyle{empty} We solve: \begin{eqnarray*} \frac{\partial \m}{\partial t} &=& \gamma_0 \left( \tq{LL} + \tq{STT} \right) \end{eqnarray*} \vspace{3cm} With boundary conditions: \begin{eqnarray*} \left.\frac{\partial m_x}{\partial x}\right|_{\partial V} &=& -\frac{D}{2A}m_z \\ \left.\frac{\partial m_y}{\partial x}\right|_{\partial V} &=& 0\\ \left.\frac{\partial m_z}{\partial x}\right|_{\partial V} &=& \frac{D}{2A}m_x\\ \left.\frac{\partial m_x}{\partial y}\right|_{\partial V} &=& 0\\ \left.\frac{\partial m_y}{\partial y}\right|_{\partial V} &=& -\frac{D}{2A}m_z \\ \left.\frac{\partial m_z}{\partial y}\right|_{\partial V} &=& \frac{D}{2A}m_y\\ \left.\frac{\partial m_x}{\partial z}\right|_{\partial V} &=& 0\\ \left.\frac{\partial m_y}{\partial z}\right|_{\partial V} &=& 0\\ 
\left.\frac{\partial m_z}{\partial z}\right|_{\partial V} &=& 0\\ \end{eqnarray*} Where: \begin{eqnarray*} \tq{LL} &=& \frac{1}{1+\damp^2} \left( \m \times \Beff +\damp\left( \m \times \left( \m \times \Beff \right)\right) \right)\\ \alpha &=& \alpha\ofrt \\ \m &=& \frac{\M\ofrt}{\Ms} \\ \Ms &=& {\left| \M\ofrt\right|} \\ \Beff &=& \B{d} + \B{ex} + \B{z} + \B{a} + \B{th} \\ \B{d}\ofrt &=& \iiint_V \Kern(\vc{r} - \vc{r}') \cdot \mu_0 \M(\vc{r'}, t) \mathrm{d}^3\vc{r}' \\ & = & \FFT^{-1} \left( \FFT\left(\Kern(\vc{r})\right) \cdot \FFT\left({\mu_0\M\ofrt}\right) \right) \\ \vc{K}_i(\vc{r}) & = & \frac{1}{4\pi}\left(\frac{3(\vc{e}_i\cdot\hat{\vc{r}})\hat{\vc{r}}-\vc{e}_i}{r^3}\right) + \frac{2}{3}\vc{e}_i\delta^3(\vc{r}) \\ \B{ex} &=& \frac{2 A}{\Ms} \Delta \m + \frac{2D}{\Ms} \left(\frac{\partial m_z}{\partial x},\ \frac{\partial m_z}{\partial y},\ -\frac{\partial m_x}{\partial x}-\frac{\partial m_y}{\partial y}\right) \\ A &=& A\ofrt \\ \B{z} &=& \B{z}\ofrt \\ \B{a} &=& \B{u} + \B{c} \\ \B{u} &=& 2 K_\mathrm{u1} \left( \m \cdot \vc{u} \right) \vc{u} \\ K_\mathrm{u1} &=& K_\mathrm{u1}\ofrt\\ \vc{u} &=& \vc{u}\ofrt \\ \vc{B}_{\mathrm{c}i} &=& \left( A_{cx} c_{1i} + A_{cy} c_{2i} + A_{cz} c_{3i} \right) \\ \vc{A}_{c} &=& K_{c1} \left(a_1(a_2^2+a_3^2),\ a_2(a_1^2+a_3^2),\ a_3(a_1^2+a_2^2)\right)\\ a_i &=& \vc{c}_i \cdot \m \\ \vc{c}_1 &=& \vc{c}_1 \ofrt \\ \vc{c}_2 &=& \vc{c}_2 \ofrt \\ \vc{c}_3 &=& \vc{c}_1 \times \vc{c}_2 \\ \B{th}\ofrt &=& \eta \ofrt \sqrt{ \frac{k_B^2 \alpha T} {\mu_0\gamma_0\Ms \Delta V \Delta t }} \\ \tq{STT} &=& \tq{ZL} + \tq{SL} \\ \tq{ZL} &=& \frac{1}{1+\alpha^2} \left( \left(1+\xi\alpha\right) \m \times \left(\m \times \hspin \right) + \left(\xi-\alpha\right)\vc{m}\times \hspin \right) \\ \vc{u} &=& \frac{\mu_B \mu_0}{2 e \gamma_0 B_\mathrm{s} (1 + \xi^2)} \vc{j}\\ \vc{j} &=& \vc{j}\ofrt \\ \tq{SL} &=& \beta\epsilon (\m \times \m_P \times \m) - \beta\epsilon' \m\times \m_P \\ \beta &=& \frac{j_z \hbar}{ \Ms e d} \\ \epsilon &=& \frac{P\ofrt 
\Lambda^2}{(\Lambda^2 + 1)+ (\Lambda^2-1)(\m\cdot\m_P)} \\ \epsilon' &=& \epsilon'\ofrt\\ \end{eqnarray*} \end{document} mumax3-3.10/draw/000077500000000000000000000000001371432437400136145ustar00rootroot00000000000000mumax3-3.10/draw/Makefile000066400000000000000000000000241371432437400152500ustar00rootroot00000000000000all: go install -v mumax3-3.10/draw/arrows.go000066400000000000000000000041301371432437400154560ustar00rootroot00000000000000package draw import ( "github.com/mumax/3/data" "github.com/mumax/3/freetype/raster" "image" "image/color" "math" ) func drawArrows(img *image.RGBA, arr [3][][][]float32, sub int) { c := NewCanvas(img) Na := data.SizeOf(arr[0]) // number of arrows h := Na[Y] // orignal image height Na[X] = imax(Na[X]/sub, 1) Na[Y] = imax(Na[Y]/sub, 1) Na[Z] = 1 small := data.Downsample(arr[:], Na) S := float32(sub) for iy := 0; iy < Na[Y]; iy++ { Ay := float32(h) - (float32(iy)+0.5)*S for ix := 0; ix < Na[X]; ix++ { Ax := (float32(ix) + 0.5) * S mx := small[X][0][iy][ix] my := small[Y][0][iy][ix] mz := small[Z][0][iy][ix] c.Arrow(Ax, Ay, mx, my, mz, float32(sub)) } } c.rasterizer.Rasterize(c.RGBAPainter) c.rasterizer.Clear() } // A Canvas is used to draw on. type Canvas struct { *image.RGBA *raster.RGBAPainter rasterizer *raster.Rasterizer } // Make a new canvas of size w x h. 
func NewCanvas(img *image.RGBA) *Canvas { c := new(Canvas) c.RGBA = img c.RGBAPainter = raster.NewRGBAPainter(c.RGBA) c.rasterizer = raster.NewRasterizer(img.Bounds().Max.X, img.Bounds().Max.Y) c.rasterizer.UseNonZeroWinding = true c.SetColor(color.RGBA{0, 0, 0, 100}) return c } func (c *Canvas) Arrow(x, y, mx, my, mz, size float32) { arrlen := 0.4 * size arrw := 0.2 * size norm := float32(math.Sqrt(float64(mx*mx + my*my + mz*mz))) if norm == 0 { return } if norm > 1 { norm = 1 } theta := math.Atan2(float64(my), float64(mx)) cos := float32(math.Cos(theta)) sin := float32(math.Sin(theta)) r1 := arrlen * norm * float32(math.Cos(math.Asin(float64(mz)))) r2 := arrw * norm pt1 := pt((r1*cos)+x, -(r1*sin)+y) pt2 := pt((r2*sin-r1*cos)+x, -(-r2*cos-r1*sin)+y) pt3 := pt((-r2*sin-r1*cos)+x, -(r2*cos-r1*sin)+y) var path raster.Path path.Start(pt1) path.Add1(pt2) path.Add1(pt3) path.Add1(pt1) c.rasterizer.AddPath(path) } func pt(x, y float32) raster.Point { return raster.Point{fix32(x), fix32(y)} } func fix32(x float32) raster.Fix32 { return raster.Fix32(int(x * (1 << 8))) } func imax(a, b int) int { if a > b { return a } else { return b } } mumax3-3.10/draw/colorscale.go000066400000000000000000000024171371432437400162750ustar00rootroot00000000000000package draw import "image/color" import "fmt" type ColorMapSpec struct { Cmap []color.RGBA Ccomp int } func ColorMap(min, max, value float32, colormap ...color.RGBA) color.RGBA { // default colormap: black-white if len(colormap) < 1 { colormap = []color.RGBA{{0, 0, 0, 255}, {255, 255, 255, 255}} } // map value to interval [O,1] val := float64((value - min) / (max - min)) if val > 1 { val = 1 } if val < 0 { val = 0 } // find index of color below our value maxIndex := float64(len(colormap) - 1) index := val * maxIndex // corner case val==max: if index == maxIndex { index-- } // get two neighboring colors i := int(index) if i < 0 { i = 0 } if i >= len(colormap)-1 { i = len(colormap) - 2 } c1 := colormap[i] c2 := colormap[i+1] // 
location between two neighboring colors [0..1] x := (val - float64(i)/maxIndex) * maxIndex if x < 0 || x > 1 { panic(fmt.Sprint("x=", x)) } // interpolate between colors r := (1-x)*float64(c1.R) + x*float64(c2.R) g := (1-x)*float64(c1.G) + x*float64(c2.G) b := (1-x)*float64(c1.B) + x*float64(c2.B) a := (1-x)*float64(c1.A) + x*float64(c2.A) return color.RGBA{bte(r), bte(g), bte(b), bte(a)} } func bte(x float64) uint8 { if x < 0 { return 0 } if x > 255 { return 255 } return uint8(x) } mumax3-3.10/draw/doc.go000066400000000000000000000000551371432437400147100ustar00rootroot00000000000000// 2D rendering of data slices. package draw mumax3-3.10/draw/encode.go000066400000000000000000000031131371432437400153760ustar00rootroot00000000000000package draw import ( "bufio" "fmt" "github.com/mumax/3/data" "image" "image/gif" "image/jpeg" "image/png" "io" "os" "path" "strings" ) func RenderFile(fname string, f *data.Slice, min, max string, arrowSize int, colormap ...ColorMapSpec) error { out, err := os.Create(fname) if err != nil { return err } defer out.Close() return RenderFormat(out, f, min, max, arrowSize, fname, colormap...) } func RenderFormat(out io.Writer, f *data.Slice, min, max string, arrowSize int, format string, colormap ...ColorMapSpec) error { var codecs = map[string]codec{".png": PNG, ".jpg": JPEG100, ".gif": GIF256} ext := strings.ToLower(path.Ext(format)) enc := codecs[ext] if enc == nil { return fmt.Errorf("render: unhandled image type: " + ext) } return Render(out, f, min, max, arrowSize, enc, colormap...) } // encodes an image type codec func(io.Writer, image.Image) error // Render data and encode with arbitrary codec. func Render(out io.Writer, f *data.Slice, min, max string, arrowSize int, encode codec, colormap ...ColorMapSpec) error { img := Image(f, min, max, arrowSize, colormap...) 
buf := bufio.NewWriter(out) defer buf.Flush() return encode(buf, img) } // full-quality jpeg codec, passable to Render() func JPEG100(w io.Writer, img image.Image) error { return jpeg.Encode(w, img, &jpeg.Options{100}) } // full quality gif coded, passable to Render() func GIF256(w io.Writer, img image.Image) error { return gif.Encode(w, img, &gif.Options{256, nil, nil}) } // png codec, passable to Render() func PNG(w io.Writer, img image.Image) error { return png.Encode(w, img) } mumax3-3.10/draw/hslscale.go000066400000000000000000000024601371432437400157430ustar00rootroot00000000000000package draw import ( "image/color" "math" ) // Colormap for 3D vector data. func HSLMap(x, y, z float32) color.RGBA { s := sqrtf(x*x + y*y + z*z) l := 0.5*z + 0.5 h := float32(math.Atan2(float64(y), float64(x))) return HSLtoRGB(h, s, l) } // h = 0..2pi, s=0..1, l=0..1 func HSLtoRGB(h, s, l float32) color.RGBA { if s > 1 { s = 1 } if l > 1 { l = 1 } h = h * (180.0 / math.Pi / 60.0) for h < 0 { h += 6 } for h >= 6 { h -= 6 } var c float32 // chroma if l <= 0.5 { c = 2 * l * s } else { c = (2 - 2*l) * s } x := c * (1 - abs(fmod(h, 2)-1)) var r, g, b float32 switch { case 0 <= h && h < 1: r, g, b = c, x, 0. case 1 <= h && h < 2: r, g, b = x, c, 0. 
case 2 <= h && h < 3: r, g, b = 0., c, x case 3 <= h && h < 4: r, g, b = 0, x, c case 4 <= h && h < 5: r, g, b = x, 0., c case 5 <= h && h < 6: r, g, b = c, 0., x } m := l - 0.5*c r, g, b = r+m, g+m, b+m R, G, B := uint8(255*r), uint8(255*g), uint8(255*b) return color.RGBA{R, G, B, 255} } // modulo func fmod(number, mod float32) float32 { for number < mod { number += mod } for number >= mod { number -= mod } return number } func abs(number float32) float32 { if number < 0 { return -number } // else return number } func sqrtf(x float32) float32 { return float32(math.Sqrt(float64(x))) } mumax3-3.10/draw/image.go000066400000000000000000000064131371432437400152310ustar00rootroot00000000000000package draw import ( "github.com/mumax/3/data" "github.com/mumax/3/util" "image" "image/color" "log" "strconv" ) // Renders an image of slice. fmin, fmax = "auto" or a number to set the min/max color scale. func Image(f *data.Slice, fmin, fmax string, arrowSize int, colormap ...ColorMapSpec) *image.RGBA { img := new(image.RGBA) On(img, f, fmin, fmax, arrowSize, colormap...) return img } // Render on existing image buffer. Resize it if needed func On(img *image.RGBA, f *data.Slice, fmin, fmax string, arrowSize int, colormap ...ColorMapSpec) { dim := f.NComp() switch dim { default: log.Fatalf("unsupported number of components: %v", dim) case 3: if colormap == nil { drawVectors(img, f.Vectors(), arrowSize) break } if colormap[0].Ccomp >= 0 { ff := f.Comp(colormap[0].Ccomp) min, max := parseMinMax(ff, fmin, fmax) drawFloats(img, ff.Scalars(), min, max, colormap[0].Cmap...) if arrowSize > 0 { drawArrows(img, f.Vectors(), arrowSize) } } else { drawVectors(img, f.Vectors(), arrowSize) } case 1: min, max := parseMinMax(f, fmin, fmax) if colormap == nil { drawFloats(img, f.Scalars(), min, max) } else { drawFloats(img, f.Scalars(), min, max, colormap[0].Cmap...) 
} } } func parseMinMax(f *data.Slice, fmin, fmax string) (min, max float32) { min, max = extrema(f.Host()[0]) if fmin != "auto" { m, err := strconv.ParseFloat(fmin, 32) if err != nil { util.Fatal("draw: scale:", err) } min = float32(m) } if fmax != "auto" { m, err := strconv.ParseFloat(fmax, 32) if err != nil { util.Fatal("draw: scale:", err) } max = float32(m) } if min == max { min -= 1 max += 1 // make it gray instead of black } return } // Draws rank 4 tensor (3D vector field) as image // averages data over X (usually thickness of thin film) func drawVectors(img *image.RGBA, arr [3][][][]float32, arrowSize int) { w, h := len(arr[X][0][0]), len(arr[X][0]) d := len(arr[X]) norm := float32(d) *img = *recycle(img, w, h) for iy := 0; iy < h; iy++ { for ix := 0; ix < w; ix++ { var x, y, z float32 = 0., 0., 0. for iz := 0; iz < d; iz++ { x += arr[0][iz][iy][ix] y += arr[1][iz][iy][ix] z += arr[2][iz][iy][ix] } x /= norm y /= norm z /= norm img.Set(ix, (h-1)-iy, HSLMap(x, y, z)) } } if arrowSize > 0 { drawArrows(img, arr, arrowSize) } } func extrema(data []float32) (min, max float32) { min = data[0] max = data[0] for _, d := range data { if d < min { min = d } if d > max { max = d } } return } // Draws rank 3 tensor (3D scalar field) as image // averages data over X (usually thickness of thin film) func drawFloats(img *image.RGBA, arr [][][]float32, min, max float32, colormap ...color.RGBA) { w, h := len(arr[0][0]), len(arr[0]) d := len(arr) *img = *recycle(img, w, h) for iy := 0; iy < h; iy++ { for ix := 0; ix < w; ix++ { var v float32 = 0. 
for iz := 0; iz < d; iz++ { v += arr[iz][iy][ix] } v /= float32(d) img.Set(ix, (h-1)-iy, ColorMap(min, max, v, colormap...)) } } } // recycle image if it has right size func recycle(img *image.RGBA, w, h int) *image.RGBA { if img == nil || img.Bounds().Size().X != w || img.Bounds().Size().Y != h { img = image.NewRGBA(image.Rect(0, 0, w, h)) } return img } const ( X = 0 Y = 1 Z = 2 ) mumax3-3.10/draw/svg.go000066400000000000000000000024211371432437400147410ustar00rootroot00000000000000package draw import ( "fmt" "github.com/mumax/3/svgo" "io" "math" ) // Renders svg image of vector data. func SVG(out io.Writer, arr [3][][][]float32) { h, w := len(arr[0][0]), len(arr[0][0][0]) const ( r1 = 1. / 2. // arrow half length r2 = 1. / 4. // arrow half width ) canvas := svg.New(out) canvas.Start(w, h) for slice := 0; slice < len(arr[0]); slice++ { Mx := arr[X][slice] My := arr[Y][slice] Mz := arr[Z][slice] for i := 0; i < h; i++ { y := float64(h) - (float64(i) + 1./2.) for j := 0; j < w; j++ { x := float64(j) + 1./2. 
mx := Mx[i][j] my := My[i][j] mz := Mz[i][j] // skip zero-length vectors if mx*mx+my*my+mz*mz == 0 { continue } theta := math.Atan2(float64(my), float64(mx)) c := math.Cos(theta) s := math.Sin(theta) r1 := r1 * math.Cos(math.Asin(float64(mz))) xs := []float64{(r1 * c) + x, (r2*s - r1*c) + x, (-r2*s - r1*c) + x} ys := []float64{-(r1 * s) + y, -(-r2*c - r1*s) + y, -(r2*c - r1*s) + y} col := HSLMap(mx, my, mz) style := "fill:#" + hex(col.R) + hex(col.G) + hex(col.B) canvas.Polygon(xs, ys, style) } } } canvas.End() } func hex(i uint8) string { j := int(i) - 32 // make it a bit darker if j < 0 { j = 0 } return fmt.Sprintf("%02X", j) } mumax3-3.10/dump/000077500000000000000000000000001371432437400136245ustar00rootroot00000000000000mumax3-3.10/dump/Makefile000066400000000000000000000000241371432437400152600ustar00rootroot00000000000000all: go install -v mumax3-3.10/dump/read.go000066400000000000000000000064071371432437400150750ustar00rootroot00000000000000// legacy dump data format. package dump import ( "fmt" "github.com/mumax/3/data" "github.com/mumax/3/util" "hash" "hash/crc64" "io" "math" "os" "unsafe" ) func Read(in io.Reader) (*data.Slice, data.Meta, error) { r := newReader(in) return r.readSlice() } func ReadFile(fname string) (*data.Slice, data.Meta, error) { f, err := os.Open(fname) if err != nil { return nil, data.Meta{}, err } defer f.Close() return Read(f) } func MustReadFile(fname string) (*data.Slice, data.Meta) { s, t, err := ReadFile(fname) util.FatalErr(err) return s, t } // Reads successive data frames in dump format. 
type reader struct { in io.Reader crc hash.Hash64 err error } func newReader(in io.Reader) *reader { r := new(reader) r.in = in r.crc = crc64.New(table) return r } func (r *reader) readSlice() (s *data.Slice, info data.Meta, err error) { r.err = nil // clear previous error, if any magic := r.readString() if r.err != nil { return nil, data.Meta{}, r.err } if magic != MAGIC { r.err = fmt.Errorf("dump: bad magic number:%v", magic) return nil, data.Meta{}, r.err } nComp := r.readInt() size := [3]int{} size[2] = r.readInt() // backwards compatible coordinates! size[1] = r.readInt() size[0] = r.readInt() cell := [3]float64{} cell[2] = r.readFloat64() cell[1] = r.readFloat64() cell[0] = r.readFloat64() info.CellSize = cell info.MeshUnit = r.readString() info.Time = r.readFloat64() _ = r.readString() // time unit s = data.NewSlice(nComp, size) info.Name = r.readString() info.Unit = r.readString() precission := r.readUint64() util.AssertMsg(precission == 4, "only single precission supported") if r.err != nil { return } host := s.Tensors() ncomp := s.NComp() for c := 0; c < ncomp; c++ { for iz := 0; iz < size[2]; iz++ { for iy := 0; iy < size[1]; iy++ { for ix := 0; ix < size[0]; ix++ { host[c][iz][iy][ix] = r.readFloat32() } } } } // Check CRC var mycrc uint64 // checksum by this reader if r.crc != nil { mycrc = r.crc.Sum64() } storedcrc := r.readUint64() // checksum from data stream. 
0 means not set if r.err != nil { return nil, data.Meta{}, r.err } if r.crc != nil { r.crc.Reset() // reset for next frame } if r.crc != nil && storedcrc != 0 && mycrc != storedcrc { r.err = fmt.Errorf("dump CRC error: expected %16x, got %16x", storedcrc, mycrc) return nil, data.Meta{}, r.err } return s, info, nil } func (r *reader) readInt() int { x := r.readUint64() if uint64(int(x)) != x { r.err = fmt.Errorf("value overflows int: %v", x) } return int(x) } // read until the buffer is full func (r *reader) read(buf []byte) { _, err := io.ReadFull(r.in, buf[:]) if err != nil { r.err = err } if r.crc != nil { r.crc.Write(buf) } } // read a maximum 8-byte string func (r *reader) readString() string { var buf [8]byte r.read(buf[:]) // trim trailing NULs. i := 0 for i = 0; i < len(buf); i++ { if buf[i] == 0 { break } } return string(buf[:i]) } func (r *reader) readFloat64() float64 { return math.Float64frombits(r.readUint64()) } func (r *reader) readUint64() uint64 { var buf [8]byte r.read(buf[:]) return *((*uint64)(unsafe.Pointer(&buf[0]))) } func (r *reader) readFloat32() float32 { var buf [4]byte r.read(buf[:]) return *((*float32)(unsafe.Pointer(&buf[0]))) } mumax3-3.10/dump/write.go000066400000000000000000000054161371432437400153130ustar00rootroot00000000000000package dump import ( "bufio" "github.com/mumax/3/data" "github.com/mumax/3/util" "hash" "hash/crc64" "io" "math" "os" "unsafe" ) // Write the slice to out in binary format. Add time stamp. func Write(out io.Writer, s *data.Slice, info data.Meta) error { w := newWriter(out) // Writes the header. w.writeString(MAGIC) w.writeUInt64(uint64(s.NComp())) size := s.Size() w.writeUInt64(uint64(size[2])) // backwards compatible coordinates! 
w.writeUInt64(uint64(size[1])) w.writeUInt64(uint64(size[0])) cell := info.CellSize w.writeFloat64(cell[2]) w.writeFloat64(cell[1]) w.writeFloat64(cell[0]) w.writeString(info.MeshUnit) w.writeFloat64(info.Time) w.writeString("s") // time unit w.writeString(info.Name) w.writeString(info.Unit) w.writeUInt64(4) // precission // return header write error before writing data if w.err != nil { return w.err } w.writeData(s) w.writeHash() return w.err } // Write the slice to file in binary format. Add time stamp. func WriteFile(fname string, s *data.Slice, info data.Meta) error { f, err := os.OpenFile(fname, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666) if err != nil { return err } defer f.Close() w := bufio.NewWriter(f) defer w.Flush() return Write(w, s, info) } // Write the slice to file in binary format, panic on error. func MustWriteFile(fname string, s *data.Slice, info data.Meta) { err := WriteFile(fname, s, info) util.FatalErr(err) } var table = crc64.MakeTable(crc64.ISO) type writer struct { out io.Writer crc hash.Hash64 err error } func newWriter(out io.Writer) *writer { w := new(writer) w.crc = crc64.New(table) w.out = io.MultiWriter(w.crc, out) return w } const MAGIC = "#dump002" // identifies dump format // Writes the data. func (w *writer) writeData(array *data.Slice) { data := array.Tensors() size := array.Size() ncomp := array.NComp() for c := 0; c < ncomp; c++ { for iz := 0; iz < size[2]; iz++ { for iy := 0; iy < size[1]; iy++ { for ix := 0; ix < size[0]; ix++ { w.writeFloat32(data[c][iz][iy][ix]) } } } } } // Writes the accumulated hash of this frame, closing the frame. 
func (w *writer) writeHash() { w.writeUInt64(w.crc.Sum64()) w.crc.Reset() } func (w *writer) count(n int, err error) { if err != nil && w.err == nil { w.err = err } } func (w *writer) writeFloat32(x float32) { var bytes []byte bytes = (*[4]byte)(unsafe.Pointer(&x))[:] w.count(w.out.Write(bytes)) } func (w *writer) writeFloat64(x float64) { w.writeUInt64(math.Float64bits(x)) } func (w *writer) writeString(x string) { var buf [8]byte copy(buf[:], x) w.count(w.out.Write(buf[:])) } func (w *writer) writeUInt64(x uint64) { w.count(w.out.Write((*(*[8]byte)(unsafe.Pointer(&x)))[:8])) } // product of elements. func prod(size [3]int) int { return size[0] * size[1] * size[2] } mumax3-3.10/engine/000077500000000000000000000000001371432437400141245ustar00rootroot00000000000000mumax3-3.10/engine/Makefile000066400000000000000000000000241371432437400155600ustar00rootroot00000000000000all: go install -v mumax3-3.10/engine/anisotropy.go000066400000000000000000000072721371432437400166720ustar00rootroot00000000000000package engine // Magnetocrystalline anisotropy. 
import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" ) // Anisotropy variables var ( Ku1 = NewScalarParam("Ku1", "J/m3", "1st order uniaxial anisotropy constant") Ku2 = NewScalarParam("Ku2", "J/m3", "2nd order uniaxial anisotropy constant") Kc1 = NewScalarParam("Kc1", "J/m3", "1st order cubic anisotropy constant") Kc2 = NewScalarParam("Kc2", "J/m3", "2nd order cubic anisotropy constant") Kc3 = NewScalarParam("Kc3", "J/m3", "3rd order cubic anisotropy constant") AnisU = NewVectorParam("anisU", "", "Uniaxial anisotropy direction") AnisC1 = NewVectorParam("anisC1", "", "Cubic anisotropy direction #1") AnisC2 = NewVectorParam("anisC2", "", "Cubic anisotorpy directon #2") B_anis = NewVectorField("B_anis", "T", "Anisotropy field", AddAnisotropyField) Edens_anis = NewScalarField("Edens_anis", "J/m3", "Anisotropy energy density", AddAnisotropyEnergyDensity) E_anis = NewScalarValue("E_anis", "J", "total anisotropy energy", GetAnisotropyEnergy) ) var ( sZero = NewScalarParam("_zero", "", "utility zero parameter") ) func init() { registerEnergy(GetAnisotropyEnergy, AddAnisotropyEnergyDensity) } func addUniaxialAnisotropyFrom(dst *data.Slice, M magnetization, Msat, Ku1, Ku2 *RegionwiseScalar, AnisU *RegionwiseVector) { if Ku1.nonZero() || Ku2.nonZero() { ms := Msat.MSlice() defer ms.Recycle() ku1 := Ku1.MSlice() defer ku1.Recycle() ku2 := Ku2.MSlice() defer ku2.Recycle() u := AnisU.MSlice() defer u.Recycle() cuda.AddUniaxialAnisotropy2(dst, M.Buffer(), ms, ku1, ku2, u) } } func addCubicAnisotropyFrom(dst *data.Slice, M magnetization, Msat, Kc1, Kc2, Kc3 *RegionwiseScalar, AnisC1, AnisC2 *RegionwiseVector) { if Kc1.nonZero() || Kc2.nonZero() || Kc3.nonZero() { ms := Msat.MSlice() defer ms.Recycle() kc1 := Kc1.MSlice() defer kc1.Recycle() kc2 := Kc2.MSlice() defer kc2.Recycle() kc3 := Kc3.MSlice() defer kc3.Recycle() c1 := AnisC1.MSlice() defer c1.Recycle() c2 := AnisC2.MSlice() defer c2.Recycle() cuda.AddCubicAnisotropy2(dst, M.Buffer(), ms, kc1, kc2, kc3, c1, c2) } } 
// Add the anisotropy field to dst func AddAnisotropyField(dst *data.Slice) { addUniaxialAnisotropyFrom(dst, M, Msat, Ku1, Ku2, AnisU) addCubicAnisotropyFrom(dst, M, Msat, Kc1, Kc2, Kc3, AnisC1, AnisC2) } // Add the anisotropy energy density to dst func AddAnisotropyEnergyDensity(dst *data.Slice) { haveUnixial := Ku1.nonZero() || Ku2.nonZero() haveCubic := Kc1.nonZero() || Kc2.nonZero() || Kc3.nonZero() if !haveUnixial && !haveCubic { return } buf := cuda.Buffer(B_anis.NComp(), Mesh().Size()) defer cuda.Recycle(buf) // unnormalized magnetization: Mf := ValueOf(M_full) defer cuda.Recycle(Mf) if haveUnixial { // 1st cuda.Zero(buf) addUniaxialAnisotropyFrom(buf, M, Msat, Ku1, sZero, AnisU) cuda.AddDotProduct(dst, -1./2., buf, Mf) // 2nd cuda.Zero(buf) addUniaxialAnisotropyFrom(buf, M, Msat, sZero, Ku2, AnisU) cuda.AddDotProduct(dst, -1./4., buf, Mf) } if haveCubic { // 1st cuda.Zero(buf) addCubicAnisotropyFrom(buf, M, Msat, Kc1, sZero, sZero, AnisC1, AnisC2) cuda.AddDotProduct(dst, -1./4., buf, Mf) // 2nd cuda.Zero(buf) addCubicAnisotropyFrom(buf, M, Msat, sZero, Kc2, sZero, AnisC1, AnisC2) cuda.AddDotProduct(dst, -1./6., buf, Mf) // 3nd cuda.Zero(buf) addCubicAnisotropyFrom(buf, M, Msat, sZero, sZero, Kc3, AnisC1, AnisC2) cuda.AddDotProduct(dst, -1./8., buf, Mf) } } // Returns anisotropy energy in joules. func GetAnisotropyEnergy() float64 { buf := cuda.Buffer(1, Mesh().Size()) defer cuda.Recycle(buf) cuda.Zero(buf) AddAnisotropyEnergyDensity(buf) return cellVolume() * float64(cuda.Sum(buf)) } mumax3-3.10/engine/asyncio.go000066400000000000000000000022621371432437400161220ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/timer" "github.com/mumax/3/util" "time" ) // Asynchronous I/O queue flushes data to disk while simulation keeps running. 
// See save.go, autosave.go var ( saveQue chan func() // passes save requests to runSaver for asyc IO queLen util.Atom // # tasks in queue ) const maxOutputQueLen = 16 // number of outputs that can be queued for asynchronous I/O. func init() { DeclFunc("Flush", drainOutput, "Flush all pending output to disk.") saveQue = make(chan func()) go runSaver() } func queOutput(f func()) { if cuda.Synchronous { timer.Start("io") } queLen.Add(1) saveQue <- f if cuda.Synchronous { timer.Stop("io") } } // Continuously executes tasks the from SaveQue channel. func runSaver() { for f := range saveQue { f() queLen.Add(-1) } } // Finalizer function called upon program exit. // Waits until all asynchronous output has been saved. func drainOutput() { if saveQue == nil { return } for queLen.Load() > 0 { select { default: time.Sleep(1 * time.Millisecond) // other goroutine has the last job, wait for it to finish case f := <-saveQue: f() queLen.Add(-1) } } } mumax3-3.10/engine/autosave.go000066400000000000000000000036141371432437400163060ustar00rootroot00000000000000package engine // Bookkeeping for auto-saving quantities at given intervals. import "fmt" var ( output = make(map[Quantity]*autosave) // when to save quantities autonum = make(map[string]int) // auto number for out file ) func init() { DeclFunc("AutoSave", AutoSave, "Auto save space-dependent quantity every period (s).") DeclFunc("AutoSnapshot", AutoSnapshot, "Auto save image of quantity every period (s).") } // Periodically called by run loop to save everything that's needed at this time. func DoOutput() { for q, a := range output { if a.needSave() { a.save(q) a.count++ } } if Table.needSave() { Table.Save() } } // Register quant to be auto-saved every period. // period == 0 stops autosaving. func AutoSave(q Quantity, period float64) { autoSave(q, period, Save) } // Register quant to be auto-saved as image, every period. 
func AutoSnapshot(q Quantity, period float64) { autoSave(q, period, Snapshot) } // register save(q) to be called every period func autoSave(q Quantity, period float64, save func(Quantity)) { if period == 0 { delete(output, q) } else { output[q] = &autosave{period, Time, -1, save} // init count to -1 allows save at t=0 } } // generate auto file name based on save count and FilenameFormat. E.g.: // m000001.ovf func autoFname(name string, format OutputFormat, num int) string { return fmt.Sprintf(OD()+FilenameFormat+"."+StringFromOutputFormat[format], name, num) } // keeps info needed to decide when a quantity needs to be periodically saved type autosave struct { period float64 // How often to save start float64 // Starting point count int // Number of times it has been autosaved save func(Quantity) // called to do the actual save } // returns true when the time is right to save. func (a *autosave) needSave() bool { t := Time - a.start return a.period != 0 && t-float64(a.count)*a.period >= a.period } mumax3-3.10/engine/average.go000066400000000000000000000021661371432437400160720ustar00rootroot00000000000000package engine // Averaging of quantities over entire universe or just magnet. 
import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" ) // average of quantity over universe func qAverageUniverse(q Quantity) []float64 { s := ValueOf(q) defer cuda.Recycle(s) return sAverageUniverse(s) } // average of slice over universe func sAverageUniverse(s *data.Slice) []float64 { nCell := float64(prod(s.Size())) avg := make([]float64, s.NComp()) for i := range avg { avg[i] = float64(cuda.Sum(s.Comp(i))) / nCell checkNaN1(avg[i]) } return avg } // average of slice over the magnet volume func sAverageMagnet(s *data.Slice) []float64 { if geometry.Gpu().IsNil() { return sAverageUniverse(s) } else { avg := make([]float64, s.NComp()) for i := range avg { avg[i] = float64(cuda.Dot(s.Comp(i), geometry.Gpu())) / magnetNCell() checkNaN1(avg[i]) } return avg } } // number of cells in the magnet. // not necessarily integer as cells can have fractional volume. func magnetNCell() float64 { if geometry.Gpu().IsNil() { return float64(Mesh().NCell()) } else { return float64(cuda.Sum(geometry.Gpu())) } } mumax3-3.10/engine/backwardeuler.go000066400000000000000000000025001371432437400172630ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/util" ) // Implicit midpoint solver. type BackwardEuler struct { dy1 *data.Slice } // Euler method, can be used as solver.Step. 
func (s *BackwardEuler) Step() { util.AssertMsg(MaxErr > 0, "Backward euler solver requires MaxErr > 0") t0 := Time y := M.Buffer() y0 := cuda.Buffer(VECTOR, y.Size()) defer cuda.Recycle(y0) data.Copy(y0, y) dy0 := cuda.Buffer(VECTOR, y.Size()) defer cuda.Recycle(dy0) if s.dy1 == nil { s.dy1 = cuda.Buffer(VECTOR, y.Size()) } dy1 := s.dy1 Dt_si = FixDt dt := float32(Dt_si * GammaLL) util.AssertMsg(dt > 0, "Backward Euler solver requires fixed time step > 0") // Fist guess Time = t0 + 0.5*Dt_si // 0.5 dt makes it implicit midpoint method // with temperature, previous torque cannot be used as predictor if Temp.isZero() { cuda.Madd2(y, y0, dy1, 1, dt) // predictor euler step with previous torque M.normalize() } torqueFn(dy0) cuda.Madd2(y, y0, dy0, 1, dt) // y = y0 + dt * dy M.normalize() // One iteration torqueFn(dy1) cuda.Madd2(y, y0, dy1, 1, dt) // y = y0 + dt * dy1 M.normalize() Time = t0 + Dt_si err := cuda.MaxVecDiff(dy0, dy1) * float64(dt) NSteps++ setLastErr(err) setMaxTorque(dy1) } func (s *BackwardEuler) Free() { s.dy1.Free() s.dy1 = nil } mumax3-3.10/engine/bib.go000066400000000000000000000140761371432437400152170ustar00rootroot00000000000000package engine import ( "io" "github.com/mumax/3/httpfs" "github.com/mumax/3/util" ) const separationline = ` --------------------------------------------------------------------------- ` const bibheader = ` This bibtex file is automatically generated by Mumax3. The following list are references relevant for your simulation. 
If you use the results of these simulations in any work or publication, we kindly ask you to cite them.`

var (
	bibfile io.WriteCloser       // the references.bib output; nil until initBib has run
	library map[string]*bibEntry // all known references, keyed by tag
)

func init() {
	buildLibrary()
}

// initBib creates references.bib under OD(), writes the header and
// registers the main Mumax3 paper. It panics when called twice.
func initBib() { // inited in engine.InitIO
	if bibfile != nil {
		panic("bib already initialized")
	}
	var err error
	bibfile, err = httpfs.Create(OD() + "references.bib")
	if err != nil {
		panic(err)
	}
	util.FatalErr(err)
	fprintln(bibfile, bibheader)
	fprintln(bibfile, separationline)
	Refer("vansteenkiste2014") // Make sure that Mumax3 is always referenced
}

// bibEntry is one citable reference.
type bibEntry struct {
	reason   string // why the reference is relevant, written to the bib file
	bibtex   string // the BibTeX record
	shortref string // one-line citation for the log
	used     bool   // set once Refer has been called for this entry
}

// Refer marks the reference with the given tag as used and, if the bib file
// is open, appends its justification and BibTeX record to it.
// Unknown tags and tags that were already referred to are ignored.
func Refer(tag string) {
	bibentry, inLibrary := library[tag]
	// Test inLibrary first: for an unknown tag bibentry is nil
	// and reading bibentry.used would panic.
	if !inLibrary || bibentry.used {
		return
	}
	bibentry.used = true
	if bibfile != nil {
		fprintln(bibfile, bibentry.reason)
		fprintln(bibfile, bibentry.bibtex)
		fprintln(bibfile, separationline)
	}
}

// areRefsUsed reports whether at least one reference has been used.
func areRefsUsed() bool {
	for _, bibentry := range library {
		if bibentry.used {
			return true
		}
	}
	return false
}

// LogUsedRefs logs the short citation of every used reference.
// Nothing is logged when no reference was used.
func LogUsedRefs() {
	if !areRefsUsed() {
		return
	}
	LogOut("********************************************************************//")
	LogOut("Please cite the following references, relevant for your simulation. //")
	LogOut("See bibtex file in output folder for justification. //")
	LogOut("********************************************************************//")
	for _, bibentry := range library {
		if bibentry.used {
			LogOut(" * " + bibentry.shortref)
		}
	}
}

func buildLibrary() {

	library = make(map[string]*bibEntry)

	library["vansteenkiste2014"] = &bibEntry{
		reason:   "Main paper about Mumax3",
		shortref: "Vansteenkiste et al., AIP Adv. 
4, 107133 (2014).", bibtex: ` @article{Vansteenkiste2014, author = {Vansteenkiste, Arne and Leliaert, Jonathan and Dvornik, Mykola and Helsen, Mathias and Garcia-Sanchez, Felipe and {Van Waeyenberge}, Bartel}, title = {{The design and verification of Mumax3}}, journal = {AIP Advances}, number = {10}, pages = {107133}, volume = {4}, year = {2014}, doi = {10.1063/1.4899186}, url = {http://doi.org/10.1063/1.4899186} }`} library["exl2014"] = &bibEntry{ reason: "Mumax3 uses Exl's minimizer", shortref: "Exl et al., J. Appl. Phys. 115, 17D118 (2014).", bibtex: ` @article{Exl2014, author = {Exl, Lukas and Bance, Simon and Reichel, Franz and Schrefl, Thomas and {Peter Stimming}, Hans and Mauser, Norbert J.}, title = {{LaBonte's method revisited: An effective steepest descent method for micromagnetic energy minimization}}, journal = {Journal of Applied Physics}, number = {17}, pages = {17D118}, volume = {115}, year = {2014}, doi = {10.1063/1.4862839}, url = {http://doi.org/10.1063/1.4862839} }`} library["Lel2014"] = &bibEntry{ reason: "Mumax3 used function ext_makegrains", shortref: "Leliaert et al., J. Appl. Phys. 115, 233903 (2014)", bibtex: ` @article{Lel2014, author = {Leliaert, Jonathan and Van de Wiele, Ben and Vansteenkiste, Arne and Laurson, Lasse and Durin, Gianfranco and Dupr{\'e}, Luc and Van Waeyenberge, Bartel}, title = {{Current-driven domain wall mobility in polycrystalline permalloy nanowires: A numerical study}}, journal = {Journal of Applied Physics}, volume = {115}, number = {23}, pages = {233903}, year = {2014}, doi = {10.1063/1.4883297}, url = {http://dx.doi.org/10.1063/1.4883297} }`} library["mulkers2017"] = &bibEntry{ reason: "Simulated system has interfacially induced DMI", shortref: "Mulkers et al., Phys. Rev. 
B 95, 144401 (2017).", bibtex: ` @article{Mulkers2017, author = {Mulkers, Jeroen and Van Waeyenberge, Bartel and Milo{\v{s}}evi{\'{c}}, Milorad V.}, title = {{Effects of spatially-engineered Dzyaloshinskii-Moriya interaction in ferromagnetic films}}, journal = {Physical Review B}, number = {14}, pages = {144401}, volume = {95}, year = {2017}, doi = {10.1103/PhysRevB.95.144401}, url = {doi.org/10.1103/PhysRevB.95.144401}, }`} library["leliaert2017"] = &bibEntry{ reason: "Simulated nonzero temperatures with adaptive time steps", shortref: "Leliaert et al., AIP Adv. 7, 125010 (2017).", bibtex: ` @article{Leliaert2017, author = {Leliaert, Jonathan and Mulkers, Jeroen and De Clercq, Jonas and Coene, Annelies and Dvornik, Mykola and Van Waeyenberge, Bartel}, title = {{Adaptively time stepping the stochastic Landau-Lifshitz-Gilbert equation at nonzero temperature: implementation and validation in MuMax$^3$}}, journal = {AIP Advances}, number = {12}, pages = {125010}, volume = {7}, year = {2017}, doi = {doi.org/10.1063/1.5003957}, url = {http://aip.scitation.org/doi/10.1063/1.5003957}, }`} library["Berg1981"] = &bibEntry{ reason: "Computed the topological charge using the formula of Berg and Lüscher", shortref: "Berg et al., Nucl. Phys. 
B 190, 412–24 (1981)",
		bibtex: ` @article{Berg1981, author = {Berg, Bernd A Lüscher, Martin}, title = {{Definition and statistical distributions of a topological number in the lattice O(3) $\sigma$-model}}, journal = {Nuclear Physics B}, pages = {412-424}, volume = {190}, year = {1981}, doi = {doi.org/10.1016/0550-3213(81)90568-X}, url = {https://doi.org/10.1016/0550-3213(81)90568-X}, }`}
}
mumax3-3.10/engine/comp.go000066400000000000000000000020761371432437400154160ustar00rootroot00000000000000package engine

// Comp is a Derived Quantity pointing to a single component of vector Quantity

import (
	"fmt"
	"github.com/mumax/3/cuda"
	"github.com/mumax/3/data"
	"github.com/mumax/3/util"
)

// component is a scalar view on one component of a parent Quantity.
type component struct {
	parent Quantity
	comp   int // index of the selected component in parent
}

// Comp returns vector component c of the parent Quantity
func Comp(parent Quantity, c int) ScalarField {
	util.Argument(c >= 0 && c < parent.NComp())
	return AsScalarField(&component{parent, c})
}

func (q *component) NComp() int       { return 1 }
func (q *component) Name() string     { return fmt.Sprint(NameOf(q.parent), "_", compname[q.comp]) }
func (q *component) Unit() string     { return UnitOf(q.parent) }
func (q *component) Mesh() *data.Mesh { return MeshOf(q.parent) }

// Slice evaluates the parent and returns the selected component in a fresh
// single-component buffer obtained from cuda.Buffer.
// The second result is always true.
// NOTE(review): true presumably tells the caller to recycle the buffer — confirm against the callers.
func (q *component) Slice() (*data.Slice, bool) {
	src := ValueOf(q.parent)
	defer cuda.Recycle(src)
	c := cuda.Buffer(1, src.Size())
	// fill the buffer: without this copy the caller would get uninitialized memory
	data.Copy(c, src.Comp(q.comp))
	return c, true
}

// EvalTo evaluates the parent and copies the selected component into dst.
func (q *component) EvalTo(dst *data.Slice) {
	src := ValueOf(q.parent)
	defer cuda.Recycle(src)
	data.Copy(dst, src.Comp(q.comp))
}

// compname maps a component index to the suffix used in quantity names.
var compname = map[int]string{0: "x", 1: "y", 2: "z"}
mumax3-3.10/engine/config.go000066400000000000000000000166161371432437400157320ustar00rootroot00000000000000package engine

// Utilities for setting magnetic configurations. 
import (
	"github.com/mumax/3/data"
	"math"
	"math/rand"
)

// init declares, through DeclFunc, the configuration constructors of this
// file together with a one-line description of each.
func init() {
	DeclFunc("Uniform", Uniform, "Uniform magnetization in given direction")
	DeclFunc("Vortex", Vortex, "Vortex magnetization with given circulation and core polarization")
	DeclFunc("Antivortex", AntiVortex, "Antivortex magnetization with given circulation and core polarization")
	DeclFunc("NeelSkyrmion", NeelSkyrmion, "Néél skyrmion magnetization with given charge and core polarization")
	DeclFunc("BlochSkyrmion", BlochSkyrmion, "Bloch skyrmion magnetization with given chirality and core polarization")
	DeclFunc("TwoDomain", TwoDomain, "Twodomain magnetization with with given magnetization in left domain, wall, and right domain")
	DeclFunc("VortexWall", VortexWall, "Vortex wall magnetization with given mx in left and right domain and core circulation and polarization")
	DeclFunc("RandomMag", RandomMag, "Random magnetization")
	DeclFunc("RandomMagSeed", RandomMagSeed, "Random magnetization with given seed")
	DeclFunc("Conical", Conical, "Conical state for given wave vector, cone direction, and cone angle")
	DeclFunc("Helical", Helical, "Helical state for given wave vector")
}

// Config is a magnetic configuration: a function that returns the
// magnetization vector m for position (x,y,z).
type Config func(x, y, z float64) data.Vector

// RandomMag returns a random initial magnetization.
// It is shorthand for RandomMagSeed(0), so it always uses seed 0.
func RandomMag() Config {
	return RandomMagSeed(0)
}

// RandomMagSeed returns a random initial magnetization
// generated from the given random seed.
// The returned Config ignores its position arguments: every call draws the
// next direction from a generator private to that Config, so the values
// depend on the order in which cells are evaluated.
func RandomMagSeed(seed int) Config {
	rng := rand.New(rand.NewSource(int64(seed)))
	return func(x, y, z float64) data.Vector {
		return randomDir(rng)
	}
}

// randomDir generates an isotropic random unit vector.
// The azimuth theta is drawn uniformly from [0, 2π) and the z component
// uniformly from [-1, 1); x and y then follow from the unit-length
// constraint, which makes the direction uniform over the sphere.
// Exactly two numbers are drawn from rng per call, theta first.
func randomDir(rng *rand.Rand) data.Vector {
	theta := 2 * rng.Float64() * math.Pi
	z := 2 * (rng.Float64() - 0.5)
	b := math.Sqrt(1 - z*z) // length of the in-plane projection
	x := b * math.Cos(theta)
	y := b * math.Sin(theta)
	return data.Vector{x, y, z}
}

// Returns a uniform magnetization state.
E.g.: // M = Uniform(1, 0, 0)) // saturated along X func Uniform(mx, my, mz float64) Config { return func(x, y, z float64) data.Vector { return data.Vector{mx, my, mz} } } // Make a vortex magnetization with given circulation and core polarization (+1 or -1). // The core is smoothed over a few exchange lengths and should easily relax to its ground state. func Vortex(circ, pol int) Config { diam2 := 2 * sqr64(Mesh().CellSize()[X]) return func(x, y, z float64) data.Vector { r2 := x*x + y*y r := math.Sqrt(r2) mx := -y * float64(circ) / r my := x * float64(circ) / r mz := 1.5 * float64(pol) * math.Exp(-r2/diam2) return noNaN(data.Vector{mx, my, mz}, pol) } } func NeelSkyrmion(charge, pol int) Config { w := 8 * Mesh().CellSize()[X] w2 := w * w return func(x, y, z float64) data.Vector { r2 := x*x + y*y r := math.Sqrt(r2) mz := 2 * float64(pol) * (math.Exp(-r2/w2) - 0.5) mx := (x * float64(charge) / r) * (1 - math.Abs(mz)) my := (y * float64(charge) / r) * (1 - math.Abs(mz)) return noNaN(data.Vector{mx, my, mz}, pol) } } func BlochSkyrmion(charge, pol int) Config { w := 8 * Mesh().CellSize()[X] w2 := w * w return func(x, y, z float64) data.Vector { r2 := x*x + y*y r := math.Sqrt(r2) mz := 2 * float64(pol) * (math.Exp(-r2/w2) - 0.5) mx := (-y * float64(charge) / r) * (1 - math.Abs(mz)) my := (x * float64(charge) / r) * (1 - math.Abs(mz)) return noNaN(data.Vector{mx, my, mz}, pol) } } func AntiVortex(circ, pol int) Config { diam2 := 2 * sqr64(Mesh().CellSize()[X]) return func(x, y, z float64) data.Vector { r2 := x*x + y*y r := math.Sqrt(r2) mx := -x * float64(circ) / r my := y * float64(circ) / r mz := 1.5 * float64(pol) * math.Exp(-r2/diam2) return noNaN(data.Vector{mx, my, mz}, pol) } } // Make a vortex wall configuration. 
func VortexWall(mleft, mright float64, circ, pol int) Config { h := Mesh().WorldSize()[Y] v := Vortex(circ, pol) return func(x, y, z float64) data.Vector { if x < -h/2 { return data.Vector{mleft, 0, 0} } if x > h/2 { return data.Vector{mright, 0, 0} } return v(x, y, z) } } func noNaN(v data.Vector, pol int) data.Vector { if math.IsNaN(v[X]) || math.IsNaN(v[Y]) || math.IsNaN(v[Z]) { return data.Vector{0, 0, float64(pol)} } else { return v } } // Make a 2-domain configuration with domain wall. // (mx1, my1, mz1) and (mx2, my2, mz2) are the magnetizations in the left and right domain, respectively. // (mxwall, mywall, mzwall) is the magnetization in the wall. The wall is smoothed over a few cells so it will // easily relax to its ground state. // E.g.: // TwoDomain(1,0,0, 0,1,0, -1,0,0) // head-to-head domains with transverse (Néel) wall // TwoDomain(1,0,0, 0,0,1, -1,0,0) // head-to-head domains with perpendicular (Bloch) wall // TwoDomain(0,0,1, 1,0,0, 0,0,-1)// up-down domains with Bloch wall func TwoDomain(mx1, my1, mz1, mxwall, mywall, mzwall, mx2, my2, mz2 float64) Config { ww := 2 * Mesh().CellSize()[X] // wall width in cells return func(x, y, z float64) data.Vector { var m data.Vector if x < 0 { m = data.Vector{mx1, my1, mz1} } else { m = data.Vector{mx2, my2, mz2} } gauss := math.Exp(-sqr64(x / ww)) m[X] = (1-gauss)*m[X] + gauss*mxwall m[Y] = (1-gauss)*m[Y] + gauss*mywall m[Z] = (1-gauss)*m[Z] + gauss*mzwall return m } } // Conical magnetization configuration. // The magnetization rotates on a cone defined by coneAngle and coneDirection. // q is the wave vector of the conical magnetization configuration. 
// The magnetization is // // m = u*cos(coneAngle) + sin(coneAngle)*( ua*cos(q*r) + ub*sin(q*r) ) // // with ua and ub unit vectors perpendicular to u (normalized coneDirection) func Conical(q, coneDirection data.Vector, coneAngle float64) Config { u := coneDirection.Div(coneDirection.Len()) // two unit vectors perpendicular to each other and to the cone direction u p := math.Sqrt(1 - u[Z]*u[Z]) ua := data.Vector{u[X] * u[Z], u[Y] * u[Z], u[Z]*u[Z] - 1}.Div(p) ub := data.Vector{-u[Y], u[X], 0}.Div(p) // cone direction along z direction? -> oops devided by zero, let's fix this if u[Z]*u[Z] == 1 { ua = data.Vector{1, 0, 0} ub = data.Vector{0, 1, 0} } sina, cosa := math.Sincos(coneAngle) return func(x, y, z float64) data.Vector { sinqr, cosqr := math.Sincos(q[X]*x + q[Y]*y + q[Z]*z) return u.Mul(cosa).MAdd(sina*cosqr, ua).MAdd(sina*sinqr, ub) } } func Helical(q data.Vector) Config { return Conical(q, q, math.Pi/2) } // Transl returns a translated copy of configuration c. E.g.: // M = Vortex(1, 1).Transl(100e-9, 0, 0) // vortex with center at x=100nm func (c Config) Transl(dx, dy, dz float64) Config { return func(x, y, z float64) data.Vector { return c(x-dx, y-dy, z-dz) } } // Scale returns a scaled copy of configuration c. func (c Config) Scale(sx, sy, sz float64) Config { return func(x, y, z float64) data.Vector { return c(x/sx, y/sy, z/sz) } } // Rotates the configuration around the Z-axis, over θ radians. func (c Config) RotZ(θ float64) Config { cos := math.Cos(θ) sin := math.Sin(θ) return func(x, y, z float64) data.Vector { x_ := x*cos + y*sin y_ := -x*sin + y*cos m := c(x_, y_, z) mx_ := m[X]*cos - m[Y]*sin my_ := m[X]*sin + m[Y]*cos return data.Vector{mx_, my_, m[Z]} } } // Returns a new magnetization equal to c + weight * other. // E.g.: // Uniform(1, 0, 0).Add(0.2, RandomMag()) // for a uniform state with 20% random distortion. 
func (c Config) Add(weight float64, other Config) Config { return func(x, y, z float64) data.Vector { return c(x, y, z).MAdd(weight, other(x, y, z)) } } mumax3-3.10/engine/crop.go000066400000000000000000000071631371432437400154250ustar00rootroot00000000000000package engine // Cropped quantity refers to a cut-out piece of a large quantity import ( "fmt" "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/util" ) func init() { DeclFunc("Crop", Crop, "Crops a quantity to cell ranges [x1,x2[, [y1,y2[, [z1,z2[") DeclFunc("CropX", CropX, "Crops a quantity to cell ranges [x1,x2[") DeclFunc("CropY", CropY, "Crops a quantity to cell ranges [y1,y2[") DeclFunc("CropZ", CropZ, "Crops a quantity to cell ranges [z1,z2[") DeclFunc("CropLayer", CropLayer, "Crops a quantity to a single layer") DeclFunc("CropRegion", CropRegion, "Crops a quantity to a region") } type cropped struct { parent Quantity name string x1, x2, y1, y2, z1, z2 int } // Crop quantity to a box enclosing the given region. // Used to output a region of interest, even if the region is non-rectangular. 
func CropRegion(parent Quantity, region int) *cropped { n := MeshOf(parent).Size() // use -1 for unset values x1, y1, z1 := -1, -1, -1 x2, y2, z2 := -1, -1, -1 r := regions.HostArray() for iz := 0; iz < n[Z]; iz++ { for iy := 0; iy < n[Y]; iy++ { for ix := 0; ix < n[X]; ix++ { if r[iz][iy][ix] == byte(region) { // initialize all indices if unset if x1 == -1 { x1, y1, z1 = ix, iy, iz x2, y2, z2 = ix, iy, iz } if ix < x1 { x1 = ix } if iy < y1 { y1 = iy } if iz < z1 { z1 = iz } if ix > x2 { x2 = ix } if iy > y2 { y2 = iy } if iz > z2 { z2 = iz } } } } } return Crop(parent, x1, x2+1, y1, y2+1, z1, z2+1) } func CropLayer(parent Quantity, layer int) *cropped { n := MeshOf(parent).Size() return Crop(parent, 0, n[X], 0, n[Y], layer, layer+1) } func CropX(parent Quantity, x1, x2 int) *cropped { n := MeshOf(parent).Size() return Crop(parent, x1, x2, 0, n[Y], 0, n[Z]) } func CropY(parent Quantity, y1, y2 int) *cropped { n := MeshOf(parent).Size() return Crop(parent, 0, n[X], y1, y2, 0, n[Z]) } func CropZ(parent Quantity, z1, z2 int) *cropped { n := MeshOf(parent).Size() return Crop(parent, 0, n[X], 0, n[Y], z1, z2) } func Crop(parent Quantity, x1, x2, y1, y2, z1, z2 int) *cropped { n := MeshOf(parent).Size() util.Argument(x1 < x2 && y1 < y2 && z1 < z2) util.Argument(x1 >= 0 && y1 >= 0 && z1 >= 0) util.Argument(x2 <= n[X] && y2 <= n[Y] && z2 <= n[Z]) name := NameOf(parent) + "_" if x1 != 0 || x2 != n[X] { name += "xrange" + rangeStr(x1, x2) } if y1 != 0 || y2 != n[Y] { name += "yrange" + rangeStr(y1, y2) } if z1 != 0 || z2 != n[Z] { name += "zrange" + rangeStr(z1, z2) } return &cropped{parent, name, x1, x2, y1, y2, z1, z2} } func rangeStr(a, b int) string { if a+1 == b { return fmt.Sprint(a, "_") } else { return fmt.Sprint(a, "-", b, "_") } // (trailing underscore to separate from subsequent autosave number) } func (q *cropped) NComp() int { return q.parent.NComp() } func (q *cropped) Name() string { return q.name } func (q *cropped) Unit() string { return UnitOf(q.parent) } 
func (q *cropped) EvalTo(dst *data.Slice) { EvalTo(q, dst) } func (q *cropped) Mesh() *data.Mesh { c := MeshOf(q.parent).CellSize() return data.NewMesh(q.x2-q.x1, q.y2-q.y1, q.z2-q.z1, c[X], c[Y], c[Z]) } func (q *cropped) average() []float64 { return qAverageUniverse(q) } // needed for table func (q *cropped) Average() []float64 { return q.average() } // handy for script func (q *cropped) Slice() (*data.Slice, bool) { src := ValueOf(q.parent) defer cuda.Recycle(src) dst := cuda.Buffer(q.NComp(), q.Mesh().Size()) cuda.Crop(dst, src, q.x1, q.y1, q.z1) return dst, true } mumax3-3.10/engine/customfield.go000066400000000000000000000256101371432437400167750ustar00rootroot00000000000000package engine // Add arbitrary terms to B_eff, Edens_total. import ( "fmt" "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/util" ) var ( B_custom = NewVectorField("B_custom", "T", "User-defined field", AddCustomField) Edens_custom = NewScalarField("Edens_custom", "J/m3", "Energy density of user-defined field.", AddCustomEnergyDensity) E_custom = NewScalarValue("E_custom", "J", "total energy of user-defined field", GetCustomEnergy) customTerms []Quantity // vector customEnergies []Quantity // scalar ) func init() { registerEnergy(GetCustomEnergy, AddCustomEnergyDensity) DeclFunc("AddFieldTerm", AddFieldTerm, "Add an expression to B_eff.") DeclFunc("AddEdensTerm", AddEdensTerm, "Add an expression to Edens.") DeclFunc("Add", Add, "Add two quantities") DeclFunc("Madd", Madd, "Weighted addition: Madd(Q1,Q2,c1,c2) = c1*Q1 + c2*Q2") DeclFunc("Dot", Dot, "Dot product of two vector quantities") DeclFunc("Cross", Cross, "Cross product of two vector quantities") DeclFunc("Mul", Mul, "Point-wise product of two quantities") DeclFunc("MulMV", MulMV, "Matrix-Vector product: MulMV(AX, AY, AZ, m) = (AX·m, AY·m, AZ·m). 
"+ "The arguments Ax, Ay, Az and m are quantities with 3 componets.") DeclFunc("Div", Div, "Point-wise division of two quantities") DeclFunc("Const", Const, "Constant, uniform number") DeclFunc("ConstVector", ConstVector, "Constant, uniform vector") DeclFunc("Shifted", Shifted, "Shifted quantity") DeclFunc("Masked", Masked, "Mask quantity with shape") DeclFunc("Normalized", Normalized, "Normalize quantity") DeclFunc("RemoveCustomFields", RemoveCustomFields, "Removes all custom fields again") } //Removes all customfields func RemoveCustomFields() { customTerms = nil } // AddFieldTerm adds an effective field function (returning Teslas) to B_eff. // Be sure to also add the corresponding energy term using AddEnergyTerm. func AddFieldTerm(b Quantity) { customTerms = append(customTerms, b) } // AddEnergyTerm adds an energy density function (returning Joules/m³) to Edens_total. // Needed when AddFieldTerm was used and a correct energy is needed // (e.g. for Relax, Minimize, ...). func AddEdensTerm(e Quantity) { customEnergies = append(customEnergies, e) } // AddCustomField evaluates the user-defined custom field terms // and adds the result to dst. 
func AddCustomField(dst *data.Slice) { for _, term := range customTerms { buf := ValueOf(term) cuda.Add(dst, dst, buf) cuda.Recycle(buf) } } // Adds the custom energy densities (defined with AddCustomE func AddCustomEnergyDensity(dst *data.Slice) { for _, term := range customEnergies { buf := ValueOf(term) cuda.Add(dst, dst, buf) cuda.Recycle(buf) } } func GetCustomEnergy() float64 { buf := cuda.Buffer(1, Mesh().Size()) defer cuda.Recycle(buf) cuda.Zero(buf) AddCustomEnergyDensity(buf) return cellVolume() * float64(cuda.Sum(buf)) } type constValue struct { value []float64 } func (c *constValue) NComp() int { return len(c.value) } func (d *constValue) EvalTo(dst *data.Slice) { for c, v := range d.value { cuda.Memset(dst.Comp(c), float32(v)) } } // Const returns a constant (uniform) scalar quantity, // that can be used to construct custom field terms. func Const(v float64) Quantity { return &constValue{[]float64{v}} } // ConstVector returns a constant (uniform) vector quantity, // that can be used to construct custom field terms. func ConstVector(x, y, z float64) Quantity { return &constValue{[]float64{x, y, z}} } // fieldOp holds the abstract functionality for operations // (like add, multiply, ...) on space-dependend quantites // (like M, B_sat, ...) type fieldOp struct { a, b Quantity nComp int } func (o fieldOp) NComp() int { return o.nComp } type dotProduct struct { fieldOp } type crossProduct struct { fieldOp } type addition struct { fieldOp } type mAddition struct { fieldOp fac1, fac2 float64 } type mulmv struct { ax, ay, az, b Quantity } // MulMV returns a new Quantity that evaluates to the // matrix-vector product (Ax·b, Ay·b, Az·b). 
func MulMV(Ax, Ay, Az, b Quantity) Quantity { util.Argument(Ax.NComp() == 3 && Ay.NComp() == 3 && Az.NComp() == 3 && b.NComp() == 3) return &mulmv{Ax, Ay, Az, b} } func (q *mulmv) EvalTo(dst *data.Slice) { util.Argument(dst.NComp() == 3) cuda.Zero(dst) b := ValueOf(q.b) defer cuda.Recycle(b) { Ax := ValueOf(q.ax) cuda.AddDotProduct(dst.Comp(X), 1, Ax, b) cuda.Recycle(Ax) } { Ay := ValueOf(q.ay) cuda.AddDotProduct(dst.Comp(Y), 1, Ay, b) cuda.Recycle(Ay) } { Az := ValueOf(q.az) cuda.AddDotProduct(dst.Comp(Z), 1, Az, b) cuda.Recycle(Az) } } func (q *mulmv) NComp() int { return 3 } // DotProduct creates a new quantity that is the dot product of // quantities a and b. E.g.: // DotProct(&M, &B_ext) func Dot(a, b Quantity) Quantity { return &dotProduct{fieldOp{a, b, 1}} } func (d *dotProduct) EvalTo(dst *data.Slice) { A := ValueOf(d.a) defer cuda.Recycle(A) B := ValueOf(d.b) defer cuda.Recycle(B) cuda.Zero(dst) cuda.AddDotProduct(dst, 1, A, B) } // CrossProduct creates a new quantity that is the cross product of // quantities a and b. 
E.g.: // CrossProct(&M, &B_ext) func Cross(a, b Quantity) Quantity { return &crossProduct{fieldOp{a, b, 3}} } func (d *crossProduct) EvalTo(dst *data.Slice) { A := ValueOf(d.a) defer cuda.Recycle(A) B := ValueOf(d.b) defer cuda.Recycle(B) cuda.Zero(dst) cuda.CrossProduct(dst, A, B) } func Add(a, b Quantity) Quantity { if a.NComp() != b.NComp() { panic(fmt.Sprintf("Cannot point-wise Add %v components by %v components", a.NComp(), b.NComp())) } return &addition{fieldOp{a, b, a.NComp()}} } func (d *addition) EvalTo(dst *data.Slice) { A := ValueOf(d.a) defer cuda.Recycle(A) B := ValueOf(d.b) defer cuda.Recycle(B) cuda.Zero(dst) cuda.Add(dst, A, B) } type pointwiseMul struct { fieldOp } func Madd(a, b Quantity, fac1, fac2 float64) *mAddition { if a.NComp() != b.NComp() { panic(fmt.Sprintf("Cannot point-wise add %v components by %v components", a.NComp(), b.NComp())) } return &mAddition{fieldOp{a, b, a.NComp()}, fac1, fac2} } func (o *mAddition) EvalTo(dst *data.Slice) { A := ValueOf(o.a) defer cuda.Recycle(A) B := ValueOf(o.b) defer cuda.Recycle(B) cuda.Zero(dst) cuda.Madd2(dst, A, B, float32(o.fac1), float32(o.fac2)) } // Mul returns a new quantity that evaluates to the pointwise product a and b. 
func Mul(a, b Quantity) Quantity { nComp := -1 switch { case a.NComp() == b.NComp(): nComp = a.NComp() // vector*vector, scalar*scalar case a.NComp() == 1: nComp = b.NComp() // scalar*something case b.NComp() == 1: nComp = a.NComp() // something*scalar default: panic(fmt.Sprintf("Cannot point-wise multiply %v components by %v components", a.NComp(), b.NComp())) } return &pointwiseMul{fieldOp{a, b, nComp}} } func (d *pointwiseMul) EvalTo(dst *data.Slice) { cuda.Zero(dst) a := ValueOf(d.a) defer cuda.Recycle(a) b := ValueOf(d.b) defer cuda.Recycle(b) switch { case a.NComp() == b.NComp(): mulNN(dst, a, b) // vector*vector, scalar*scalar case a.NComp() == 1: mul1N(dst, a, b) case b.NComp() == 1: mul1N(dst, b, a) default: panic(fmt.Sprintf("Cannot point-wise multiply %v components by %v components", a.NComp(), b.NComp())) } } // mulNN pointwise multiplies two N-component vectors, // yielding an N-component vector stored in dst. func mulNN(dst, a, b *data.Slice) { cuda.Mul(dst, a, b) } // mul1N pointwise multiplies a scalar (1-component) with an N-component vector, // yielding an N-component vector stored in dst. func mul1N(dst, a, b *data.Slice) { util.Assert(a.NComp() == 1) util.Assert(dst.NComp() == b.NComp()) for c := 0; c < dst.NComp(); c++ { cuda.Mul(dst.Comp(c), a, b.Comp(c)) } } type pointwiseDiv struct { fieldOp } // Div returns a new quantity that evaluates to the pointwise product a and b. 
func Div(a, b Quantity) Quantity { nComp := -1 switch { case a.NComp() == b.NComp(): nComp = a.NComp() // vector/vector, scalar/scalar case b.NComp() == 1: nComp = a.NComp() // something/scalar default: panic(fmt.Sprintf("Cannot point-wise divide %v components by %v components", a.NComp(), b.NComp())) } return &pointwiseDiv{fieldOp{a, b, nComp}} } func (d *pointwiseDiv) EvalTo(dst *data.Slice) { a := ValueOf(d.a) defer cuda.Recycle(a) b := ValueOf(d.b) defer cuda.Recycle(b) switch { case a.NComp() == b.NComp(): divNN(dst, a, b) // vector*vector, scalar*scalar case b.NComp() == 1: divN1(dst, a, b) default: panic(fmt.Sprintf("Cannot point-wise divide %v components by %v components", a.NComp(), b.NComp())) } } func divNN(dst, a, b *data.Slice) { cuda.Div(dst, a, b) } func divN1(dst, a, b *data.Slice) { util.Assert(dst.NComp() == a.NComp()) util.Assert(b.NComp() == 1) for c := 0; c < dst.NComp(); c++ { cuda.Div(dst.Comp(c), a.Comp(c), b) } } type shifted struct { orig Quantity dx, dy, dz int } // Shifted returns a new Quantity that evaluates to // the original, shifted over dx, dy, dz cells. 
func Shifted(q Quantity, dx, dy, dz int) Quantity { util.Assert(dx != 0 || dy != 0 || dz != 0) return &shifted{q, dx, dy, dz} } func (q *shifted) EvalTo(dst *data.Slice) { orig := ValueOf(q.orig) defer cuda.Recycle(orig) for i := 0; i < q.NComp(); i++ { dsti := dst.Comp(i) origi := orig.Comp(i) if q.dx != 0 { cuda.ShiftX(dsti, origi, q.dx, 0, 0) } if q.dy != 0 { cuda.ShiftY(dsti, origi, q.dy, 0, 0) } if q.dz != 0 { cuda.ShiftZ(dsti, origi, q.dz, 0, 0) } } } func (q *shifted) NComp() int { return q.orig.NComp() } // Masks a quantity with a shape // The shape will be only evaluated once on the mesh, // and will be re-evaluated after mesh change, // because otherwise too slow func Masked(q Quantity, shape Shape) Quantity { return &masked{q, shape, nil, data.Mesh{}} } type masked struct { orig Quantity shape Shape mask *data.Slice mesh data.Mesh } func (q *masked) EvalTo(dst *data.Slice) { if q.mesh != *Mesh() { // When mesh is changed, mask needs an update q.createMask() } orig := ValueOf(q.orig) defer cuda.Recycle(orig) mul1N(dst, q.mask, orig) } func (q *masked) NComp() int { return q.orig.NComp() } func (q *masked) createMask() { size := Mesh().Size() // Prepare mask on host maskhost := data.NewSlice(SCALAR, size) defer maskhost.Free() maskScalars := maskhost.Scalars() for iz := 0; iz < size[Z]; iz++ { for iy := 0; iy < size[Y]; iy++ { for ix := 0; ix < size[X]; ix++ { r := Index2Coord(ix, iy, iz) if q.shape(r[X], r[Y], r[Z]) { maskScalars[iz][iy][ix] = 1 } } } } // Update mask q.mask.Free() q.mask = cuda.NewSlice(SCALAR, size) data.Copy(q.mask, maskhost) q.mesh = *Mesh() // Remove mask from host } // Normalized returns a quantity that evaluates to the unit vector of q func Normalized(q Quantity) Quantity { return &normalized{q} } type normalized struct { orig Quantity } func (q *normalized) NComp() int { return 3 } func (q *normalized) EvalTo(dst *data.Slice) { util.Assert(dst.NComp() == q.NComp()) q.orig.EvalTo(dst) cuda.Normalize(dst, nil) } 
mumax3-3.10/engine/demag.go000066400000000000000000000067251371432437400155420ustar00rootroot00000000000000package engine // Calculation of magnetostatic field import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/mag" ) // Demag variables var ( Msat = NewScalarParam("Msat", "A/m", "Saturation magnetization") M_full = NewVectorField("m_full", "A/m", "Unnormalized magnetization", SetMFull) B_demag = NewVectorField("B_demag", "T", "Magnetostatic field", SetDemagField) Edens_demag = NewScalarField("Edens_demag", "J/m3", "Magnetostatic energy density", AddEdens_demag) E_demag = NewScalarValue("E_demag", "J", "Magnetostatic energy", GetDemagEnergy) EnableDemag = true // enable/disable global demag field NoDemagSpins = NewScalarParam("NoDemagSpins", "", "Disable magnetostatic interaction per region (default=0, set to 1 to disable). "+ "E.g.: NoDemagSpins.SetRegion(5, 1) disables the magnetostatic interaction in region 5.") conv_ *cuda.DemagConvolution // does the heavy lifting DemagAccuracy = 6.0 // Demag accuracy (divide cubes in at most N^3 points) ) var AddEdens_demag = makeEdensAdder(&B_demag, -0.5) func init() { DeclVar("EnableDemag", &EnableDemag, "Enables/disables demag (default=true)") DeclVar("DemagAccuracy", &DemagAccuracy, "Controls accuracy of demag kernel") registerEnergy(GetDemagEnergy, AddEdens_demag) } // Sets dst to the current demag field func SetDemagField(dst *data.Slice) { if EnableDemag { msat := Msat.MSlice() defer msat.Recycle() if NoDemagSpins.isZero() { // Normal demag, everywhere demagConv().Exec(dst, M.Buffer(), geometry.Gpu(), msat) } else { setMaskedDemagField(dst, msat) } } else { cuda.Zero(dst) // will ADD other terms to it } } // Sets dst to the demag field, but cells where NoDemagSpins != 0 do not generate nor recieve field. 
func setMaskedDemagField(dst *data.Slice, msat cuda.MSlice) { // No-demag spins: mask-out geometry with zeros where NoDemagSpins is set, // so these spins do not generate a field buf := cuda.Buffer(SCALAR, geometry.Gpu().Size()) // masked-out geometry defer cuda.Recycle(buf) // obtain a copy of the geometry mask, which we can overwrite geom, r := geometry.Slice() if r { defer cuda.Recycle(geom) } data.Copy(buf, geom) // mask-out cuda.ZeroMask(buf, NoDemagSpins.gpuLUT1(), regions.Gpu()) // convolution with masked-out cells. demagConv().Exec(dst, M.Buffer(), buf, msat) // After convolution, mask-out the field in the NoDemagSpins cells // so they don't feel the field generated by others. cuda.ZeroMask(dst, NoDemagSpins.gpuLUT1(), regions.Gpu()) } // Sets dst to the full (unnormalized) magnetization in A/m func SetMFull(dst *data.Slice) { // scale m by Msat... msat, rM := Msat.Slice() if rM { defer cuda.Recycle(msat) } for c := 0; c < 3; c++ { cuda.Mul(dst.Comp(c), M.Buffer().Comp(c), msat) } // ...and by cell volume if applicable vol, rV := geometry.Slice() if rV { defer cuda.Recycle(vol) } if !vol.IsNil() { for c := 0; c < 3; c++ { cuda.Mul(dst.Comp(c), dst.Comp(c), vol) } } } // returns demag convolution, making sure it's initialized func demagConv() *cuda.DemagConvolution { if conv_ == nil { SetBusy(true) defer SetBusy(false) kernel := mag.DemagKernel(Mesh().Size(), Mesh().PBC(), Mesh().CellSize(), DemagAccuracy, *Flag_cachedir) conv_ = cuda.NewDemag(Mesh().Size(), Mesh().PBC(), kernel, *Flag_selftest) } return conv_ } // Returns the current demag energy in Joules. func GetDemagEnergy() float64 { return -0.5 * cellVolume() * dot(&M_full, &B_demag) } mumax3-3.10/engine/effectivefield.go000066400000000000000000000010511371432437400174140ustar00rootroot00000000000000package engine // Effective field import "github.com/mumax/3/data" var B_eff = NewVectorField("B_eff", "T", "Effective field", SetEffectiveField) // Sets dst to the current effective field, in Tesla. 
// This is the sum of all effective field terms, // like demag, exchange, ... func SetEffectiveField(dst *data.Slice) { SetDemagField(dst) // set to B_demag... AddExchangeField(dst) // ...then add other terms AddAnisotropyField(dst) AddMagnetoelasticField(dst) B_ext.AddTo(dst) if !relaxing { B_therm.AddTo(dst) } AddCustomField(dst) } mumax3-3.10/engine/energy.go000066400000000000000000000032131371432437400157430ustar00rootroot00000000000000package engine // Total energy calculation import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" ) // TODO: Integrate(Edens) // TODO: consistent naming SetEdensTotal, ... var ( energyTerms []func() float64 // all contributions to total energy edensTerms []func(dst *data.Slice) // all contributions to total energy density (add to dst) Edens_total = NewScalarField("Edens_total", "J/m3", "Total energy density", SetTotalEdens) E_total = NewScalarValue("E_total", "J", "total energy", GetTotalEnergy) ) // add energy term to global energy func registerEnergy(term func() float64, dens func(*data.Slice)) { energyTerms = append(energyTerms, term) edensTerms = append(edensTerms, dens) } // Returns the total energy in J. func GetTotalEnergy() float64 { E := 0. 
for _, f := range energyTerms { E += f() } checkNaN1(E) return E } // Set dst to total energy density in J/m3 func SetTotalEdens(dst *data.Slice) { cuda.Zero(dst) for _, addTerm := range edensTerms { addTerm(dst) } } // volume of one cell in m3 func cellVolume() float64 { c := Mesh().CellSize() return c[0] * c[1] * c[2] } // returns a function that adds to dst the energy density: // prefactor * dot (M_full, field) func makeEdensAdder(field Quantity, prefactor float64) func(*data.Slice) { return func(dst *data.Slice) { B := ValueOf(field) defer cuda.Recycle(B) m := ValueOf(M_full) defer cuda.Recycle(m) factor := float32(prefactor) cuda.AddDotProduct(dst, factor, B, m) } } // vector dot product func dot(a, b Quantity) float64 { A := ValueOf(a) defer cuda.Recycle(A) B := ValueOf(b) defer cuda.Recycle(B) return float64(cuda.Dot(A, B)) } mumax3-3.10/engine/engine.go000066400000000000000000000023221371432437400157170ustar00rootroot00000000000000/* engine does the simulation bookkeeping, I/O and GUI. space-dependence: value: space-independent param: region-dependent parameter (always input) field: fully space-dependent field TODO: godoc everything */ package engine import ( "fmt" "os" "runtime" "sync" "time" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/timer" ) const VERSION = "mumax 3.10" var UNAME = fmt.Sprintf("%s [%s_%s %s(%s) CUDA-%d.%d]", VERSION, runtime.GOOS, runtime.GOARCH, runtime.Version(), runtime.Compiler, cu.CUDA_VERSION/1000, (cu.CUDA_VERSION%1000)/10) var StartTime = time.Now() var ( busyLock sync.Mutex busy bool // are we so busy we can't respond from run loop? (e.g. calc kernel) ) // We set SetBusy(true) when the simulation is too busy too accept GUI input on Inject channel. // E.g. during kernel init. func SetBusy(b bool) { busyLock.Lock() defer busyLock.Unlock() busy = b } func GetBusy() bool { busyLock.Lock() defer busyLock.Unlock() return busy } // Cleanly exits the simulation, assuring all output is flushed. 
func Close() { drainOutput() LogUsedRefs() Table.flush() if logfile != nil { logfile.Close() } if bibfile != nil { bibfile.Close() } if *Flag_sync { timer.Print(os.Stdout) } } mumax3-3.10/engine/euler.go000066400000000000000000000014611371432437400155710ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/util" ) type Euler struct{} // Euler method, can be used as solver.Step. func (_ *Euler) Step() { y := M.Buffer() dy0 := cuda.Buffer(VECTOR, y.Size()) defer cuda.Recycle(dy0) torqueFn(dy0) setMaxTorque(dy0) // Adaptive time stepping: treat MaxErr as the maximum magnetization delta // (proportional to the error, but an overestimation for sure) var dt float32 if FixDt != 0 { Dt_si = FixDt dt = float32(Dt_si * GammaLL) } else { dt = float32(MaxErr / LastTorque) Dt_si = float64(dt) / GammaLL } util.AssertMsg(dt > 0, "Euler solver requires fixed time step > 0") setLastErr(float64(dt) * LastTorque) cuda.Madd2(y, y, dy0, 1, dt) // y = y + dt * dy M.normalize() Time += Dt_si NSteps++ } func (_ *Euler) Free() {} mumax3-3.10/engine/exchange.go000066400000000000000000000157361371432437400162510ustar00rootroot00000000000000package engine // Exchange interaction (Heisenberg + Dzyaloshinskii-Moriya) // See also cuda/exchange.cu and cuda/dmi.cu import ( "math" "unsafe" "github.com/mumax/3/cuda" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/data" "github.com/mumax/3/util" ) var ( Aex = NewScalarParam("Aex", "J/m", "Exchange stiffness", &lex2) Dind = NewScalarParam("Dind", "J/m2", "Interfacial Dzyaloshinskii-Moriya strength", &din2) Dbulk = NewScalarParam("Dbulk", "J/m2", "Bulk Dzyaloshinskii-Moriya strength", &dbulk2) lex2 exchParam // inter-cell Aex din2 exchParam // inter-cell Dind dbulk2 exchParam // inter-cell Dbulk B_exch = NewVectorField("B_exch", "T", "Exchange field", AddExchangeField) E_exch = NewScalarValue("E_exch", "J", "Total exchange energy (including the DMI energy)", GetExchangeEnergy) Edens_exch = 
NewScalarField("Edens_exch", "J/m3", "Total exchange energy density (including the DMI energy density)", AddExchangeEnergyDensity)

	// Average exchange coupling with neighbors. Useful to debug inter-region exchange
	ExchCoupling = NewScalarField("ExchCoupling", "arb.", "Average exchange coupling with neighbors", exchangeDecode)
	DindCoupling = NewScalarField("DindCoupling", "arb.", "Average DMI coupling with neighbors", dindDecode)

	// OpenBC selects open boundary conditions; it is passed to the DMI kernels only.
	OpenBC = false
)

// AddExchangeEnergyDensity adds the exchange energy density to its argument.
// It is built by makeEdensAdder from B_exch with prefactor -0.5.
var AddExchangeEnergyDensity = makeEdensAdder(&B_exch, -0.5) // TODO: normal func

// init registers the exchange energy terms, declares the script functions
// and variable of this file, and ties each inter-region look-up table to
// its parameter.
func init() {
	registerEnergy(GetExchangeEnergy, AddExchangeEnergyDensity)
	DeclFunc("ext_ScaleExchange", ScaleInterExchange, "Re-scales exchange coupling between two regions.")
	DeclFunc("ext_InterExchange", InterExchange, "Sets exchange coupling between two regions.")
	DeclFunc("ext_ScaleDind", ScaleInterDind, "Re-scales Dind coupling between two regions.")
	DeclFunc("ext_InterDind", InterDind, "Sets Dind coupling between two regions.")
	DeclVar("OpenBC", &OpenBC, "Use open boundary conditions (default=false)")
	lex2.init(Aex)
	din2.init(Dind)
	dbulk2.init(Dbulk)
}

// Adds the current exchange field to dst
// Which kernel is used depends on the DMI parameters: plain exchange when
// both Dind and Dbulk are zero, exchange plus interfacial DMI when only
// Dind is set, exchange plus bulk DMI when only Dbulk is set.
// Setting both at the same time is a fatal error.
func AddExchangeField(dst *data.Slice) {
	inter := !Dind.isZero()
	bulk := !Dbulk.isZero()
	ms := Msat.MSlice()
	defer ms.Recycle()
	switch {
	case !inter && !bulk:
		cuda.AddExchange(dst, M.Buffer(), lex2.Gpu(), ms, regions.Gpu(), M.Mesh())
	case inter && !bulk:
		Refer("mulkers2017")
		cuda.AddDMI(dst, M.Buffer(), lex2.Gpu(), din2.Gpu(), ms, regions.Gpu(), M.Mesh(), OpenBC) // dmi+exchange
	case bulk && !inter:
		cuda.AddDMIBulk(dst, M.Buffer(), lex2.Gpu(), dbulk2.Gpu(), ms, regions.Gpu(), M.Mesh(), OpenBC) // dmi+exchange
		// TODO: add ScaleInterDbulk and InterDbulk
	case inter && bulk:
		util.Fatal("Cannot have interfacial-induced DMI and bulk DMI at the same time")
	}
}

// Set dst to the average exchange coupling per cell (average of lex2 with all neighbors).
func exchangeDecode(dst *data.Slice) {
	lut := lex2.Gpu()
	cuda.ExchangeDecode(dst, lut, regions.Gpu(), M.Mesh())
}

// dindDecode writes to dst the per-cell average of the inter-cell Dind
// look-up table (din2) over all neighbors.
func dindDecode(dst *data.Slice) {
	lut := din2.Gpu()
	cuda.ExchangeDecode(dst, lut, regions.Gpu(), M.Mesh())
}

// GetExchangeEnergy returns the current exchange energy in Joules, computed
// as -1/2 * cell volume * dot(M_full, B_exch).
func GetExchangeEnergy() float64 {
	vol := cellVolume()
	overlap := dot(&M_full, &B_exch)
	return -0.5 * vol * overlap
}

// ScaleInterExchange rescales the Heisenberg exchange coupling between
// region1 and region2.
// scale = 1 corresponds to the harmonic mean of Aex over the two regions.
func ScaleInterExchange(region1, region2 int, scale float64) {
	lex2.setScale(region1, region2, scale)
}

// InterExchange sets the exchange coupling between region1 and region2 to
// an explicit value.
func InterExchange(region1, region2 int, value float64) {
	lex2.setInter(region1, region2, value)
}

// ScaleInterDind rescales the interfacial DMI coupling between region1 and
// region2.
func ScaleInterDind(region1, region2 int, scale float64) {
	din2.setScale(region1, region2, scale)
}

// Sets the DMI interaction between region 1 and 2.
func InterDind(region1, region2 int, value float64) { din2.setInter(region1, region2, value) } // stores interregion exchange stiffness and DMI // the interregion exchange/DMI by default is the harmonic mean (scale=1, inter=0) type exchParam struct { parent *RegionwiseScalar lut [NREGION * (NREGION + 1) / 2]float32 // harmonic mean of regions (i,j) scale [NREGION * (NREGION + 1) / 2]float32 // extra scale factor for lut[SymmIdx(i, j)] inter [NREGION * (NREGION + 1) / 2]float32 // extra term for lut[SymmIdx(i, j)] gpu cuda.SymmLUT // gpu copy of lut, lazily transferred when needed gpu_ok, cpu_ok bool // gpu cache up-to date with lut source } // to be called after Aex or scaling changed func (p *exchParam) invalidate() { p.cpu_ok = false p.gpu_ok = false } func (p *exchParam) init(parent *RegionwiseScalar) { for i := range p.scale { p.scale[i] = 1 // default scaling p.inter[i] = 0 // default additional interexchange term } p.parent = parent } // Get a GPU mirror of the look-up table. // Copies to GPU first only if needed. func (p *exchParam) Gpu() cuda.SymmLUT { p.update() if !p.gpu_ok { p.upload() } return p.gpu } // sets the interregion exchange/DMI using a specified value (scale = 0) func (p *exchParam) setInter(region1, region2 int, value float64) { p.scale[symmidx(region1, region2)] = float32(0.) p.inter[symmidx(region1, region2)] = float32(value) p.invalidate() } // sets the interregion exchange/DMI by rescaling the harmonic mean (inter = 0) func (p *exchParam) setScale(region1, region2 int, scale float64) { p.scale[symmidx(region1, region2)] = float32(scale) p.inter[symmidx(region1, region2)] = float32(0.) 
p.invalidate() } func (p *exchParam) update() { if !p.cpu_ok { ex := p.parent.cpuLUT() for i := 0; i < NREGION; i++ { exi := ex[0][i] for j := i; j < NREGION; j++ { exj := ex[0][j] I := symmidx(i, j) p.lut[I] = p.scale[I]*exchAverage(exi, exj) + p.inter[I] } } p.gpu_ok = false p.cpu_ok = true } } func (p *exchParam) upload() { // alloc if needed if p.gpu == nil { p.gpu = cuda.SymmLUT(cuda.MemAlloc(int64(len(p.lut)) * cu.SIZEOF_FLOAT32)) } lut := p.lut // Copy, to work around Go 1.6 cgo pointer limitations. cuda.MemCpyHtoD(unsafe.Pointer(p.gpu), unsafe.Pointer(&lut[0]), cu.SIZEOF_FLOAT32*int64(len(p.lut))) p.gpu_ok = true } // Index in symmetric matrix where only one half is stored. // (!) Code duplicated in exchange.cu func symmidx(i, j int) int { if j <= i { return i*(i+1)/2 + j } else { return j*(j+1)/2 + i } } // Returns the intermediate value of two exchange/dmi strengths. // If both arguments have the same sign, the average mean is returned. If the arguments differ in sign // (which is possible in the case of DMI), the geometric mean of the geometric and arithmetic mean is // used. This average is continuous everywhere, monotonic increasing, and bounded by the argument values. func exchAverage(exi, exj float32) float32 { if exi*exj >= 0.0 { return 2 / (1/exi + 1/exj) } else { exi_, exj_ := float64(exi), float64(exj) sign := math.Copysign(1, exi_+exj_) magn := math.Sqrt(math.Sqrt(-exi_*exj_) * math.Abs(exi_+exj_) / 2) return float32(sign * magn) } } mumax3-3.10/engine/excitation.go000066400000000000000000000102451371432437400166240ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/script" "github.com/mumax/3/util" "math" "reflect" ) // An excitation, typically field or current, // can be defined region-wise plus extra mask*multiplier terms. 
type Excitation struct { name string perRegion RegionwiseVector // Region-based excitation extraTerms []mulmask // add extra mask*multiplier terms } // space-dependent mask plus time dependent multiplier type mulmask struct { mul func() float64 mask *data.Slice } func NewExcitation(name, unit, desc string) *Excitation { e := new(Excitation) e.name = name e.perRegion.init(3, "_"+name+"_perRegion", unit, nil) // name starts with underscore: unexported DeclLValue(name, e, cat(desc, unit)) return e } func (p *Excitation) MSlice() cuda.MSlice { buf, r := p.Slice() util.Assert(r == true) return cuda.ToMSlice(buf) } func (e *Excitation) AddTo(dst *data.Slice) { if !e.perRegion.isZero() { cuda.RegionAddV(dst, e.perRegion.gpuLUT(), regions.Gpu()) } for _, t := range e.extraTerms { var mul float32 = 1 if t.mul != nil { mul = float32(t.mul()) } cuda.Madd2(dst, dst, t.mask, 1, mul) } } func (e *Excitation) isZero() bool { return e.perRegion.isZero() && len(e.extraTerms) == 0 } func (e *Excitation) Slice() (*data.Slice, bool) { buf := cuda.Buffer(e.NComp(), e.Mesh().Size()) cuda.Zero(buf) e.AddTo(buf) return buf, true } // After resizing the mesh, the extra terms don't fit the grid anymore // and there is no reasonable way to resize them. So remove them and have // the user re-add them. func (e *Excitation) RemoveExtraTerms() { if len(e.extraTerms) == 0 { return } LogOut("REMOVING EXTRA TERMS FROM", e.Name()) for _, m := range e.extraTerms { m.mask.Free() } e.extraTerms = nil } // Add an extra mask*multiplier term to the excitation. 
func (e *Excitation) Add(mask *data.Slice, f script.ScalarFunction) { var mul func() float64 if f != nil { if IsConst(f) { val := f.Float() mul = func() float64 { return val } } else { mul = func() float64 { return f.Float() } } } e.AddGo(mask, mul) } // An Add(mask, f) equivalent for Go use func (e *Excitation) AddGo(mask *data.Slice, mul func() float64) { if mask != nil { checkNaN(mask, e.Name()+".add()") // TODO: in more places mask = data.Resample(mask, e.Mesh().Size()) mask = assureGPU(mask) } e.extraTerms = append(e.extraTerms, mulmask{mul, mask}) } func (e *Excitation) SetRegion(region int, f script.VectorFunction) { e.perRegion.SetRegion(region, f) } func (e *Excitation) SetValue(v interface{}) { e.perRegion.SetValue(v) } func (e *Excitation) Set(v data.Vector) { e.perRegion.setRegions(0, NREGION, slice(v)) } func (e *Excitation) getRegion(region int) []float64 { return e.perRegion.getRegion(region) } // for gui func (e *Excitation) SetRegionFn(region int, f func() [3]float64) { e.perRegion.setFunc(region, region+1, func() []float64 { return slice(f()) }) } func (e *Excitation) average() []float64 { return qAverageUniverse(e) } func (e *Excitation) Average() data.Vector { return unslice(qAverageUniverse(e)) } func (e *Excitation) IsUniform() bool { return e.perRegion.IsUniform() } func (e *Excitation) Name() string { return e.name } func (e *Excitation) Unit() string { return e.perRegion.Unit() } func (e *Excitation) NComp() int { return e.perRegion.NComp() } func (e *Excitation) Mesh() *data.Mesh { return Mesh() } func (e *Excitation) Region(r int) *vOneReg { return vOneRegion(e, r) } func (e *Excitation) Comp(c int) ScalarField { return Comp(e, c) } func (e *Excitation) Eval() interface{} { return e } func (e *Excitation) Type() reflect.Type { return reflect.TypeOf(new(Excitation)) } func (e *Excitation) InputType() reflect.Type { return script.VectorFunction_t } func (e *Excitation) EvalTo(dst *data.Slice) { EvalTo(e, dst) } func checkNaN(s *data.Slice, 
name string) { h := s.Host() for _, h := range h { for _, v := range h { if math.IsNaN(float64(v)) || math.IsInf(float64(v), 0) { util.Fatal("NaN or Inf in", name) } } } } mumax3-3.10/engine/ext_angles.go000066400000000000000000000005601371432437400166050ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" ) var ( ext_phi = NewScalarField("ext_phi", "rad", "Azimuthal angle", SetPhi) ext_theta = NewScalarField("ext_theta", "rad", "Polar angle", SetTheta) ) func SetPhi(dst *data.Slice) { cuda.SetPhi(dst, M.Buffer()) } func SetTheta(dst *data.Slice) { cuda.SetTheta(dst, M.Buffer()) } mumax3-3.10/engine/ext_bubblepos.go000066400000000000000000000041601371432437400173110ustar00rootroot00000000000000package engine import ( "math" ) var ( BubblePos = NewVectorValue("ext_bubblepos", "m", "Bubble core position", bubblePos) BubbleDist = NewScalarValue("ext_bubbledist", "m", "Bubble traveled distance", bubbleDist) BubbleSpeed = NewScalarValue("ext_bubblespeed", "m/s", "Bubble velocity", bubbleSpeed) BubbleMz = 1.0 ) func init() { DeclVar("ext_BubbleMz", &BubbleMz, "Center magnetization 1.0 or -1.0 (default = 1.0)") } func bubblePos() []float64 { m := M.Buffer() n := Mesh().Size() c := Mesh().CellSize() mz := m.Comp(Z).HostCopy().Scalars()[0] posx, posy := 0., 0. if BubbleMz != -1.0 && BubbleMz != 1.0 { panic("ext_BubbleMz should be 1.0 or -1.0") } { var magsum float32 var weightedsum float32 for iy := range mz { for ix := range mz[0] { magsum += ((mz[iy][ix]*float32(BubbleMz) + 1.) / 2.) weightedsum += ((mz[iy][ix]*float32(BubbleMz) + 1.) / 2.) * float32(iy) } } posy = float64(weightedsum / magsum) } { var magsum float32 var weightedsum float32 for ix := range mz[0] { for iy := range mz { magsum += ((mz[iy][ix]*float32(BubbleMz) + 1.) / 2.) weightedsum += ((mz[iy][ix]*float32(BubbleMz) + 1.) / 2.) 
* float32(ix) } } posx = float64(weightedsum / magsum) } return []float64{(posx-float64(n[X]/2))*c[X] + GetShiftPos(), (posy-float64(n[Y]/2))*c[Y] + GetShiftYPos(), 0} } var ( prevBpos = [2]float64{-1e99, -1e99} bdist = 0.0 ) func bubbleDist() float64 { pos := bubblePos() if prevBpos == [2]float64{-1e99, -1e99} { prevBpos = [2]float64{pos[X], pos[Y]} return 0 } w := Mesh().WorldSize() dx := pos[X] - prevBpos[X] dy := pos[Y] - prevBpos[Y] prevBpos = [2]float64{pos[X], pos[Y]} // PBC wrap if dx > w[X]/2 { dx -= w[X] } if dx < -w[X]/2 { dx += w[X] } if dy > w[Y]/2 { dy -= w[Y] } if dy < -w[Y]/2 { dy += w[Y] } bdist += math.Sqrt(dx*dx + dy*dy) return bdist } var ( prevBdist = 0.0 prevBt = -999.0 ) func bubbleSpeed() float64 { dist := bubbleDist() if prevBt < 0 { prevBdist = dist prevBt = Time return 0 } v := (dist - prevBdist) / (Time - prevBt) prevBt = Time prevBdist = dist return v } mumax3-3.10/engine/ext_centerbubble.go000066400000000000000000000016671371432437400200010ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/data" "math" ) func init() { DeclFunc("ext_centerBubble", CenterBubble, "centerBubble shifts m after each step to keep the bubble position close to the center of the window") } func centerBubble() { c := Mesh().CellSize() position := bubblePos() var centerIdx [2]int centerIdx[X] = int(math.Floor((position[X] - GetShiftPos()) / c[X])) centerIdx[Y] = int(math.Floor((position[Y] - GetShiftYPos()) / c[Y])) zero := data.Vector{0, 0, 0} if ShiftMagL == zero || ShiftMagR == zero || ShiftMagD == zero || ShiftMagU == zero { ShiftMagL[Z] = -BubbleMz ShiftMagR[Z] = -BubbleMz ShiftMagD[Z] = -BubbleMz ShiftMagU[Z] = -BubbleMz } //put bubble to center if centerIdx[X] != 0 { Shift(-centerIdx[X]) } if centerIdx[Y] != 0 { YShift(-centerIdx[Y]) } } // This post-step function centers the simulation window on a bubble func CenterBubble() { PostStep(func() { centerBubble() }) } 
mumax3-3.10/engine/ext_centerwall.go000066400000000000000000000042251371432437400174760ustar00rootroot00000000000000package engine import ( "fmt" "github.com/mumax/3/data" ) var ( DWPos = NewScalarValue("ext_dwpos", "m", "Position of the simulation window while following a domain wall", GetShiftPos) // TODO: make more accurate DWxPos = NewScalarValue("ext_dwxpos", "m", "Position of the simulation window while following a domain wall", GetDWxPos) DWSpeed = NewScalarValue("ext_dwspeed", "m/s", "Speed of the simulation window while following a domain wall", getShiftSpeed) ) func init() { DeclFunc("ext_centerWall", CenterWall, "centerWall(c) shifts m after each step to keep m_c close to zero") } func centerWall(c int) { M := &M mc := sAverageUniverse(M.Buffer().Comp(c))[0] n := Mesh().Size() tolerance := 4 / float64(n[X]) // x*2 * expected change for 1 cell shift zero := data.Vector{0, 0, 0} if ShiftMagL == zero || ShiftMagR == zero { sign := magsign(M.GetCell(0, n[Y]/2, n[Z]/2)[c]) ShiftMagL[c] = float64(sign) ShiftMagR[c] = -float64(sign) } sign := magsign(ShiftMagL[c]) //log.Println("mc", mc, "tol", tolerance) if mc < -tolerance { Shift(sign) } else if mc > tolerance { Shift(-sign) } } // This post-step function centers the simulation window on a domain wall // between up-down (or down-up) domains (like in perpendicular media). E.g.: // PostStep(CenterPMAWall) func CenterWall(magComp int) { PostStep(func() { centerWall(magComp) }) } func magsign(x float64) int { if x > 0.1 { return 1 } if x < -0.1 { return -1 } panic(fmt.Errorf("center wall: unclear in which direction to shift: magnetization at border=%v. 
Set ShiftMagL, ShiftMagR", x)) } // used for speed var ( lastShift float64 // shift the last time we queried speed lastT float64 // time the last time we queried speed lastV float64 // speed the last time we queried speed ) func getShiftSpeed() float64 { if lastShift != GetShiftPos() { lastV = (GetShiftPos() - lastShift) / (Time - lastT) lastShift = GetShiftPos() lastT = Time } return lastV } func GetDWxPos() float64 { M := &M mx := sAverageUniverse(M.Buffer().Comp(0))[0] c := Mesh().CellSize() n := Mesh().Size() position := mx * c[0] * float64(n[0]) / 2. return GetShiftPos() + position } mumax3-3.10/engine/ext_corepos.go000066400000000000000000000025611371432437400170110ustar00rootroot00000000000000package engine var CorePos = NewVectorValue("ext_corepos", "m", "Vortex core position (x,y) + polarization (z)", corePos) func corePos() []float64 { m := M.Buffer() m_z := m.Comp(Z).HostCopy().Scalars() s := m.Size() Nx, Ny, Nz := s[X], s[Y], s[Z] max := float32(-1.0) var maxX, maxY, maxZ int for z := 0; z < Nz; z++ { // Avoid the boundaries so the neighbor interpolation can't go out of bounds. 
for y := 1; y < Ny-1; y++ { for x := 1; x < Nx-1; x++ { m := abs(m_z[z][y][x]) if m > max { maxX, maxY, maxZ = x, y, z max = m } } } } pos := make([]float64, 3) mz := m_z[maxZ] // sub-cell interpolation in X and Y, but not Z pos[X] = float64(maxX) + interpolate_maxpos( max, -1, abs(mz[maxY][maxX-1]), 1, abs(mz[maxY][maxX+1])) - float64(Nx)/2 + 0.5 pos[Y] = float64(maxY) + interpolate_maxpos( max, -1, abs(mz[maxY-1][maxX]), 1, abs(mz[maxY+1][maxX])) - float64(Ny)/2 + 0.5 c := Mesh().CellSize() pos[X] *= c[X] pos[Y] *= c[Y] pos[Z] = float64(m_z[maxZ][maxY][maxX]) // 3rd coordinate is core polarization pos[X] += GetShiftPos() // add simulation window shift return pos } func interpolate_maxpos(f0, d1, f1, d2, f2 float32) float64 { b := (f2 - f1) / (d2 - d1) a := ((f2-f0)/d2 - (f0-f1)/(-d1)) / (d2 - d1) return float64(-b / (2 * a)) } func abs(x float32) float32 { if x > 0 { return x } else { return -x } } mumax3-3.10/engine/ext_dwtilt.go000066400000000000000000000011571371432437400166460ustar00rootroot00000000000000package engine import ( "math" ) // PMA domain wall tilt assuming straight wall. var DWTiltPMA = NewScalarValue("ext_dwtilt", "rad", "PMA domain wall tilt", dwTiltPMA) func dwTiltPMA() float64 { m := Download(&M) mz := m.Vectors()[Z][0] // slice0 nx := Mesh().Size()[X] ny := Mesh().Size()[Y] // find domain wall at these y positions: y1 := 4 y2 := ny - 5 // search for x values where mz = 0 (=wall) x1, x2 := 0, 0 for i := 1; i < nx; i++ { if mz[y1][i-1]*mz[y1][i] < 0 { x1 = i } if mz[y2][i-1]*mz[y2][i] < 0 { x2 = i } } angle := math.Atan(float64(x1-x2) / float64(y1-y2)) return angle } mumax3-3.10/engine/ext_magnetoelastic.go000066400000000000000000000072271371432437400203420ustar00rootroot00000000000000package engine // Mangeto-elastic coupling. 
import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/util" ) var ( B1 = NewScalarParam("B1", "J/m3", "First magneto-elastic coupling constant") B2 = NewScalarParam("B2", "J/m3", "Second magneto-elastic coupling constant") exx = NewScalarExcitation("exx", "", "exx component of the strain tensor") eyy = NewScalarExcitation("eyy", "", "eyy component of the strain tensor") ezz = NewScalarExcitation("ezz", "", "ezz component of the strain tensor") exy = NewScalarExcitation("exy", "", "exy component of the strain tensor") exz = NewScalarExcitation("exz", "", "exz component of the strain tensor") eyz = NewScalarExcitation("eyz", "", "eyz component of the strain tensor") B_mel = NewVectorField("B_mel", "T", "Magneto-elastic filed", AddMagnetoelasticField) F_mel = NewVectorField("F_mel", "N/m3", "Magneto-elastic force density", GetMagnetoelasticForceDensity) Edens_mel = NewScalarField("Edens_mel", "J/m3", "Magneto-elastic energy density", AddMagnetoelasticEnergyDensity) E_mel = NewScalarValue("E_mel", "J", "Magneto-elastic energy", GetMagnetoelasticEnergy) ) var ( zeroMel = NewScalarParam("_zeroMel", "", "utility zero parameter") ) func init() { registerEnergy(GetMagnetoelasticEnergy, AddMagnetoelasticEnergyDensity) } func AddMagnetoelasticField(dst *data.Slice) { haveMel := B1.nonZero() || B2.nonZero() if !haveMel { return } Exx := exx.MSlice() defer Exx.Recycle() Eyy := eyy.MSlice() defer Eyy.Recycle() Ezz := ezz.MSlice() defer Ezz.Recycle() Exy := exy.MSlice() defer Exy.Recycle() Exz := exz.MSlice() defer Exz.Recycle() Eyz := eyz.MSlice() defer Eyz.Recycle() b1 := B1.MSlice() defer b1.Recycle() b2 := B2.MSlice() defer b2.Recycle() ms := Msat.MSlice() defer ms.Recycle() cuda.AddMagnetoelasticField(dst, M.Buffer(), Exx, Eyy, Ezz, Exy, Exz, Eyz, b1, b2, ms) } func GetMagnetoelasticForceDensity(dst *data.Slice) { haveMel := B1.nonZero() || B2.nonZero() if !haveMel { return } util.AssertMsg(B1.IsUniform() && B2.IsUniform(), "Magnetoelastic: B1, B2 
must be uniform") b1 := B1.MSlice() defer b1.Recycle() b2 := B2.MSlice() defer b2.Recycle() cuda.GetMagnetoelasticForceDensity(dst, M.Buffer(), b1, b2, M.Mesh()) } func AddMagnetoelasticEnergyDensity(dst *data.Slice) { haveMel := B1.nonZero() || B2.nonZero() if !haveMel { return } buf := cuda.Buffer(B_mel.NComp(), B_mel.Mesh().Size()) defer cuda.Recycle(buf) // unnormalized magnetization: Mf := ValueOf(M_full) defer cuda.Recycle(Mf) Exx := exx.MSlice() defer Exx.Recycle() Eyy := eyy.MSlice() defer Eyy.Recycle() Ezz := ezz.MSlice() defer Ezz.Recycle() Exy := exy.MSlice() defer Exy.Recycle() Exz := exz.MSlice() defer Exz.Recycle() Eyz := eyz.MSlice() defer Eyz.Recycle() b1 := B1.MSlice() defer b1.Recycle() b2 := B2.MSlice() defer b2.Recycle() ms := Msat.MSlice() defer ms.Recycle() zeromel := zeroMel.MSlice() defer zeromel.Recycle() // 1st cuda.Zero(buf) cuda.AddMagnetoelasticField(buf, M.Buffer(), Exx, Eyy, Ezz, Exy, Exz, Eyz, b1, zeromel, ms) cuda.AddDotProduct(dst, -1./2., buf, Mf) // 1nd cuda.Zero(buf) cuda.AddMagnetoelasticField(buf, M.Buffer(), Exx, Eyy, Ezz, Exy, Exz, Eyz, zeromel, b2, ms) cuda.AddDotProduct(dst, -1./1., buf, Mf) } // Returns magneto-ell energy in joules. func GetMagnetoelasticEnergy() float64 { haveMel := B1.nonZero() || B2.nonZero() if !haveMel { return float64(0.0) } buf := cuda.Buffer(1, Mesh().Size()) defer cuda.Recycle(buf) cuda.Zero(buf) AddMagnetoelasticEnergyDensity(buf) return cellVolume() * float64(cuda.Sum(buf)) } mumax3-3.10/engine/ext_make3dgrains.go000066400000000000000000000103431371432437400177040ustar00rootroot00000000000000// 3D Voronoi tessellation. Contributed by Peyton Murray. 
package engine

import (
	"math"
	"math/rand"
)

func init() {
	DeclFunc("ext_make3dgrains", Voronoi3d, "3D Voronoi tesselation over shape (grain size, starting region number, num regions, shape, seed)")
}

// Voronoi3d renders a 3D Voronoi tessellation onto the regions map.
// Cells inside inputShape get the region number of the nearest Voronoi
// center; region numbers are drawn from [startRegion, startRegion+numRegions).
// seed initialises the random generator used to draw the centers.
func Voronoi3d(grainsize float64, startRegion int, numRegions int, inputShape Shape, seed int) {
	Refer("Lel2014")
	SetBusy(true)
	defer SetBusy(false)
	t := newTesselation3d(grainsize, numRegions, int64(seed), startRegion, inputShape)
	regions.hist = append(regions.hist, t.RegionOf)
	regions.render(t.RegionOf)
}

type tesselation3d struct {
	grainsize   float64
	maxRegion   int
	rnd         *rand.Rand
	startRegion int
	shape       Shape      // Shape of the tesselated region
	centers     []center3d // List of Voronoi centers
}

// Stores location of each Voronoi center
type center3d struct {
	x, y, z float64 // center position (m)
	region  byte    // region for all cells near center
}

// Stores location of each cell
type cellLocs struct{ x, y, z float64 }

// newTesselation3d builds a tessellation and draws its Voronoi centers.
// nRegion is exclusive: regions are drawn as startRegion + [0, nRegion).
func newTesselation3d(grainsize float64, nRegion int, seed int64, startRegion int, inputShape Shape) *tesselation3d {
	t := tesselation3d{
		grainsize:   grainsize,
		maxRegion:   nRegion,
		rnd:         rand.New(rand.NewSource(seed)),
		startRegion: startRegion,
		shape:       inputShape,
		centers:     make([]center3d, 0),
	}
	t.makeRandomCenters()
	return &t
}

// shuffleCells returns a randomly permuted copy of src.
// makeRandomCenters only draws centers from a prefix of the cell list, so the
// permutation is what spreads the centers over the whole shape.
// NOTE(review): this uses the package-level math/rand source, not t.rnd, so
// the permutation is not controlled by the seed passed to Voronoi3d.
func shuffleCells(src []cellLocs) []cellLocs {
	dest := make([]cellLocs, len(src))
	perm := rand.Perm(len(src))
	for i, v := range perm {
		dest[v] = src[i]
	}
	return dest
}

// makeRandomCenters draws the Voronoi centers from the cells inside the shape
// and gives each a random region in [startRegion, startRegion+maxRegion).
// If the shape contains no cells, no centers are created.
func (t *tesselation3d) makeRandomCenters() {
	// Make a list of all the cells in the shape, in random order.
	cells := shuffleCells(t.tabulateCells())

	// Nothing to tessellate. Without this guard the draw below would index
	// into an empty slice and panic (truncNorm never returns less than 1).
	if len(cells) == 0 {
		return
	}

	// Choose number of grains to make. A grain is modelled as a sphere with
	// diameter grainsize: V = (4/3)*pi*r^3 = (pi/6)*d^3.
	shapeVolume := cellVolume() * float64(len(cells))
	grainVolume := (float64(1) / 6) * math.Pi * t.grainsize * t.grainsize * t.grainsize
	nAvgGrains := shapeVolume / grainVolume
	nGrains := t.truncNorm(nAvgGrains)

	// Centers are drawn from the first nGrains shuffled cells. Clamp that
	// bound to the number of available cells so that a grain size below the
	// cell size cannot index past the end of the slice.
	pool := nGrains
	if pool > len(cells) {
		pool = len(cells)
	}

	//TODO: same cell can be chosen twice by random chance
	t.centers = make([]center3d, nGrains)
	for p := range t.centers {
		rndCell := cells[t.rnd.Intn(pool)]
		t.centers[p].x = rndCell.x
		t.centers[p].y = rndCell.y
		t.centers[p].z = rndCell.z
		randRegion := t.startRegion + t.rnd.Intn(t.maxRegion)
		// NOTE(review): region is stored in a byte; values >= 256 wrap around.
		t.centers[p].region = byte(randRegion)
	}
}

// Creates a slice of all cells which fall in the shape specified in the constructor.
func (t *tesselation3d) tabulateCells() []cellLocs {
	var cells []cellLocs

	// Get the mesh size
	meshSize := MeshSize()

	// Iterate across all cells in the mesh, and append those that are inside the shape
	for ix := 0; ix < meshSize[0]; ix++ {
		for iy := 0; iy < meshSize[1]; iy++ {
			for iz := 0; iz < meshSize[2]; iz++ {
				cell := Index2Coord(ix, iy, iz)
				x := cell.X()
				y := cell.Y()
				z := cell.Z()
				if t.shape(x, y, z) {
					cells = append(cells, cellLocs{x, y, z})
				}
			}
		}
	}

	print("Number of cells in region: ", len(cells), "\n")
	print("Number of cells in universe: ", meshSize[0]*meshSize[1]*meshSize[2], "\n")

	return cells
}

// Find the nearest Voronoi center to the point (x, y, z). Only points inside the given shape will be
// assigned a region.
func (t *tesselation3d) RegionOf(x, y, z float64) int {
	if !t.shape(x, y, z) {
		return -1 // When the regions are rendered, any region < 0 will not be rastered.
	}
	nearest := center3d{x, y, z, 0} // dummy start value; yields region 0 if there are no centers
	mindist := math.Inf(1)
	for _, c := range t.centers {
		dist := sqr(x-c.x) + sqr(y-c.y) + sqr(z-c.z)
		if dist < mindist {
			nearest = c
			mindist = dist
		}
	}
	return int(nearest.region)
}

// Generate normally distributed numbers; mean = lambda, variance = lambda. If generated number < 0, return 1.
// Equivalent to Poisson distribution (with mean = lambda) for large lambda (which is usually true, since the volume // of a grain is usually much less than the simulation volume. func (t *tesselation3d) truncNorm(lambda float64) int { ret := lambda + math.Sqrt(lambda)*t.rnd.NormFloat64() if ret <= 0 { return 1 } else { return int(ret + 0.5) } } mumax3-3.10/engine/ext_makegrains.go000066400000000000000000000060011371432437400174510ustar00rootroot00000000000000package engine import ( "math" "math/rand" ) func init() { DeclFunc("ext_makegrains", Voronoi, "Voronoi tesselation (grain size, num regions)") } func Voronoi(grainsize float64, numRegions, seed int) { Refer("Lel2014") SetBusy(true) defer SetBusy(false) t := newTesselation(grainsize, numRegions, int64(seed)) regions.hist = append(regions.hist, t.RegionOf) regions.render(t.RegionOf) } type tesselation struct { grainsize float64 tilesize float64 maxRegion int cache map[int2][]center seed int64 rnd *rand.Rand } // integer tile coordinate type int2 struct{ x, y int } // Voronoi center info type center struct { x, y float64 // center position (m) region byte // region for all cells near center } // nRegion exclusive func newTesselation(grainsize float64, nRegion int, seed int64) *tesselation { return &tesselation{grainsize, float64(float32(grainsize * TILE)), // expect 4 grains/block, 36 per 3x3 blocks = safe, relatively round number nRegion, make(map[int2][]center), seed, rand.New(rand.NewSource(0))} } const ( TILE = 2 // tile size in grains LAMBDA = TILE * TILE // expected grains per tile ) // Returns the region of the grain where cell at x,y,z belongs to func (t *tesselation) RegionOf(x, y, z float64) int { tile := t.tileOf(x, y) // tile containing x,y // look for nearest center in tile + neighbors nearest := center{x, y, 0} // dummy initial value, but safe should the infinite impossibility strike. 
mindist := math.Inf(1) for tx := tile.x - 1; tx <= tile.x+1; tx++ { for ty := tile.y - 1; ty <= tile.y+1; ty++ { centers := t.centersInTile(tx, ty) for _, c := range centers { dist := sqr(x-c.x) + sqr(y-c.y) if dist < mindist { nearest = c mindist = dist } } } } //fmt.Println("nearest", x, y, ":", nearest) return int(nearest.region) } // Returns the list of Voronoi centers in tile(ix, iy), using only ix,iy to seed the random generator func (t *tesselation) centersInTile(tx, ty int) []center { pos := int2{tx, ty} if c, ok := t.cache[pos]; ok { return c } else { // tile-specific seed that works for positive and negative tx, ty seed := (int64(ty)+(1<<24))*(1<<24) + (int64(tx) + (1 << 24)) t.rnd.Seed(seed ^ t.seed) N := t.poisson(LAMBDA) c := make([]center, N) // absolute position of tile (m) x0, y0 := float64(tx)*t.tilesize, float64(ty)*t.tilesize for i := range c { // random position inside tile c[i].x = x0 + t.rnd.Float64()*t.tilesize c[i].y = y0 + t.rnd.Float64()*t.tilesize c[i].region = byte(t.rnd.Intn(t.maxRegion)) } t.cache[pos] = c return c } } func sqr(x float64) float64 { return x * x } func (t *tesselation) tileOf(x, y float64) int2 { ix := int(math.Floor(x / t.tilesize)) iy := int(math.Floor(y / t.tilesize)) return int2{ix, iy} } // Generate poisson distributed numbers (according to Knuth) func (t *tesselation) poisson(lambda float64) int { L := math.Exp(-lambda) k := 1 p := t.rnd.Float64() for p > L { k++ p *= t.rnd.Float64() } return k - 1 } mumax3-3.10/engine/ext_rmsurfacecharge.go000066400000000000000000000052001371432437400204710ustar00rootroot00000000000000package engine import ( "fmt" "github.com/mumax/3/data" "github.com/mumax/3/mag" "github.com/mumax/3/util" "math" ) func init() { DeclFunc("ext_rmSurfaceCharge", RemoveLRSurfaceCharge, "Compensate magnetic charges on the left and right sides of an in-plane magnetized wire. 
Arguments: region, mx on left and right side, resp.") } // For a nanowire magnetized in-plane, with mx = mxLeft on the left end and // mx = mxRight on the right end (both -1 or +1), add a B field needed to compensate // for the surface charges on the left and right edges. // This will mimic an infinitely long wire. func RemoveLRSurfaceCharge(region int, mxLeft, mxRight float64) { SetBusy(true) defer SetBusy(false) util.Argument(mxLeft == 1 || mxLeft == -1) util.Argument(mxRight == 1 || mxRight == -1) bsat := Msat.GetRegion(region) * mag.Mu0 util.AssertMsg(bsat != 0, "RemoveSurfaceCharges: Msat is zero in region "+fmt.Sprint(region)) B_ext.Add(compensateLRSurfaceCharges(Mesh(), mxLeft, mxRight, bsat), nil) } func compensateLRSurfaceCharges(m *data.Mesh, mxLeft, mxRight float64, bsat float64) *data.Slice { h := data.NewSlice(3, m.Size()) H := h.Vectors() world := m.WorldSize() cell := m.CellSize() size := m.Size() q := cell[Z] * cell[Y] * bsat q1 := q * mxLeft q2 := q * (-mxRight) prog, maxProg := 0, (size[Z]+1)*(size[Y]+1) // surface loop (source) for I := 0; I < size[Z]; I++ { for J := 0; J < size[Y]; J++ { prog++ util.Progress(prog, maxProg, "removing surface charges") y := (float64(J) + 0.5) * cell[Y] z := (float64(I) + 0.5) * cell[Z] source1 := [3]float64{0, y, z} // left surface source source2 := [3]float64{world[X], y, z} // right surface source // volume loop (destination) for iz := range H[0] { for iy := range H[0][iz] { for ix := range H[0][iz][iy] { dst := [3]float64{ // destination coordinate (float64(ix) + 0.5) * cell[X], (float64(iy) + 0.5) * cell[Y], (float64(iz) + 0.5) * cell[Z]} h1 := hfield(q1, source1, dst) h2 := hfield(q2, source2, dst) // add this surface charges' field to grand total for c := 0; c < 3; c++ { H[c][iz][iy][ix] += float32(h1[c] + h2[c]) } } } } } } return h } // H field of charge at location source, evaluated in location dest. 
func hfield(charge float64, source, dest [3]float64) [3]float64 { var R [3]float64 R[0] = dest[0] - source[0] R[1] = dest[1] - source[1] R[2] = dest[2] - source[2] r := math.Sqrt(R[0]*R[0] + R[1]*R[1] + R[2]*R[2]) qr3pi4 := charge / ((4 * math.Pi) * r * r * r) var h [3]float64 h[0] = R[0] * qr3pi4 h[1] = R[1] * qr3pi4 h[2] = R[2] * qr3pi4 return h } mumax3-3.10/engine/ext_topologicalcharge.go000066400000000000000000000013451371432437400210240ustar00rootroot00000000000000package engine import ( "math" "github.com/mumax/3/cuda" "github.com/mumax/3/data" ) var ( Ext_TopologicalCharge = NewScalarValue("ext_topologicalcharge", "", "2D topological charge", GetTopologicalCharge) Ext_TopologicalChargeDensity = NewScalarField("ext_topologicalchargedensity", "1/m2", "2D topological charge density m·(∂m/∂x ✕ ∂m/∂y)", SetTopologicalChargeDensity) ) func SetTopologicalChargeDensity(dst *data.Slice) { cuda.SetTopologicalCharge(dst, M.Buffer(), M.Mesh()) } func GetTopologicalCharge() float64 { s := ValueOf(Ext_TopologicalChargeDensity) defer cuda.Recycle(s) c := Mesh().CellSize() N := Mesh().Size() return (0.25 * c[X] * c[Y] / math.Pi / float64(N[Z])) * float64(cuda.Sum(s)) } mumax3-3.10/engine/ext_topologicalchargelattice.go000066400000000000000000000015371371432437400223750ustar00rootroot00000000000000package engine import ( "math" "github.com/mumax/3/cuda" "github.com/mumax/3/data" ) var ( Ext_TopologicalChargeLattice = NewScalarValue("ext_topologicalchargelattice", "", "2D topological charge according to Berg and Lüscher", GetTopologicalChargeLattice) Ext_TopologicalChargeDensityLattice = NewScalarField("ext_topologicalchargedensitylattice", "1/m2", "2D topological charge density according to Berg and Lüscher", SetTopologicalChargeDensityLattice) ) func SetTopologicalChargeDensityLattice(dst *data.Slice) { Refer("Berg1981") cuda.SetTopologicalChargeLattice(dst, M.Buffer(), M.Mesh()) } func GetTopologicalChargeLattice() float64 { s := 
ValueOf(Ext_TopologicalChargeDensityLattice) defer cuda.Recycle(s) c := Mesh().CellSize() N := Mesh().Size() return (0.25 * c[X] * c[Y] / math.Pi / float64(N[Z])) * float64(cuda.Sum(s)) } mumax3-3.10/engine/geom.go000066400000000000000000000134771371432437400154160ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/util" "math/rand" ) func init() { DeclFunc("SetGeom", SetGeom, "Sets the geometry to a given shape") DeclVar("EdgeSmooth", &edgeSmooth, "Geometry edge smoothing with edgeSmooth^3 samples per cell, 0=staircase, ~8=very smooth") geometry.init() } var ( geometry geom edgeSmooth int = 0 // disabled by default ) type geom struct { info buffer *data.Slice shape Shape } func (g *geom) init() { g.buffer = nil g.info = info{1, "geom", ""} DeclROnly("geom", g, "Cell fill fraction (0..1)") } func spaceFill() float64 { if geometry.Gpu().IsNil() { return 1 } else { return float64(cuda.Sum(geometry.buffer)) / float64(geometry.Mesh().NCell()) } } func (g *geom) Gpu() *data.Slice { if g.buffer == nil { g.buffer = data.NilSlice(1, g.Mesh().Size()) } return g.buffer } func (g *geom) Slice() (*data.Slice, bool) { s := g.Gpu() if s.IsNil() { s := cuda.Buffer(g.NComp(), g.Mesh().Size()) cuda.Memset(s, 1) return s, true } else { return s, false } } func (q *geom) EvalTo(dst *data.Slice) { EvalTo(q, dst) } var _ Quantity = &geometry func (g *geom) average() []float64 { s, r := g.Slice() if r { defer cuda.Recycle(s) } return sAverageUniverse(s) } func (g *geom) Average() float64 { return g.average()[0] } func SetGeom(s Shape) { geometry.setGeom(s) } func (geometry *geom) setGeom(s Shape) { SetBusy(true) defer SetBusy(false) if s == nil { // TODO: would be nice not to save volume if entirely filled s = universe } geometry.shape = s if geometry.Gpu().IsNil() { geometry.buffer = cuda.NewSlice(1, geometry.Mesh().Size()) } host := data.NewSlice(1, geometry.Gpu().Size()) array := host.Scalars() V := host v := 
array n := geometry.Mesh().Size() c := geometry.Mesh().CellSize() cx, cy, cz := c[X], c[Y], c[Z] progress, progmax := 0, n[Y]*n[Z] var ok bool for iz := 0; iz < n[Z]; iz++ { for iy := 0; iy < n[Y]; iy++ { progress++ util.Progress(progress, progmax, "Initializing geometry") for ix := 0; ix < n[X]; ix++ { r := Index2Coord(ix, iy, iz) x0, y0, z0 := r[X], r[Y], r[Z] // check if center and all vertices lie inside or all outside allIn, allOut := true, true if s(x0, y0, z0) { allOut = false } else { allIn = false } if edgeSmooth != 0 { // center is sufficient if we're not really smoothing for _, Δx := range []float64{-cx / 2, cx / 2} { for _, Δy := range []float64{-cy / 2, cy / 2} { for _, Δz := range []float64{-cz / 2, cz / 2} { if s(x0+Δx, y0+Δy, z0+Δz) { // inside allOut = false } else { allIn = false } } } } } switch { case allIn: v[iz][iy][ix] = 1 ok = true case allOut: v[iz][iy][ix] = 0 default: v[iz][iy][ix] = geometry.cellVolume(ix, iy, iz) ok = ok || (v[iz][iy][ix] != 0) } } } } if !ok { util.Fatal("SetGeom: geometry completely empty") } data.Copy(geometry.buffer, V) // M inside geom but previously outside needs to be re-inited needupload := false geomlist := host.Host()[0] mhost := M.Buffer().HostCopy() m := mhost.Host() rng := rand.New(rand.NewSource(0)) for i := range m[0] { if geomlist[i] != 0 { mx, my, mz := m[X][i], m[Y][i], m[Z][i] if mx == 0 && my == 0 && mz == 0 { needupload = true rnd := randomDir(rng) m[X][i], m[Y][i], m[Z][i] = float32(rnd[X]), float32(rnd[Y]), float32(rnd[Z]) } } } if needupload { data.Copy(M.Buffer(), mhost) } M.normalize() // removes m outside vol } // Sample edgeSmooth^3 points inside the cell to estimate its volume. 
func (g *geom) cellVolume(ix, iy, iz int) float32 { r := Index2Coord(ix, iy, iz) x0, y0, z0 := r[X], r[Y], r[Z] c := geometry.Mesh().CellSize() cx, cy, cz := c[X], c[Y], c[Z] s := geometry.shape var vol float32 N := edgeSmooth S := float64(edgeSmooth) for dx := 0; dx < N; dx++ { Δx := -cx/2 + (cx / (2 * S)) + (cx/S)*float64(dx) for dy := 0; dy < N; dy++ { Δy := -cy/2 + (cy / (2 * S)) + (cy/S)*float64(dy) for dz := 0; dz < N; dz++ { Δz := -cz/2 + (cz / (2 * S)) + (cz/S)*float64(dz) if s(x0+Δx, y0+Δy, z0+Δz) { // inside vol++ } } } } return vol / float32(N*N*N) } func (g *geom) shift(dx int) { // empty mask, nothing to do if g == nil || g.buffer.IsNil() { return } // allocated mask: shift s := g.buffer s2 := cuda.Buffer(1, g.Mesh().Size()) defer cuda.Recycle(s2) newv := float32(1) // initially fill edges with 1's cuda.ShiftX(s2, s, dx, newv, newv) data.Copy(s, s2) n := Mesh().Size() x1, x2 := shiftDirtyRange(dx) for iz := 0; iz < n[Z]; iz++ { for iy := 0; iy < n[Y]; iy++ { for ix := x1; ix < x2; ix++ { r := Index2Coord(ix, iy, iz) // includes shift if !g.shape(r[X], r[Y], r[Z]) { cuda.SetCell(g.buffer, 0, ix, iy, iz, 0) // a bit slowish, but hardly reached } } } } } func (g *geom) shiftY(dy int) { // empty mask, nothing to do if g == nil || g.buffer.IsNil() { return } // allocated mask: shift s := g.buffer s2 := cuda.Buffer(1, g.Mesh().Size()) defer cuda.Recycle(s2) newv := float32(1) // initially fill edges with 1's cuda.ShiftY(s2, s, dy, newv, newv) data.Copy(s, s2) n := Mesh().Size() y1, y2 := shiftDirtyRange(dy) for iz := 0; iz < n[Z]; iz++ { for ix := 0; ix < n[X]; ix++ { for iy := y1; iy < y2; iy++ { r := Index2Coord(ix, iy, iz) // includes shift if !g.shape(r[X], r[Y], r[Z]) { cuda.SetCell(g.buffer, 0, ix, iy, iz, 0) // a bit slowish, but hardly reached } } } } } // x range that needs to be refreshed after shift over dx func shiftDirtyRange(dx int) (x1, x2 int) { nx := Mesh().Size()[X] util.Argument(dx != 0) if dx < 0 { x1 = nx + dx x2 = nx } else { x1 = 0 x2 
= dx } return } func (g *geom) Mesh() *data.Mesh { return Mesh() } mumax3-3.10/engine/gofiles.go000066400000000000000000000027211371432437400161050ustar00rootroot00000000000000package engine // support for running Go files as if they were mx3 files. import ( "flag" "github.com/mumax/3/cuda" "github.com/mumax/3/util" "os" "path" ) var ( // These flags are shared between cmd/mumax3 and Go input files. Flag_cachedir = flag.String("cache", os.TempDir(), "Kernel cache directory (empty disables caching)") Flag_gpu = flag.Int("gpu", 0, "Specify GPU") Flag_interactive = flag.Bool("i", false, "Open interactive browser session") Flag_od = flag.String("o", "", "Override output directory") Flag_port = flag.String("http", ":35367", "Port to serve web gui") Flag_selftest = flag.Bool("paranoid", false, "Enable convolution self-test for cuFFT sanity.") Flag_silent = flag.Bool("s", false, "Silent") // provided for backwards compatibility Flag_sync = flag.Bool("sync", false, "Synchronize all CUDA calls (debug)") Flag_forceclean = flag.Bool("f", false, "Force start, clean existing output directory") ) // Usage: in every Go input file, write: // // func main(){ // defer InitAndClose()() // // ... // } // // This initialises the GPU, output directory, etc, // and makes sure pending output will get flushed. func InitAndClose() func() { flag.Parse() cuda.Init(*Flag_gpu) cuda.Synchronous = *Flag_sync od := *Flag_od if od == "" { od = path.Base(os.Args[0]) + ".out" } inFile := util.NoExt(od) InitIO(inFile, od, *Flag_forceclean) GoServe(*Flag_port) return func() { Close() } } mumax3-3.10/engine/gui.go000066400000000000000000000364251371432437400152510ustar00rootroot00000000000000package engine import ( "fmt" "github.com/mumax/3/cuda" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/gui" "github.com/mumax/3/httpfs" "github.com/mumax/3/util" "math/rand" "net" "net/http" "path" "reflect" "strconv" "sync" "time" ) // global GUI state stores what is currently shown in the web page. 
var ( gui_ = guistate{Quants: make(map[string]Quantity), Params: make(map[string]Param)} Timeout = 3 * time.Second // exit finished simulation this long after browser was closed ) type guistate struct { *gui.Page // GUI elements (buttons...) Quants map[string]Quantity // displayable quantities by name Params map[string]Param // displayable parameters by name render // renders displayed quantity mutex sync.Mutex // protects eventCacheBreaker and keepalive _eventCacheBreaker int // changed on any event to make sure display is updated keepalive time.Time } // Returns the time when updateKeepAlive was called. func (g *guistate) KeepAlive() time.Time { g.mutex.Lock() defer g.mutex.Unlock() return g.keepalive } // Called on each http request to signal browser is still open. func (g *guistate) UpdateKeepAlive() { g.mutex.Lock() defer g.mutex.Unlock() g.keepalive = time.Now() } func nop() {} // Enter interactive mode. Simulation is now exclusively controlled by web GUI func (g *guistate) RunInteractive() { // periodically wake up Run so it may exit on timeout go func() { for { Inject <- nop time.Sleep(1 * time.Second) } }() fmt.Println("//entering interactive mode") g.UpdateKeepAlive() for time.Since(g.KeepAlive()) < Timeout { f := <-Inject f() } fmt.Println("//browser disconnected, exiting") } // displayable quantity in GUI Parameters section type Param interface { NComp() int Name() string Unit() string getRegion(int) []float64 IsUniform() bool } func GUIAdd(name string, value interface{}) { gui_.Add(name, value) } // Internal:add a quantity to the GUI, will be visible in web interface. 
// Automatically called by Decl*(), still before PrepareServer()
func (g *guistate) Add(name string, value interface{}) {
	// A value implementing both interfaces is registered in both maps;
	// a value implementing neither is silently ignored.
	if v, ok := value.(Param); ok {
		g.Params[name] = v
	}
	if v, ok := value.(Quantity); ok {
		g.Quants[name] = v
	}
}

// Once Params/Quants have been declared and added,
// initialize the GUI Page (pre-renders template) and register http handlers
func (g *guistate) PrepareServer() {
	g.Page = gui.NewPage(templText, g)
	util.SetProgress(gui_.Prog)
	// Any GUI event bumps the cache breaker (see incCacheBreaker).
	g.OnAnyEvent(func() {
		g.incCacheBreaker()
	})

	// Handlers are registered on the default net/http mux.
	http.Handle("/", g)
	http.HandleFunc("/render/", g.ServeRender)
	http.HandleFunc("/plot/", g.servePlot)

	// Title is OD() with its last character dropped and the extension removed.
	g.Set("title", util.NoExt(OD()[:len(OD())-1]))
	g.prepareConsole()
	g.prepareMesh()
	g.prepareGeom()
	g.prepareM()
	g.prepareSolver()
	g.prepareDisplay()
	g.prepareParam()
	g.prepareOnUpdate()
}

// see prepareServer
//
// Wires the "cli" element: its text is sent to the run loop for evaluation
// and the element is then cleared.
func (g *guistate) prepareConsole() {
	g.OnEvent("cli", func() {
		cmd := g.StringValue("cli")
		Inject <- func() { g.EvalGUI(cmd) }
		g.Set("cli", "")
	})
}

// see prepareServer
//
// Wires the mesh text boxes. Editing a box only updates the corresponding
// lazy_* value and shows a warning; the mesh itself is changed when the
// "setmesh" event fires.
func (g *guistate) prepareMesh() {
	//g.Disable("setmesh", true) // button only enabled if pressing makes sense
	const MESHWARN = "⚠ Click to update mesh (may take some time)"
	warnmesh := func() {
		//g.Disable("setmesh", false)
		g.Set("setmeshwarn", MESHWARN)
	}

	g.OnEvent("nx", func() { Inject <- func() { lazy_gridsize[X] = g.IntValue("nx"); warnmesh() } })
	g.OnEvent("ny", func() { Inject <- func() { lazy_gridsize[Y] = g.IntValue("ny"); warnmesh() } })
	g.OnEvent("nz", func() { Inject <- func() { lazy_gridsize[Z] = g.IntValue("nz"); warnmesh() } })
	g.OnEvent("cx", func() { Inject <- func() { lazy_cellsize[X] = g.FloatValue("cx"); warnmesh() } })
	g.OnEvent("cy", func() { Inject <- func() { lazy_cellsize[Y] = g.FloatValue("cy"); warnmesh() } })
	g.OnEvent("cz", func() { Inject <- func() { lazy_cellsize[Z] = g.FloatValue("cz"); warnmesh() } })
	g.OnEvent("px", func() { Inject <- func() { lazy_pbc[X] = g.IntValue("px"); warnmesh() } })
	g.OnEvent("py", func() { Inject <- func() { lazy_pbc[Y] = g.IntValue("py"); warnmesh() } })
	g.OnEvent("pz", func() { Inject <- func() { lazy_pbc[Z] = g.IntValue("pz"); warnmesh() } })

	g.OnEvent("setmesh", func() {
		//g.Disable("setmesh", true)
		Inject <- (func() {
			g.EvalGUI(fmt.Sprintf("SetMesh(%v, %v, %v, %v, %v, %v, %v, %v, %v)",
				g.Value("nx"),
				g.Value("ny"),
				g.Value("nz"),
				g.Value("cx"),
				g.Value("cy"),
				g.Value("cz"),
				g.Value("px"),
				g.Value("py"),
				g.Value("pz")))
			// update lazy_* sizes to be up-to date with proper mesh
			n := Mesh().Size()
			c := Mesh().CellSize()
			p := Mesh().PBC()
			lazy_gridsize = []int{n[X], n[Y], n[Z]}
			lazy_cellsize = []float64{c[X], c[Y], c[Z]}
			lazy_pbc = []int{p[X], p[Y], p[Z]}
		})
		// Executed by the event handler once the send above has completed,
		// not from inside the injected function.
		g.Set("setmeshwarn", "mesh up to date")
	})
}

// IntValue evaluates the text of GUI element id with Eval1Line and parses
// the printed result as an int. The parse error is discarded, so callers
// get whatever strconv.Atoi returns on failure.
func (g *guistate) IntValue(id string) int {
	s := g.StringValue(id)
	r := fmt.Sprint(Eval1Line(s))
	i, _ := strconv.Atoi(r)
	return i
}

// FloatValue evaluates the text of GUI element id with Eval1Line and parses
// the printed result as a float64. The parse error is discarded.
func (g *guistate) FloatValue(id string) float64 {
	s := g.StringValue(id)
	r := fmt.Sprint(Eval1Line(s))
	f, _ := strconv.ParseFloat(r, 64)
	return f
}

// see prepareServer
//
// Wires the geometry selector: choosing a shape pre-fills an argument list
// and shows its documentation; "setgeom" evaluates SetGeom(<shape><args>).
func (g *guistate) prepareGeom() {
	g.OnEvent("geomselect", func() {
		ident := g.StringValue("geomselect")
		t := World.Resolve(ident).Type()
		// set sensible args: world size
		// (first three arguments get the world size, any further ones 0)
		args := "("
		for i := 0; i < t.NumIn(); i++ {
			val := 0.0
			if i < 3 {
				val = Mesh().WorldSize()[i]
			}
			if i > 0 {
				args += ", "
			}
			args += fmt.Sprint(val)
		}
		args += ")"
		// overwrite args for special cases
		switch {
		case ident == "Cell":
			args = "(0, 0, 0)"
		case ident == "XRange" || ident == "YRange" || ident == "ZRange":
			args = "(0, inf)"
		case ident == "Layers":
			args = "(0, 1)"
		case ident == "ImageShape":
			args = `("filename.png")`
		}
		g.Set("geomargs", args)
		g.Set("geomdoc", g.Doc(ident))
	})
	g.OnEvent("setgeom", func() {
		Inject <- (func() {
			g.EvalGUI(fmt.Sprint("SetGeom(", g.StringValue("geomselect"), g.StringValue("geomargs"), ")"))
		})
	})
}

// see prepareServer
//
// Wires the initial-magnetization selector: choosing a config pre-fills an
// argument list of 1's and shows its documentation; "setm" evaluates
// "m = <config><args>".
func (g *guistate) prepareM() {
	g.OnEvent("mselect", func() {
		ident := g.StringValue("mselect")
		t := World.Resolve(ident).Type()
		args := "("
		for i := 0; i < t.NumIn(); i++ {
			if i > 0 {
				args += ", "
			}
			args += "1"
		}
		args += ")"
		// overwrite args for special cases
		switch ident {
		case "VortexWall":
			args = "(1, -1, 1, 1)"
		}
		g.Set("margs", args)
		g.Set("mdoc", g.Doc(ident))
	})
	g.OnEvent("setm", func() {
		Inject <- (func() {
			g.EvalGUI(fmt.Sprint("m = ", g.StringValue("mselect"), g.StringValue("margs")))
		})
	})
}

// Two-way mapping between the solver names shown in the GUI and the integer
// codes passed to SetSolver. The two maps must be kept in sync by hand.
var (
	solvertypes = map[string]int{"bw_euler": -1, "euler": 1, "heun": 2, "rk23": 3, "rk4": 4, "rk45": 5, "rkf56": 6}
	solvernames = map[int]string{-1: "bw_euler", 1: "euler", 2: "heun", 3: "rk23", 4: "rk4", 5: "rk45", 6: "rkf56"}
)

// Break injects a function that sets the pause flag.
func Break() {
	Inject <- func() { pause = true }
}

// see prepareServer
//
// Wires the solver controls. Run, Steps and Relax first call Break and then
// inject the new command.
func (g *guistate) prepareSolver() {
	g.OnEvent("run", func() { Break(); Inject <- func() { g.EvalGUI(sprint("Run(", g.StringValue("runtime"), ")")) } })
	g.OnEvent("steps", func() { Break(); Inject <- func() { g.EvalGUI(sprint("Steps(", g.StringValue("runsteps"), ")")) } })
	g.OnEvent("break", Break)
	g.OnEvent("relax", func() { Break(); Inject <- func() { g.EvalGUI("relax()") } })
	g.OnEvent("mindt", func() { Inject <- func() { g.EvalGUI("MinDt=" + g.StringValue("mindt")) } })
	g.OnEvent("maxdt", func() { Inject <- func() { g.EvalGUI("MaxDt=" + g.StringValue("maxdt")) } })
	g.OnEvent("fixdt", func() { Inject <- func() { g.EvalGUI("FixDt=" + g.StringValue("fixdt")) } })
	g.OnEvent("maxerr", func() { Inject <- func() { g.EvalGUI("MaxErr=" + g.StringValue("maxerr")) } })
	g.OnEvent("solvertype", func() {
		Inject <- func() {
			// An unknown name yields the map's zero value, 0.
			typ := solvertypes[g.StringValue("solvertype")]

			// euler must have fixed time step
			if typ == EULER && FixDt == 0 {
				g.EvalGUI("FixDt = 1e-15")
			}
			if typ == BACKWARD_EULER && FixDt == 0 {
				g.EvalGUI("FixDt = 1e-13")
			}

			g.EvalGUI(fmt.Sprint("SetSolver(", typ, ")"))
		}
	})
}

// see prepareServer
//
// Registers one handler per parameter. The handler builds either
// "<name> = <value>" or "<name>.SetRegion(<r>, <value>)" from the GUI state
// and injects it for evaluation.
func (g *guistate) prepareParam() {
	for _, p := range g.Params {
		p := p // per-iteration copy captured by the closure below
		n := p.Name()
		g.OnEvent(n, func() {
			cmd := p.Name()
			// NOTE(review): r is the raw g.Value result and is compared with
			// the constant -1, whereas prepareOnUpdate reads the same element
			// through g.IntValue. Confirm the dynamic type g.Value returns
			// for "region" makes this comparison behave as intended.
			r := g.Value("region")
			if r == -1 {
				cmd += " = "
			} else {
				cmd += fmt.Sprint(".SetRegion(", r, ", ")
			}
			if p.NComp() == 3 {
				cmd += "vector " // space needed
			}
			cmd += g.StringValue(p.Name())
			if r != -1 {
				cmd += ")"
			}
			Inject <- func() {
				g.EvalGUI(cmd)
			}
		})
	}
	// overwrite handler for temperature
	// do not crash when we enter bogus values (see temperature.go)
	g.OnEvent("Temp", func() {
		Inject <- func() {
			if FixDt == 0 {
				g.EvalGUI("FixDt = 10e-14") // finite temperature requires fixed time step
			}
			g.EvalGUI("Temp = " + g.StringValue("Temp"))
		}
	})
}

// see prepareServer
//
// Wires the table autosave box and the render controls. The render handlers
// update g.render while holding g.render.mutex.
func (g *guistate) prepareDisplay() {
	// plot
	g.OnEvent("tableAutoSave", func() {
		Inject <- func() {
			g.EvalGUI("TableAutosave(" + g.StringValue("tableAutoSave") + ")")
		}
	})

	// render
	g.OnEvent("renderQuant", func() {
		g.render.mutex.Lock()
		defer g.render.mutex.Unlock()
		name := g.StringValue("renderQuant")
		q := g.Quants[name]
		if q == nil {
			// keep the previously rendered quantity
			LogErr("display: unknown quantity:", name)
			return
		}
		g.render.quant = q
		g.Set("renderDoc", g.Doc(g.StringValue("renderQuant")))
	})
	g.OnEvent("renderComp", func() {
		g.render.mutex.Lock()
		defer g.render.mutex.Unlock()
		g.render.comp = g.StringValue("renderComp")
		// TODO: set to "" if q.Ncomp < 3
	})
	g.OnEvent("renderLayer", func() {
		g.render.mutex.Lock()
		defer g.render.mutex.Unlock()
		g.render.layer = g.IntValue("renderLayer")
		g.Set("renderLayerLabel", fmt.Sprint(g.render.layer, "/", Mesh().Size()[Z]))
	})
	g.OnEvent("renderScale", func() {
		g.render.mutex.Lock()
		defer g.render.mutex.Unlock()
		// The slider is inverted: a larger slider value gives a smaller scale divisor.
		g.render.scale = maxScale - g.IntValue("renderScale")
		g.Set("renderScaleLabel", fmt.Sprint("1/", g.render.scale))
	})
}

// see prepareServer
//
// Registers the page-update callback. Each call records a keep-alive and,
// unless the engine reports busy, injects a function that copies the current
// mesh, solver, display, parameter and GPU state into the page elements.
func (g *guistate) prepareOnUpdate() {
	g.OnUpdate(func() {
		g.UpdateKeepAlive() // keep track of when browser was last seen alive

		if GetBusy() { // busy, e.g., calculating kernel, run loop will not accept commands.
			return
		}

		Inject <- (func() { // sends to run loop to be executed in between time steps
			g.Set("console", hist)

			// mesh
			g.Set("nx", lazy_gridsize[X])
			g.Set("ny", lazy_gridsize[Y])
			g.Set("nz", lazy_gridsize[Z])
			g.Set("cx", lazy_cellsize[X])
			g.Set("cy", lazy_cellsize[Y])
			g.Set("cz", lazy_cellsize[Z])
			g.Set("px", lazy_pbc[X])
			g.Set("py", lazy_pbc[Y])
			g.Set("pz", lazy_pbc[Z])
			// world size = cell size * grid size, scaled by 1e9
			g.Set("wx", printf(lazy_cellsize[X]*float64(lazy_gridsize[X])*1e9))
			g.Set("wy", printf(lazy_cellsize[Y]*float64(lazy_gridsize[Y])*1e9))
			g.Set("wz", printf(lazy_cellsize[Z]*float64(lazy_gridsize[Z])*1e9))

			// solver
			g.Set("nsteps", NSteps)
			g.Set("time", fmt.Sprintf("%1.5e", Time))
			g.Set("dt", fmt.Sprintf("%1.3e", Dt_si))
			g.Set("lasterr", fmt.Sprintf("%1.3e", LastErr))
			g.Set("maxerr", MaxErr)
			g.Set("mindt", MinDt)
			g.Set("maxdt", MaxDt)
			g.Set("fixdt", FixDt)
			g.Set("solvertype", fmt.Sprint(solvernames[solvertype]))
			if pause {
				g.Set("busy", "Paused")
			} else {
				g.Set("busy", "Running")
				// Don't re-evaluate all the time if not running
				g.Set("maxtorque", fmt.Sprintf("%1.3e T", LastTorque))
			}

			// display
			g.Set("tableAutoSave", Table.autosave.period)
			quant := g.StringValue("renderQuant")
			comp := g.StringValue("renderComp")
			// Query string built from the step count and the event counter,
			// appended to the image URLs below.
			cachebreaker := "?" + g.StringValue("nsteps") + "_" + fmt.Sprint(g.cacheBreaker())
			g.Attr("renderLayer", "max", Mesh().Size()[Z]-1)
			g.Set("display", "/render/"+quant+"/"+comp+cachebreaker)

			// plot
			gui_.Set("plot", "/plot/"+cachebreaker)

			// parameters
			for _, p := range g.Params {
				n := p.Name()
				r := g.IntValue("region")
				if r == -1 && !p.IsUniform() {
					// region -1 with a non-uniform parameter: show an empty box
					g.Set(n, "")
				} else {
					if r == -1 {
						r = 0 // uniform, so pick one
					}
					v := p.getRegion(r)
					if p.NComp() == 1 {
						g.Set(n, float32(v[0]))
					} else {
						g.Set(n, fmt.Sprintf("(%v, %v, %v)", float32(v[X]), float32(v[Y]), float32(v[Z])))
					}
				}
			}

			// gpu
			memfree, _ := cu.MemGetInfo()
			memfree /= (1024 * 1024)
			g.Set("memfree", memfree)
		})
	})
}

// Returns documentation string for quantity name. E.g.:
// 	"m" -> "Reduced magnetization"
// A missing entry is logged and the empty string is returned.
func (g *guistate) Doc(quant string) string {
	doc, ok := World.Doc[quant]
	if !ok {
		LogErr("no doc for", quant)
	}
	return doc
}

// Returns unit for quantity name. E.g.:
// 	"Msat" -> "A/m"
// Names that are not registered parameters yield "".
func (g *guistate) UnitOf(quant string) string {
	p := g.Params[quant]
	if p != nil {
		return p.Unit()
	} else {
		return ""
	}
}

// renders page title for PrepareServer
func (g *guistate) Title() string { return util.NoExt(path.Base(OD())) }

// Version returns UNAME.
func (g *guistate) Version() string { return UNAME }

// GPUInfo returns cuda.GPUInfo.
func (g *guistate) GPUInfo() string { return cuda.GPUInfo }

// incCacheBreaker increments the event counter under g.mutex.
func (g *guistate) incCacheBreaker() {
	g.mutex.Lock()
	defer g.mutex.Unlock()
	g._eventCacheBreaker++
}

// cacheBreaker returns the current event counter under g.mutex.
func (g *guistate) cacheBreaker() int {
	g.mutex.Lock()
	defer g.mutex.Unlock()
	return g._eventCacheBreaker
}

// QuantNames returns the names of all registered quantities, ordered by sortNoCase.
func (g *guistate) QuantNames() []string {
	names := make([]string, 0, len(g.Quants))
	for k, _ := range g.Quants {
		names = append(names, k)
	}
	sortNoCase(names)
	return names
}

// List all available shapes
func (g *guistate) Shapes() []string { return g.apifilter("Shape") }

// List all available configs
func (g *guistate) Configs() []string { return g.apifilter("Config") }

// List all api functions that return outputtype (Shape, Config, ...)
// A function matches when it has exactly one result whose type name equals
// outputtype. The result is ordered by sortNoCase.
func (g *guistate) apifilter(outputtype string) []string {
	var match []string
	for k, _ := range World.Doc {
		v := World.Resolve(k)
		t := v.Type()
		if t.Kind() == reflect.Func && t.NumOut() == 1 && t.Out(0).Name() == outputtype {
			match = append(match, k)
		}
	}
	sortNoCase(match)
	return match
}

// Parameters returns the Name() of every registered parameter, ordered by sortNoCase.
func (g *guistate) Parameters() []string {
	var params []string
	for _, v := range g.Params {
		params = append(params, v.Name())
	}
	sortNoCase(params)
	return params
}

// renders a
that toggles visibility on click for PrepareServer func (g *guistate) Div(heading string) string { id := fmt.Sprint("div_", rand.Int()) return fmt.Sprintf(`▾ %v
`, id, heading, id) } func GoServe(addr string) string { gui_.PrepareServer() // find a free port starting from the usual number l, err := net.Listen("tcp", addr) for err != nil { h, p, _ := net.SplitHostPort(addr) addr = fmt.Sprint(h, ":", atoi(p)+1) l, err = net.Listen("tcp", addr) } go func() { LogErr(http.Serve(l, nil)) }() httpfs.Put(OD()+"gui", []byte(l.Addr().String())) return addr } func atoi(a string) int { i, err := strconv.Atoi(a) util.PanicErr(err) return i } // Prog advances the GUI progress bar to fraction a/total and displays message. func (g *guistate) Prog(a, total int, msg string) { g.Set("progress", (a*100)/total) g.Set("busy", msg) util.PrintProgress(a, total, msg) } // Eval code + update keepalive in case the code runs long func (g *guistate) EvalGUI(code string) { defer func() { if err := recover(); err != nil { if userErr, ok := err.(UserErr); ok { LogErr(userErr) } else { panic(err) } } }() Eval(code) g.UpdateKeepAlive() } // //// round duration to 1s accuracy //func roundt(t time.Duration) time.Duration { // return t - t%1e9 //} // mumax3-3.10/engine/heun.go000066400000000000000000000020561371432437400154150ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/util" "math" ) // Adaptive Heun solver. 
type Heun struct{} // Adaptive Heun method, can be used as solver.Step func (_ *Heun) Step() { y := M.Buffer() dy0 := cuda.Buffer(VECTOR, y.Size()) defer cuda.Recycle(dy0) if FixDt != 0 { Dt_si = FixDt } dt := float32(Dt_si * GammaLL) util.Assert(dt > 0) // stage 1 torqueFn(dy0) cuda.Madd2(y, y, dy0, 1, dt) // y = y + dt * dy // stage 2 dy := cuda.Buffer(3, y.Size()) defer cuda.Recycle(dy) Time += Dt_si torqueFn(dy) err := cuda.MaxVecDiff(dy0, dy) * float64(dt) // adjust next time step if err < MaxErr || Dt_si <= MinDt || FixDt != 0 { // mindt check to avoid infinite loop // step OK cuda.Madd3(y, y, dy, dy0, 1, 0.5*dt, -0.5*dt) M.normalize() NSteps++ adaptDt(math.Pow(MaxErr/err, 1./2.)) setLastErr(err) setMaxTorque(dy) } else { // undo bad step util.Assert(FixDt == 0) Time -= Dt_si cuda.Madd2(y, y, dy0, 1, -dt) NUndone++ adaptDt(math.Pow(MaxErr/err, 1./3.)) } } func (_ *Heun) Free() {} mumax3-3.10/engine/html.go000066400000000000000000000243761371432437400154330ustar00rootroot00000000000000package engine const templText = ` mumax3 ` + CSS + ` {{.JS}} {{.Span "title" "mumax3"}}   {{.Progress "progress" 100 0}} {{.Span "busy" "" }}   {{.ErrorBox}}

{{.Data.Div "console"}} {{.Console "console" 16 84 "" "onfocus=\"console_focus=true\"" "onblur=\"console_focus=false\"" "onmouseover=\"console_focus=true\"" "onmouseout=\"console_focus=false\"" "readonly" "style=\"font-family:monospace; font-size:0.8em;\"" }}
{{.CliBox "cli" "" "onkeydown=\"clikeydown(event);\"" "placeholder=\"type commands here, or up/down\"" "size=86" "style=\"font-family:monospace; font-size:0.8em;\"" }}
{{.Data.Div "mesh"}}
gridsize: {{.TextBox "nx" "" "size=8"}} × {{.TextBox "ny" "" "size=8"}} × {{.TextBox "nz" "" "size=8"}} cells
cellsize: {{.TextBox "cx" "" "size=8"}} × {{.TextBox "cy" "" "size=8"}} × {{.TextBox "cz" "" "size=8"}} m3
PBC: {{.TextBox "px" "" "size=8"}} × {{.TextBox "py" "" "size=8"}} × {{.TextBox "pz" "" "size=8"}} repetitions
worldsize: {{.Span "wx" ""}} × {{.Span "wy" ""}} × {{.Span "wz" ""}} nm3
{{.Button "setmesh" "update"}} {{.Span "setmeshwarn" ""}}
{{.Data.Div "geometry"}} SetGeom( {{.Data.Shapes | .SelectArray "geomselect" "Universe"}} {{.TextBox "geomargs" "()" }} ) {{.Button "setgeom" "Set"}}
{{.Span "geomdoc" "" "style=\"color:gray\""}} {{.Data.Div "initial m"}} m = {{.Data.Configs | .SelectArray "mselect" "Uniform"}} {{.TextBox "margs" "(1, 0, 0)" }} {{.Button "setm" "Set"}}
{{.Span "mdoc" "" "style=\"color:gray\""}} {{.Data.Div "solver"}} Type: {{.Select "solvertype" "rk45" "bw_euler" "euler" "heun" "rk4" "rk23" "rk45" "rkf56"}}
{{.Button "run" "Run" }} {{.TextBox "runtime" 1e-9 "size=8"}}s
{{.Button "steps" "Steps"}} {{.TextBox "runsteps" "1000" "size=8"}}
{{.Button "relax" "Relax"}}
{{.Button "break" "Break"}}
       
step: {{.Span "nsteps" "0"}}
time: {{.Span "time" "0"}} s
dt: {{.Span "dt" "0"}} s
err/step: {{.Span "lasterr" "0"}}
MaxTorque:{{.Span "maxtorque" "--"}}
       
fixdt: {{.TextBox "fixdt" "0" "size=8"}} s
mindt: {{.TextBox "mindt" "0" "size=8"}} s
maxdt: {{.TextBox "maxdt" "0" "size=8"}} s
maxerr: {{.TextBox "maxerr" "0" "size=8"}}/step
{{.Data.Div "display"}}

Quantity: {{.Data.QuantNames | .SelectArray "renderQuant" "m"}} {{.Select "renderComp" "" "" "x" "y" "z"}} {{.Span "renderDoc" "" "style=\"color:gray\""}}
Slice: {{.Range "renderLayer" 0 0 0 }} {{.Span "renderLayerLabel" "0"}}
Scale: {{.Range "renderScale" 0 31 31}} {{.Span "renderScaleLabel" "1/1"}}

{{.Img "display" "/render/m" "alt=\"display\""}}

{{.Data.Div "gnuplot"}}

TableAutosave: {{.TextBox "tableAutoSave" "0" }} s

Plot of "table.txt", provided table is being autosaved and gnuplot installed.
plot "table.txt" using {{.TextBox "usingx" "1"}} : {{.TextBox "usingy" "2"}} with lines

{{.Span "plotErr" ""}}

{{.Img "plot" "/plot/"}} {{.Data.Div "parameters"}} Region: {{.Number "region" -1 255 -1}}
{{range .Data.Parameters}} {{end}}
{{.}} {{$.TextBox . ""}} {{$.Data.UnitOf . }}

{{.Data.Version}}
{{.Data.GPUInfo}} ({{.Span "memfree" ""}} MB free)
© 2013 Arne Vansteenkiste, DyNaMat LAB, UGent.
` const CSS = ` ` mumax3-3.10/engine/log.go000066400000000000000000000031601371432437400152340ustar00rootroot00000000000000package engine import ( "fmt" "github.com/mumax/3/httpfs" "github.com/mumax/3/util" "io" "os" ) var ( hist string // console history for GUI logfile io.WriteCloser // saves history of input commands + output ) // Special error that is not fatal when paniced on and called from GUI // E.g.: try to set bad grid size: panic on UserErr, recover, print error, carry on. type UserErr string func (e UserErr) Error() string { return string(e) } func CheckRecoverable(err error) { if err != nil { panic(UserErr(err.Error())) } } func LogIn(msg ...interface{}) { str := sprint(msg...) log2GUI(str) log2File(str) fmt.Println(str) } func LogOut(msg ...interface{}) { str := "//" + sprint(msg...) log2GUI(str) log2File(str) fmt.Println(str) } func LogErr(msg ...interface{}) { str := "//" + sprint(msg...) log2GUI(str) log2File(str) fprintln(os.Stderr, str) } func log2File(msg string) { if logfile != nil { fprintln(logfile, msg) } } func initLog() { if logfile != nil { panic("log already inited") } // open log file and flush what was logged before the file existed var err error logfile, err = httpfs.Create(OD() + "log.txt") if err != nil { panic(err) } util.FatalErr(err) logfile.Write(([]byte)(hist)) logfile.Write([]byte{'\n'}) } func log2GUI(msg string) { if len(msg) > 1000 { msg = msg[:1000-len("...")] + "..." } if hist != "" { // prepend newline hist += "\n" } hist += msg // TODO: push to web ? } // like fmt.Sprint but with spaces between args func sprint(msg ...interface{}) string { str := fmt.Sprintln(msg...) 
str = str[:len(str)-1] // strip newline return str } mumax3-3.10/engine/lutdata.go000066400000000000000000000043771371432437400161240ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/cuda/cu" "github.com/mumax/3/data" "github.com/mumax/3/util" "unsafe" ) // look-up table for region based parameters type lut struct { gpu_buf cuda.LUTPtrs // gpu copy of cpu buffer, only transferred when needed gpu_ok bool // gpu cache up-to date with cpu source? cpu_buf [][NREGION]float32 // table data on cpu source updater // updates cpu data } type updater interface { update() // updates cpu lookup table } func (p *lut) init(nComp int, source updater) { p.gpu_buf = make(cuda.LUTPtrs, nComp) p.cpu_buf = make([][NREGION]float32, nComp) p.source = source } // get an up-to-date version of the lookup-table on CPU func (p *lut) cpuLUT() [][NREGION]float32 { p.source.update() return p.cpu_buf } // get an up-to-date version of the lookup-table on GPU func (p *lut) gpuLUT() cuda.LUTPtrs { p.source.update() if !p.gpu_ok { // upload to GPU p.assureAlloc() cuda.Sync() // sync previous kernels, may still be using gpu lut for c := range p.gpu_buf { cuda.MemCpyHtoD(p.gpu_buf[c], unsafe.Pointer(&p.cpu_buf[c][0]), cu.SIZEOF_FLOAT32*NREGION) } p.gpu_ok = true cuda.Sync() //sync upload } return p.gpu_buf } // utility for LUT of single-component data func (p *lut) gpuLUT1() cuda.LUTPtr { util.Assert(len(p.gpu_buf) == 1) return cuda.LUTPtr(p.gpuLUT()[0]) } // all data is 0? func (p *lut) isZero() bool { v := p.cpuLUT() for c := range v { for i := 0; i < NREGION; i++ { if v[c][i] != 0 { return false } } } return true } func (p *lut) nonZero() bool { return !p.isZero() } func (p *lut) assureAlloc() { if p.gpu_buf[0] == nil { for i := range p.gpu_buf { p.gpu_buf[i] = cuda.MemAlloc(NREGION * cu.SIZEOF_FLOAT32) } } } func (b *lut) NComp() int { return len(b.cpu_buf) } // uncompress the table to a full array with parameter values per cell. 
func (p *lut) Slice() (*data.Slice, bool) { b := cuda.Buffer(p.NComp(), Mesh().Size()) p.EvalTo(b) return b, true } // uncompress the table to a full array in the dst Slice with parameter values per cell. func (p *lut) EvalTo(dst *data.Slice) { gpu := p.gpuLUT() for c := 0; c < p.NComp(); c++ { cuda.RegionDecode(dst.Comp(c), cuda.LUTPtr(gpu[c]), regions.Gpu()) } } mumax3-3.10/engine/magnetization.go000066400000000000000000000104471371432437400173320ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/util" "reflect" ) var M magnetization // reduced magnetization (unit length) func init() { DeclLValue("m", &M, `Reduced magnetization (unit length)`) } // Special buffered quantity to store magnetization // makes sure it's normalized etc. type magnetization struct { buffer_ *data.Slice } func (m *magnetization) Mesh() *data.Mesh { return Mesh() } func (m *magnetization) NComp() int { return 3 } func (m *magnetization) Name() string { return "m" } func (m *magnetization) Unit() string { return "" } func (m *magnetization) Buffer() *data.Slice { return m.buffer_ } // todo: rename Gpu()? 
func (m *magnetization) Comp(c int) ScalarField { return Comp(m, c) } func (m *magnetization) SetValue(v interface{}) { m.SetInShape(nil, v.(Config)) } func (m *magnetization) InputType() reflect.Type { return reflect.TypeOf(Config(nil)) } func (m *magnetization) Type() reflect.Type { return reflect.TypeOf(new(magnetization)) } func (m *magnetization) Eval() interface{} { return m } func (m *magnetization) average() []float64 { return sAverageMagnet(M.Buffer()) } func (m *magnetization) Average() data.Vector { return unslice(m.average()) } func (m *magnetization) normalize() { cuda.Normalize(m.Buffer(), geometry.Gpu()) } // allocate storage (not done by init, as mesh size may not yet be known then) func (m *magnetization) alloc() { m.buffer_ = cuda.NewSlice(3, m.Mesh().Size()) m.Set(RandomMag()) // sane starting config } func (b *magnetization) SetArray(src *data.Slice) { if src.Size() != b.Mesh().Size() { src = data.Resample(src, b.Mesh().Size()) } data.Copy(b.Buffer(), src) b.normalize() } func (m *magnetization) Set(c Config) { checkMesh() m.SetInShape(nil, c) } func (m *magnetization) LoadFile(fname string) { m.SetArray(LoadFile(fname)) } func (m *magnetization) Slice() (s *data.Slice, recycle bool) { return m.Buffer(), false } func (m *magnetization) EvalTo(dst *data.Slice) { data.Copy(dst, m.buffer_) } func (m *magnetization) Region(r int) *vOneReg { return vOneRegion(m, r) } func (m *magnetization) String() string { return util.Sprint(m.Buffer().HostCopy()) } // Set the value of one cell. func (m *magnetization) SetCell(ix, iy, iz int, v data.Vector) { r := Index2Coord(ix, iy, iz) if geometry.shape != nil && !geometry.shape(r[X], r[Y], r[Z]) { return } vNorm := v.Len() for c := 0; c < 3; c++ { cuda.SetCell(m.Buffer(), c, ix, iy, iz, float32(v[c]/vNorm)) } } // Get the value of one cell. 
func (m *magnetization) GetCell(ix, iy, iz int) data.Vector { mx := float64(cuda.GetCell(m.Buffer(), X, ix, iy, iz)) my := float64(cuda.GetCell(m.Buffer(), Y, ix, iy, iz)) mz := float64(cuda.GetCell(m.Buffer(), Z, ix, iy, iz)) return Vector(mx, my, mz) } func (m *magnetization) Quantity() []float64 { return slice(m.Average()) } // Sets the magnetization inside the shape func (m *magnetization) SetInShape(region Shape, conf Config) { checkMesh() if region == nil { region = universe } host := m.Buffer().HostCopy() h := host.Vectors() n := m.Mesh().Size() for iz := 0; iz < n[Z]; iz++ { for iy := 0; iy < n[Y]; iy++ { for ix := 0; ix < n[X]; ix++ { r := Index2Coord(ix, iy, iz) x, y, z := r[X], r[Y], r[Z] if region(x, y, z) { // inside m := conf(x, y, z) h[X][iz][iy][ix] = float32(m[X]) h[Y][iz][iy][ix] = float32(m[Y]) h[Z][iz][iy][ix] = float32(m[Z]) } } } } m.SetArray(host) } // set m to config in region func (m *magnetization) SetRegion(region int, conf Config) { host := m.Buffer().HostCopy() h := host.Vectors() n := m.Mesh().Size() r := byte(region) regionsArr := regions.HostArray() for iz := 0; iz < n[Z]; iz++ { for iy := 0; iy < n[Y]; iy++ { for ix := 0; ix < n[X]; ix++ { pos := Index2Coord(ix, iy, iz) x, y, z := pos[X], pos[Y], pos[Z] if regionsArr[iz][iy][ix] == r { m := conf(x, y, z) h[X][iz][iy][ix] = float32(m[X]) h[Y][iz][iy][ix] = float32(m[Y]) h[Z][iz][iy][ix] = float32(m[Z]) } } } } m.SetArray(host) } func (m *magnetization) resize() { backup := m.Buffer().HostCopy() s2 := Mesh().Size() resized := data.Resample(backup, s2) m.buffer_.Free() m.buffer_ = cuda.NewSlice(VECTOR, s2) data.Copy(m.buffer_, resized) } mumax3-3.10/engine/maxangle.go000066400000000000000000000010751371432437400162520ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" ) var ( MaxAngle = NewScalarValue("MaxAngle", "rad", "maximum angle between neighboring spins", GetMaxAngle) SpinAngle = NewScalarField("spinAngle", "rad", "Angle 
between neighboring spins", SetSpinAngle) ) func SetSpinAngle(dst *data.Slice) { cuda.SetMaxAngle(dst, M.Buffer(), lex2.Gpu(), regions.Gpu(), M.Mesh()) } func GetMaxAngle() float64 { s := ValueOf(SpinAngle) defer cuda.Recycle(s) return float64(cuda.MaxAbs(s)) // just a max would be fine, but not currently implemented } mumax3-3.10/engine/mesh.go000066400000000000000000000064311371432437400154130ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" ) var globalmesh_ data.Mesh // mesh for m and everything that has the same size func init() { DeclFunc("SetGridSize", SetGridSize, `Sets the number of cells for X,Y,Z`) DeclFunc("SetCellSize", SetCellSize, `Sets the X,Y,Z cell size in meters`) DeclFunc("SetMesh", SetMesh, `Sets GridSize, CellSize and PBC at the same time`) DeclFunc("SetPBC", SetPBC, "Sets the number of repetitions in X,Y,Z to create periodic boundary "+ "conditions. The number of repetitions determines the cutoff range for the demagnetization.") } func Mesh() *data.Mesh { checkMesh() return &globalmesh_ } func arg(msg string, test bool) { if !test { panic(UserErr(msg + ": illegal arugment")) } } // Set the simulation mesh to Nx x Ny x Nz cells of given size. // Can be set only once at the beginning of the simulation. // TODO: dedup arguments from globals func SetMesh(Nx, Ny, Nz int, cellSizeX, cellSizeY, cellSizeZ float64, pbcx, pbcy, pbcz int) { SetBusy(true) defer SetBusy(false) arg("GridSize", Nx > 0 && Ny > 0 && Nz > 0) arg("CellSize", cellSizeX > 0 && cellSizeY > 0 && cellSizeZ > 0) arg("PBC", pbcx >= 0 && pbcy >= 0 && pbcz >= 0) prevSize := globalmesh_.Size() pbc := []int{pbcx, pbcy, pbcz} if globalmesh_.Size() == [3]int{0, 0, 0} { // first time mesh is set globalmesh_ = *data.NewMesh(Nx, Ny, Nz, cellSizeX, cellSizeY, cellSizeZ, pbc...) 
M.alloc() regions.alloc() } else { // here be dragons LogOut("resizing...") // free everything conv_.Free() conv_ = nil mfmconv_.Free() mfmconv_ = nil cuda.FreeBuffers() // resize everything globalmesh_ = *data.NewMesh(Nx, Ny, Nz, cellSizeX, cellSizeY, cellSizeZ, pbc...) M.resize() regions.resize() geometry.buffer.Free() geometry.buffer = data.NilSlice(1, Mesh().Size()) geometry.setGeom(geometry.shape) // remove excitation extra terms if they don't fit anymore // up to the user to add them again if Mesh().Size() != prevSize { B_ext.RemoveExtraTerms() J.RemoveExtraTerms() } if Mesh().Size() != prevSize { B_therm.noise.Free() B_therm.noise = nil } } lazy_gridsize = []int{Nx, Ny, Nz} lazy_cellsize = []float64{cellSizeX, cellSizeY, cellSizeZ} lazy_pbc = []int{pbcx, pbcy, pbcz} } func printf(f float64) float32 { return float32(f) } // for lazy setmesh: set gridsize and cellsize in separate calls var ( lazy_gridsize []int lazy_cellsize []float64 lazy_pbc = []int{0, 0, 0} ) func SetGridSize(Nx, Ny, Nz int) { lazy_gridsize = []int{Nx, Ny, Nz} if lazy_cellsize != nil { SetMesh(Nx, Ny, Nz, lazy_cellsize[X], lazy_cellsize[Y], lazy_cellsize[Z], lazy_pbc[X], lazy_pbc[Y], lazy_pbc[Z]) } } func SetCellSize(cx, cy, cz float64) { lazy_cellsize = []float64{cx, cy, cz} if lazy_gridsize != nil { SetMesh(lazy_gridsize[X], lazy_gridsize[Y], lazy_gridsize[Z], cx, cy, cz, lazy_pbc[X], lazy_pbc[Y], lazy_pbc[Z]) } } func SetPBC(nx, ny, nz int) { lazy_pbc = []int{nx, ny, nz} if lazy_gridsize != nil && lazy_cellsize != nil { SetMesh(lazy_gridsize[X], lazy_gridsize[Y], lazy_gridsize[Z], lazy_cellsize[X], lazy_cellsize[Y], lazy_cellsize[Z], lazy_pbc[X], lazy_pbc[Y], lazy_pbc[Z]) } } // check if mesh is set func checkMesh() { if globalmesh_.Size() == [3]int{0, 0, 0} { panic("need to set mesh first") } } mumax3-3.10/engine/mfm.go000066400000000000000000000020311371432437400152260ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" ) var ( MFM = 
NewScalarField("MFM", "arb.", "MFM image", SetMFM) MFMLift inputValue MFMTipSize inputValue mfmconv_ *cuda.MFMConvolution ) func init() { MFMLift = numParam(50e-9, "MFMLift", "m", reinitmfmconv) MFMTipSize = numParam(1e-3, "MFMDipole", "m", reinitmfmconv) DeclLValue("MFMLift", &MFMLift, "MFM lift height") DeclLValue("MFMDipole", &MFMTipSize, "Height of vertically magnetized part of MFM tip") } func SetMFM(dst *data.Slice) { buf := cuda.Buffer(3, Mesh().Size()) defer cuda.Recycle(buf) if mfmconv_ == nil { reinitmfmconv() } msat := Msat.MSlice() defer msat.Recycle() mfmconv_.Exec(buf, M.Buffer(), geometry.Gpu(), msat) cuda.Madd3(dst, buf.Comp(0), buf.Comp(1), buf.Comp(2), 1, 1, 1) } func reinitmfmconv() { SetBusy(true) defer SetBusy(false) if mfmconv_ == nil { mfmconv_ = cuda.NewMFM(Mesh(), MFMLift.v, MFMTipSize.v, *Flag_cachedir) } else { mfmconv_.Reinit(MFMLift.v, MFMTipSize.v, *Flag_cachedir) } } mumax3-3.10/engine/minimizer.go000066400000000000000000000064331371432437400164640ustar00rootroot00000000000000package engine // Minimize follows the steepest descent method as per Exl et al., JAP 115, 17D118 (2014). import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" ) var ( DmSamples int = 10 // number of dm to keep for convergence check StopMaxDm float64 = 1e-6 // stop minimizer if sampled dm is smaller than this ) func init() { DeclFunc("Minimize", Minimize, "Use steepest conjugate gradient method to minimize the total energy") DeclVar("MinimizerStop", &StopMaxDm, "Stopping max dM for Minimize") DeclVar("MinimizerSamples", &DmSamples, "Number of max dM to collect for Minimize convergence check.") } // fixed length FIFO. Items can be added but not removed type fifoRing struct { count int tail int // index to put next item. 
Will loop to 0 after exceeding length data []float64 } func FifoRing(length int) fifoRing { return fifoRing{data: make([]float64, length)} } func (r *fifoRing) Add(item float64) { r.data[r.tail] = item r.count++ r.tail = (r.tail + 1) % len(r.data) if r.count > len(r.data) { r.count = len(r.data) } } func (r *fifoRing) Max() float64 { max := r.data[0] for i := 1; i < r.count; i++ { if r.data[i] > max { max = r.data[i] } } return max } type Minimizer struct { k *data.Slice // torque saved to calculate time step lastDm fifoRing h float32 } func (mini *Minimizer) Step() { m := M.Buffer() size := m.Size() if mini.k == nil { mini.k = cuda.Buffer(3, size) torqueFn(mini.k) } k := mini.k h := mini.h // save original magnetization m0 := cuda.Buffer(3, size) defer cuda.Recycle(m0) data.Copy(m0, m) // make descent cuda.Minimize(m, m0, k, h) // calculate new torque for next step k0 := cuda.Buffer(3, size) defer cuda.Recycle(k0) data.Copy(k0, k) torqueFn(k) setMaxTorque(k) // report to user // just to make the following readable dm := m0 dk := k0 // calculate step difference of m and k cuda.Madd2(dm, m, m0, 1., -1.) cuda.Madd2(dk, k, k0, -1., 1.) // reversed due to LLNoPrecess sign // get maxdiff and add to list max_dm := cuda.MaxVecNorm(dm) mini.lastDm.Add(max_dm) setLastErr(mini.lastDm.Max()) // report maxDm to user as LastErr // adjust next time step var nom, div float32 if NSteps%2 == 0 { nom = cuda.Dot(dm, dm) div = cuda.Dot(dm, dk) } else { nom = cuda.Dot(dm, dk) div = cuda.Dot(dk, dk) } if div != 0. { mini.h = nom / div } else { // in case of division by zero mini.h = 1e-4 } M.normalize() // as a convention, time does not advance during relax NSteps++ } func (mini *Minimizer) Free() { mini.k.Free() } func Minimize() { Refer("exl2014") SanityCheck() // Save the settings we are changing... 
prevType := solvertype prevFixDt := FixDt prevPrecess := Precess t0 := Time relaxing = true // disable temperature noise // ...to restore them later defer func() { SetSolver(prevType) FixDt = prevFixDt Precess = prevPrecess Time = t0 relaxing = false }() Precess = false // disable precession for torque calculation // remove previous stepper if stepper != nil { stepper.Free() } // set stepper to the minimizer mini := Minimizer{ h: 1e-4, k: nil, lastDm: FifoRing(DmSamples)} stepper = &mini cond := func() bool { return (mini.lastDm.count < DmSamples || mini.lastDm.Max() > StopMaxDm) } RunWhile(cond) pause = true } mumax3-3.10/engine/number.go000066400000000000000000000016031371432437400157430ustar00rootroot00000000000000package engine import ( "reflect" ) // TODO: wrap around outputValue // inputValue is like outputValue, but settable type inputValue struct { v float64 onSet func() name, unit string } func numParam(v float64, name, unit string, onSet func()) inputValue { return inputValue{v: v, onSet: onSet, name: name, unit: unit} } func (p *inputValue) NComp() int { return 1 } func (p *inputValue) Name() string { return p.name } func (p *inputValue) Unit() string { return p.unit } func (p *inputValue) getRegion(int) []float64 { return []float64{float64(p.v)} } func (p *inputValue) Type() reflect.Type { return reflect.TypeOf(float64(0)) } func (p *inputValue) IsUniform() bool { return true } func (p *inputValue) Eval() interface{} { return p.v } func (p *inputValue) SetValue(v interface{}) { p.v = v.(float64) p.onSet() } mumax3-3.10/engine/od.go000066400000000000000000000014461371432437400150620ustar00rootroot00000000000000package engine // Management of output directory. import ( "github.com/mumax/3/httpfs" "strings" ) var ( outputdir string // Output directory InputFile string ) func OD() string { if outputdir == "" { panic("output not yet initialized") } return outputdir } // SetOD sets the output directory where auto-saved files will be stored. 
// The -o flag can also be used for this purpose.
// od gets a trailing "/" if it lacks one. When force is true, an existing
// directory is removed before it is (re)created. May be called only once:
// a second call panics.
func InitIO(inputfile, od string, force bool) {
	if outputdir != "" {
		panic("output directory already set")
	}

	InputFile = inputfile
	if !strings.HasSuffix(od, "/") {
		od += "/"
	}
	outputdir = od
	if strings.HasPrefix(outputdir, "http://") {
		httpfs.SetWD(outputdir + "/../")
	}
	LogOut("output directory:", outputdir)

	if force {
		httpfs.Remove(od)
	}

	// the error is dropped on purpose here; presumably the directory may
	// already exist -- TODO confirm against httpfs.Mkdir
	_ = httpfs.Mkdir(od)

	initLog()
	initBib()
}
mumax3-3.10/engine/oneregion.go000066400000000000000000000035761371432437400164530ustar00rootroot00000000000000package engine

import (
	"fmt"
	"github.com/mumax/3/cuda"
	"github.com/mumax/3/data"
	"github.com/mumax/3/util"
)

// sInRegion returns q restricted to region r as a ScalarField.
func sInRegion(q Quantity, r int) ScalarField {
	return AsScalarField(inRegion(q, r))
}

// vInRegion returns q restricted to region r as a VectorField.
func vInRegion(q Quantity, r int) VectorField {
	return AsVectorField(inRegion(q, r))
}

// sOneRegion wraps the 1-component quantity q, restricted to region r.
func sOneRegion(q Quantity, r int) *sOneReg {
	util.Argument(q.NComp() == 1)
	return &sOneReg{oneReg{q, r}}
}

// vOneRegion wraps the 3-component quantity q, restricted to region r.
func vOneRegion(q Quantity, r int) *vOneReg {
	util.Argument(q.NComp() == 3)
	return &vOneReg{oneReg{q, r}}
}

// sOneReg is a oneReg whose Average is a scalar.
type sOneReg struct{ oneReg }

func (q *sOneReg) Average() float64 { return q.average()[0] }

// vOneReg is a oneReg whose Average is a vector.
type vOneReg struct{ oneReg }

func (q *vOneReg) Average() data.Vector { return unslice(q.average()) }

// represents a new quantity equal to q in the given region, 0 outside.
type oneReg struct {
	parent Quantity
	region int
}

// inRegion returns a quantity equal to q in the given region.
func inRegion(q Quantity, region int) Quantity {
	return &oneReg{q, region}
}

func (q *oneReg) NComp() int             { return q.parent.NComp() }
func (q *oneReg) Name() string           { return fmt.Sprint(NameOf(q.parent), ".region", q.region) }
func (q *oneReg) Unit() string           { return UnitOf(q.parent) }
func (q *oneReg) Mesh() *data.Mesh       { return MeshOf(q.parent) }
func (q *oneReg) EvalTo(dst *data.Slice) { EvalTo(q, dst) }

// returns a new slice equal to q in the given region, 0 outside.
// The returned flag is always true: the slice comes from cuda.Buffer, so
// the caller is expected to recycle it.
func (q *oneReg) Slice() (*data.Slice, bool) {
	src := ValueOf(q.parent)
	defer cuda.Recycle(src)
	out := cuda.Buffer(q.NComp(), q.Mesh().Size())
	cuda.RegionSelect(out, src, regions.Gpu(), byte(q.region))
	return out, true
}

// average returns the universe average of the selected values, divided by
// regions.volume(q.region).
func (q *oneReg) average() []float64 {
	slice, r := q.Slice()
	if r {
		defer cuda.Recycle(slice)
	}
	avg := sAverageUniverse(slice)
	sDiv(avg, regions.volume(q.region))
	return avg
}

func (q *oneReg) Average() []float64 { return q.average() }

// slice division
// Divides every element of v by x, in place.
func sDiv(v []float64, x float64) {
	for i := range v {
		v[i] /= x
	}
}
mumax3-3.10/engine/outputquantities.go000066400000000000000000000131101371432437400201160ustar00rootroot00000000000000package engine

/*
The metadata layer wraps basic micromagnetic functions (e.g. func SetDemagField())
in objects that provide:

- additional information (Name, Unit, ...) used for saving output,
- additional methods (Comp, Region, ...) handy for input scripting.
*/

import (
	"fmt"
	"github.com/mumax/3/cuda"
	"github.com/mumax/3/data"
)

// The Info interface defines the bare minimum methods a quantity must implement
// to be accessible for scripting and I/O.
type Info interface {
	Name() string // name used for output file (e.g. "m")
	Unit() string // unit, e.g. "A/m"
	NComp() int   // number of components (scalar, vector, ...)
}

// info provides an Info implementation intended for embedding in other types.
type info struct {
	nComp int
	name  string
	unit  string
}

func (i *info) Name() string { return i.name }
func (i *info) Unit() string { return i.unit }
func (i *info) NComp() int   { return i.nComp }

// valueFunc is an outputValue implementation where a function provides the output value.
// It can be scalar or vector.
// Used internally by NewScalarValue and NewVectorValue.
type valueFunc struct { info f func() []float64 } func (g *valueFunc) get() []float64 { return g.f() } func (g *valueFunc) average() []float64 { return g.get() } func (g *valueFunc) EvalTo(dst *data.Slice) { v := g.get() for c, v := range v { cuda.Memset(dst.Comp(c), float32(v)) } } // ScalarValue enhances an outputValue with methods specific to // a space-independent scalar quantity (e.g. total energy). type ScalarValue struct { *valueFunc } // NewScalarValue constructs an outputable space-independent scalar quantity whose // value is provided by function f. func NewScalarValue(name, unit, desc string, f func() float64) *ScalarValue { g := func() []float64 { return []float64{f()} } v := &ScalarValue{&valueFunc{info{1, name, unit}, g}} Export(v, desc) return v } func (s ScalarValue) Get() float64 { return s.average()[0] } func (s ScalarValue) Average() float64 { return s.Get() } // VectorValue enhances an outputValue with methods specific to // a space-independent vector quantity (e.g. averaged magnetization). type VectorValue struct { *valueFunc } // NewVectorValue constructs an outputable space-independent vector quantity whose // value is provided by function f. func NewVectorValue(name, unit, desc string, f func() []float64) *VectorValue { v := &VectorValue{&valueFunc{info{3, name, unit}, f}} Export(v, desc) return v } func (v *VectorValue) Get() data.Vector { return unslice(v.average()) } func (v *VectorValue) Average() data.Vector { return v.Get() } // NewVectorField constructs an outputable space-dependent vector quantity whose // value is provided by function f. func NewVectorField(name, unit, desc string, f func(dst *data.Slice)) VectorField { v := AsVectorField(&fieldFunc{info{3, name, unit}, f}) DeclROnly(name, v, cat(desc, unit)) return v } // NewVectorField constructs an outputable space-dependent scalar quantity whose // value is provided by function f. 
func NewScalarField(name, unit, desc string, f func(dst *data.Slice)) ScalarField { q := AsScalarField(&fieldFunc{info{1, name, unit}, f}) DeclROnly(name, q, cat(desc, unit)) return q } type fieldFunc struct { info f func(*data.Slice) } func (c *fieldFunc) Mesh() *data.Mesh { return Mesh() } func (c *fieldFunc) average() []float64 { return qAverageUniverse(c) } func (c *fieldFunc) EvalTo(dst *data.Slice) { EvalTo(c, dst) } // Calculates and returns the quantity. // recycle is true: slice needs to be recycled. func (q *fieldFunc) Slice() (s *data.Slice, recycle bool) { buf := cuda.Buffer(q.NComp(), q.Mesh().Size()) cuda.Zero(buf) q.f(buf) return buf, true } // ScalarField enhances an outputField with methods specific to // a space-dependent scalar quantity. type ScalarField struct { Quantity } // AsScalarField promotes a quantity to a ScalarField, // enabling convenience methods particular to scalars. func AsScalarField(q Quantity) ScalarField { if q.NComp() != 1 { panic(fmt.Errorf("ScalarField(%v): need 1 component, have: %v", NameOf(q), q.NComp())) } return ScalarField{q} } func (s ScalarField) average() []float64 { return AverageOf(s.Quantity) } func (s ScalarField) Average() float64 { return s.average()[0] } func (s ScalarField) Region(r int) ScalarField { return AsScalarField(inRegion(s.Quantity, r)) } func (s ScalarField) Name() string { return NameOf(s.Quantity) } func (s ScalarField) Unit() string { return UnitOf(s.Quantity) } // VectorField enhances an outputField with methods specific to // a space-dependent vector quantity. type VectorField struct { Quantity } // AsVectorField promotes a quantity to a VectorField, // enabling convenience methods particular to vectors. 
// Panics unless q has exactly three components.
func AsVectorField(q Quantity) VectorField {
	if q.NComp() != 3 {
		panic(fmt.Errorf("VectorField(%v): need 3 components, have: %v", NameOf(q), q.NComp()))
	}
	return VectorField{q}
}

func (v VectorField) average() []float64 { return AverageOf(v.Quantity) }

// Average returns the average as a vector.
func (v VectorField) Average() data.Vector { return unslice(v.average()) }

// Region returns the field restricted to region r.
func (v VectorField) Region(r int) VectorField { return AsVectorField(inRegion(v.Quantity, r)) }

// Comp returns component c as a scalar field.
func (v VectorField) Comp(c int) ScalarField { return AsScalarField(Comp(v.Quantity, c)) }
func (v VectorField) Mesh() *data.Mesh       { return MeshOf(v.Quantity) }
func (v VectorField) Name() string           { return NameOf(v.Quantity) }
func (v VectorField) Unit() string           { return UnitOf(v.Quantity) }

// HostCopy evaluates the field and returns a copy in host memory; the
// temporary buffer used for the evaluation is recycled.
func (v VectorField) HostCopy() *data.Slice {
	s := ValueOf(v.Quantity)
	defer cuda.Recycle(s)
	return s.HostCopy()
}
mumax3-3.10/engine/parameter.go000066400000000000000000000221411371432437400164330ustar00rootroot00000000000000package engine

/*
parameters are region- and time dependent input values,
like material parameters.
*/

import (
	"fmt"
	"math"
	"reflect"
	"strings"

	"github.com/mumax/3/cuda"
	"github.com/mumax/3/data"
	"github.com/mumax/3/script"
	"github.com/mumax/3/util"
)

// input parameter, settable by user
type regionwise struct {
	lut
	upd_reg    [NREGION]func() []float64 // time-dependent values
	timestamp  float64                   // used not to double-evaluate f(t)
	children   []derived                 // derived parameters
	name, unit string
}

// init sets up the look-up table and metadata. The timestamp starts at -Inf
// so that the first update() always evaluates the time-dependent functions.
func (p *regionwise) init(nComp int, name, unit string, children []derived) {
	p.lut.init(nComp, p)
	p.name = name
	p.unit = unit
	p.children = children
	p.timestamp = math.Inf(-1)
}

// MSlice returns the parameter as a cuda.MSlice: a nil slice combined with
// the region-0 value when the parameter is uniform, a full buffer otherwise.
func (p *regionwise) MSlice() cuda.MSlice {
	if p.IsUniform() {
		return cuda.MakeMSlice(data.NilSlice(p.NComp(), Mesh().Size()), p.getRegion(0))
	} else {
		buf, r := p.Slice()
		util.Assert(r == true)
		return cuda.ToMSlice(buf)
	}
}

func (p *regionwise) Name() string     { return p.name }
func (p *regionwise) Unit() string     { return p.unit }
func (p *regionwise) Mesh() *data.Mesh { return Mesh() }

// addChild registers derived parameters to be invalidated together with p.
// Children that are already registered are skipped.
func (p *regionwise) addChild(c ...derived) {
	for _, c := range c {
		// TODO: no duplicates
		if !contains(p.children, c) {
			p.children = append(p.children, c)
			fmt.Println(p, ".addChild", c)
		}
	}
}

// contains reports whether x is an element of s.
func contains(s []derived, x derived) bool {
	for _, y := range s {
		if y == x {
			return true
		}
	}
	return false
}

// update re-evaluates the time-dependent region functions, at most once per
// value of Time, and invalidates p when any of them was evaluated.
func (p *regionwise) update() {
	if p.timestamp != Time {
		changed := false
		// update functions of time
		for r := 0; r < NREGION; r++ {
			updFunc := p.upd_reg[r]
			if updFunc != nil {
				p.bufset_(r, updFunc())
				changed = true
			}
		}
		p.timestamp = Time
		if changed {
			p.invalidate()
		}
	}
}

// set in one region
// region -1 means all regions.
func (p *regionwise) setRegion(region int, v []float64) {
	if region == -1 {
		p.setUniform(v)
	} else {
		p.setRegions(region, region+1, v)
	}
}

// set in all regions
func (p *regionwise) setUniform(v []float64) {
	p.setRegions(0, NREGION, v)
}

// set in regions r1..r2(excl)
// Any time-dependent function previously set for these regions is removed.
func (p *regionwise) setRegions(r1, r2 int, v []float64) {
	util.Argument(len(v) == len(p.cpu_buf))
	util.Argument(r1 < r2) // exclusive upper bound
	for r := r1; r < r2; r++ {
		p.upd_reg[r] = nil
		p.bufset_(r, v)
	}
	p.invalidate()
}

// bufset_ writes v into the CPU table for one region, without invalidating.
func (p *regionwise) bufset_(region int, v []float64) {
	for c := range p.cpu_buf {
		p.cpu_buf[c][region] = float32(v[c])
	}
}

// setFunc installs f as time-dependent value for regions r1..r2(excl).
func (p *regionwise) setFunc(r1, r2 int, f func() []float64) {
	util.Argument(r1 < r2) // exclusive upper bound
	for r := r1; r < r2; r++ {
		p.upd_reg[r] = f
	}
	p.invalidate()
}

// mark my GPU copy and my children as invalid (need update)
func (p *regionwise) invalidate() {
	p.gpu_ok = false
	for _, c := range p.children {
		c.invalidate()
	}
}

// getRegion returns the value (one entry per component) in the given region.
func (p *regionwise) getRegion(region int) []float64 {
	cpu := p.cpuLUT()
	v := make([]float64, p.NComp())
	for i := range v {
		v[i] = float64(cpu[i][region])
	}
	return v
}

// IsUniform reports whether all regions hold the same value as region 0.
func (p *regionwise) IsUniform() bool {
	cpu := p.cpuLUT()
	v1 := p.getRegion(0)
	for r := 1; r < NREGION; r++ {
		for c := range v1 {
			if cpu[c][r] != float32(v1[c]) {
				return false
			}
		}
	}
	return true
}

func (p *regionwise) average() []float64 { return qAverageUniverse(p) }

// parameter derived from others (not directly settable). E.g.: Bsat derived from Msat
type DerivedParam struct {
	lut                          // GPU storage
	updater  func(*DerivedParam) // called to update my value
	uptodate bool                // cleared if parents' value change
	parents  []updater           // parents updated before I'm updated
}

// any parameter that depends on an inputParam
type derived interface {
	invalidate()
}

type parent interface {
	update()
	addChild(...derived)
}

// NewDerivedParam returns a derived parameter computed by updater from parents.
// NOTE(review): unlike (*DerivedParam).init, this constructor does not call
// addChild on the parents, so a change of a parent does not invalidate the
// result -- confirm whether that is intended.
func NewDerivedParam(nComp int, parents []parent, updater func(*DerivedParam)) *DerivedParam {
	p := new(DerivedParam)
	p.lut.init(nComp, p) // pass myself to update me if needed
	p.updater = updater
	for _, P := range parents {
		p.parents = append(p.parents, P)
	}
	return p
}

// init sets up d and registers it as child of each parent.
func (d *DerivedParam) init(nComp int, parents []parent, updater func(*DerivedParam)) {
	d.lut.init(nComp, d) // pass myself to update me if needed
	d.updater = updater
	for _, p := range parents {
		d.parents = append(d.parents, p)
		p.addChild(d)
	}
}

func (p *DerivedParam) invalidate() {
	p.uptodate = false
}

// update first updates the parents and then, if needed, recomputes p.
func (p *DerivedParam) update() {
	for _, par := range p.parents {
		par.update() // may invalidate me
	}
	if !p.uptodate {
		p.updater(p)
		p.gpu_ok = false
		p.uptodate = true
	}
}

// Get value in region r.
func (p *DerivedParam) GetRegion(r int) []float64 {
	lut := p.cpuLUT() // updates me if needed
	v := make([]float64, p.NComp())
	for c := range v {
		v[c] = float64(lut[c][r])
	}
	return v
}

// specialized param with 1 component
type RegionwiseScalar struct {
	regionwise
}

func (p *RegionwiseScalar) init(name, unit, desc string, children []derived) {
	p.regionwise.init(SCALAR, name, unit, children)
	if !strings.HasPrefix(name, "_") { // don't export names beginning with "_" (e.g. from excitation)
		DeclLValue(name, p, cat(desc, unit))
	}
}

// TODO: auto derived
func NewScalarParam(name, unit, desc string, children ...derived) *RegionwiseScalar {
	p := new(RegionwiseScalar)
	p.regionwise.init(SCALAR, name, unit, children)
	if !strings.HasPrefix(name, "_") { // don't export names beginning with "_" (e.g. from excitation)
		DeclLValue(name, p, cat(desc, unit))
	}
	return p
}

// SetRegion sets the parameter in one region; region -1 means all regions.
func (p *RegionwiseScalar) SetRegion(region int, f script.ScalarFunction) {
	if region == -1 {
		p.setRegionsFunc(0, NREGION, f) // uniform
	} else {
		p.setRegionsFunc(region, region+1, f) // upper bound exclusive
	}
}

// SetValue sets the parameter in all regions; v must be a script.ScalarFunction.
func (p *RegionwiseScalar) SetValue(v interface{}) {
	f := v.(script.ScalarFunction)
	p.setRegionsFunc(0, NREGION, f)
}

// Set sets the constant value v in all regions.
func (p *RegionwiseScalar) Set(v float64) {
	p.setRegions(0, NREGION, []float64{v})
}

// setRegionsFunc stores f as a constant when it does not depend on t, and
// as a time-dependent function otherwise.
func (p *RegionwiseScalar) setRegionsFunc(r1, r2 int, f script.ScalarFunction) {
	if IsConst(f) {
		p.setRegions(r1, r2, []float64{f.Float()})
	} else {
		f := f.Fix() // fix values of all variables except t
		p.setFunc(r1, r2, func() []float64 {
			return []float64{f.Eval().(script.ScalarFunction).Float()}
		})
	}
}

func (p *RegionwiseScalar) GetRegion(region int) float64 {
	return float64(p.getRegion(region)[0])
}

func (p *RegionwiseScalar) Eval() interface{}       { return p }
func (p *RegionwiseScalar) Type() reflect.Type      { return reflect.TypeOf(new(RegionwiseScalar)) }
func (p *RegionwiseScalar) InputType() reflect.Type { return script.ScalarFunction_t }
func (p *RegionwiseScalar) Average() float64        { return qAverageUniverse(p)[0] }
func (p *RegionwiseScalar) Region(r int) *sOneReg   { return sOneRegion(p, r) }

// checks if a script expression contains t (time)
func IsConst(e script.Expr) bool {
	t := World.Resolve("t")
	return !script.Contains(e, t)
}

// cat appends the unit, in parentheses, to the description.
func cat(desc, unit string) string {
	if unit == "" {
		return desc
	} else {
		return desc + " (" + unit + ")"
	}
}

// these methods should only be accessible from Go

func (p *RegionwiseScalar) SetRegionValueGo(region int, v float64) {
	if region == -1 {
		p.setRegions(0, NREGION, []float64{v})
	} else {
		p.setRegions(region, region+1, []float64{v})
	}
}

func (p *RegionwiseScalar) SetRegionFuncGo(region int, f func() float64) {
	if region == -1 {
		p.setFunc(0, NREGION, func() []float64 {
			return []float64{f()}
		})
	} else {
		p.setFunc(region, region+1, func() []float64 {
			return []float64{f()}
		})
	}
}

// vector input parameter, settable by user
type RegionwiseVector struct {
	regionwise
}

func NewVectorParam(name, unit, desc string) *RegionwiseVector {
	p := new(RegionwiseVector)
	p.regionwise.init(VECTOR, name, unit, nil) // no vec param has children (yet)
	if !strings.HasPrefix(name, "_") { // don't export names beginning with "_" (e.g. from excitation)
		DeclLValue(name, p, cat(desc, unit))
	}
	return p
}

// SetRegion sets the parameter in one region; region -1 means all regions.
func (p *RegionwiseVector) SetRegion(region int, f script.VectorFunction) {
	if region == -1 {
		p.setRegionsFunc(0, NREGION, f) //uniform
	} else {
		p.setRegionsFunc(region, region+1, f)
	}
}

// SetValue sets the parameter in all regions; v must be a script.VectorFunction.
func (p *RegionwiseVector) SetValue(v interface{}) {
	f := v.(script.VectorFunction)
	p.setRegionsFunc(0, NREGION, f)
}

func (p *RegionwiseVector) setRegionsFunc(r1, r2 int, f script.VectorFunction) {
	if IsConst(f) {
		p.setRegions(r1, r2, slice(f.Float3()))
	} else {
		f := f.Fix() // fix values of all variables except t
		p.setFunc(r1, r2, func() []float64 {
			return slice(f.Eval().(script.VectorFunction).Float3())
		})
	}
}

// SetRegionFn installs f as time-dependent value in one region.
// region -1 means all regions, as in SetRegion and SetRegionFuncGo; without
// this case the call would index the per-region table at -1 and panic.
func (p *RegionwiseVector) SetRegionFn(region int, f func() [3]float64) {
	g := func() []float64 {
		return slice(f())
	}
	if region == -1 {
		p.setFunc(0, NREGION, g)
	} else {
		p.setFunc(region, region+1, g)
	}
}

func (p *RegionwiseVector) GetRegion(region int) [3]float64 {
	v := p.getRegion(region)
	return unslice(v)
}

func (p *RegionwiseVector) Eval() interface{}       { return p }
func (p *RegionwiseVector) Type() reflect.Type      { return reflect.TypeOf(new(RegionwiseVector)) }
func (p *RegionwiseVector) InputType() reflect.Type { return script.VectorFunction_t }
func (p *RegionwiseVector) Region(r int) *vOneReg   { return vOneRegion(p, r) }
func (p *RegionwiseVector) Average() data.Vector    { return unslice(qAverageUniverse(p)) }
func (p *RegionwiseVector) Comp(c int) ScalarField  { return Comp(p, c) }
mumax3-3.10/engine/plot.go000066400000000000000000000035711371432437400154370ustar00rootroot00000000000000package engine

import (
	"bytes"
	"errors"
	"fmt"
	"github.com/mumax/3/httpfs"
	"image"
	"image/png"
	"io/ioutil"
	"net/http"
	"os/exec"
	"sync/atomic"
)

var nPlots int32       // counts number of active gnuplot processes
const MAX_GNUPLOTS = 5 // maximum allowed number of gnuplot processes

// servePlot pipes table.txt from the output directory through gnuplot and
// writes the resulting SVG to w. On any error an empty image is written
// and the error text is stored in the GUI element "plotErr".
func (g *guistate) servePlot(w http.ResponseWriter, r *http.Request) {

	out := []byte{}

	// handle error and return whether err != nil.
	handle := func(err error) bool {
		if err != nil {
			w.Write(emptyIMG())
			g.Set("plotErr", err.Error()+string(out))
			return true
		} else {
			return false
		}
	}

	// limit max processes
	atomic.AddInt32(&nPlots, 1)
	defer atomic.AddInt32(&nPlots, -1)
	if atomic.LoadInt32(&nPlots) > MAX_GNUPLOTS {
		handle(errors.New("too many gnuplot processes"))
		return
	}

	a := g.StringValue("usingx")
	b := g.StringValue("usingy")

	// NOTE(review): a and b come from the GUI state and are inserted into
	// the gnuplot command without validation -- confirm they cannot carry
	// arbitrary gnuplot commands.
	cmd := "gnuplot"
	args := []string{"-e", fmt.Sprintf(`set format x "%%g"; set key off; set format y "%%g"; set term svg size 480,320 font 'Arial,10'; plot "-" u %v:%v w li; set output;exit;`, a, b)}
	excmd := exec.Command(cmd, args...)

	stdin, err := excmd.StdinPipe()
	if handle(err) {
		return
	}

	stdout, err := excmd.StdoutPipe()
	if handle(err) {
		return
	}

	data, err := httpfs.Read(fmt.Sprintf(`%vtable.txt`, OD()))
	if handle(err) {
		return
	}

	err = excmd.Start()
	if handle(err) {
		return
	}
	defer excmd.Wait()

	_, err = stdin.Write(data)
	if handle(err) {
		return
	}
	err = stdin.Close()
	if handle(err) {
		return
	}

	out, err = ioutil.ReadAll(stdout)
	if handle(err) {
		return
	}

	w.Header().Set("Content-Type", "image/svg+xml")
	w.Write(out)
	g.Set("plotErr", "")
	return
}

var empty_img []byte

// empty image to show if there's no plot...
// The 4x4 PNG is encoded on first use and cached in empty_img.
func emptyIMG() []byte {
	if empty_img == nil {
		o := bytes.NewBuffer(nil)
		png.Encode(o, image.NewNRGBA(image.Rect(0, 0, 4, 4)))
		empty_img = o.Bytes()
	}
	return empty_img
}
mumax3-3.10/engine/quantity.go000066400000000000000000000032131371432437400163300ustar00rootroot00000000000000package engine

import (
	"github.com/mumax/3/cuda"
	"github.com/mumax/3/data"
	"reflect"
)

// Arbitrary physical quantity.
// A Quantity has a number of components and can evaluate itself into a slice.
// Name, Unit, Mesh and average are optional: see NameOf, UnitOf, MeshOf, AverageOf.
type Quantity interface {
	NComp() int
	EvalTo(dst *data.Slice)
}

// MeshSize returns the size of the global mesh.
func MeshSize() [3]int {
	return Mesh().Size()
}

// SizeOf returns the size of q's own mesh if it has a Mesh() method,
// and the size of the global mesh otherwise.
func SizeOf(q Quantity) [3]int {
	// quantity defines its own, custom, implementation:
	if s, ok := q.(interface {
		Mesh() *data.Mesh
	}); ok {
		return s.Mesh().Size()
	}
	// otherwise: default mesh
	return MeshSize()
}

// AverageOf returns q's own average() if it has one; otherwise q is
// evaluated and averaged with sAverageMagnet.
func AverageOf(q Quantity) []float64 {
	// quantity defines its own, custom, implementation:
	if s, ok := q.(interface {
		average() []float64
	}); ok {
		return s.average()
	}
	// otherwise: evaluate into a temporary buffer and average that
	buf := ValueOf(q)
	defer cuda.Recycle(buf)
	return sAverageMagnet(buf)
}

// NameOf returns q's Name() if it has one, and "unnamed." followed by the
// type of q otherwise.
func NameOf(q Quantity) string {
	// quantity defines its own, custom, implementation:
	if s, ok := q.(interface {
		Name() string
	}); ok {
		return s.Name()
	}
	return "unnamed." + reflect.TypeOf(q).String()
}

// UnitOf returns q's Unit() if it has one, and "?" otherwise.
func UnitOf(q Quantity) string {
	// quantity defines its own, custom, implementation:
	if s, ok := q.(interface {
		Unit() string
	}); ok {
		return s.Unit()
	}
	return "?"
}

// MeshOf returns q's own mesh if it has a Mesh() method, and the global
// mesh otherwise.
func MeshOf(q Quantity) *data.Mesh {
	// quantity defines its own, custom, implementation:
	if s, ok := q.(interface {
		Mesh() *data.Mesh
	}); ok {
		return s.Mesh()
	}
	return Mesh()
}

// ValueOf evaluates q into a buffer obtained from cuda.Buffer; callers in
// this file hand the buffer back with cuda.Recycle when done.
func ValueOf(q Quantity) *data.Slice {
	// TODO: check for Buffered() implementation
	buf := cuda.Buffer(q.NComp(), SizeOf(q))
	q.EvalTo(buf)
	return buf
}

// Temporary shim to fit Slice into EvalTo
// Copies the result of q.Slice() into dst and recycles it when q asks for that.
func EvalTo(q interface {
	Slice() (*data.Slice, bool)
}, dst *data.Slice) {
	v, r := q.Slice()
	if r {
		defer cuda.Recycle(v)
	}
	data.Copy(dst, v)
}
mumax3-3.10/engine/regions.go000066400000000000000000000146611371432437400161310ustar00rootroot00000000000000package engine

import (
	"github.com/mumax/3/cuda"
	"github.com/mumax/3/data"
	"github.com/mumax/3/util"
)

var regions = Regions{info: info{1, "regions", ""}} // global regions map

const NREGION = 256 // maximum number of regions, limited by size of byte.
func init() { DeclFunc("DefRegion", DefRegion, "Define a material region with given index (0-255) and shape") DeclROnly("regions", ®ions, "Outputs the region index for each cell") DeclFunc("DefRegionCell", DefRegionCell, "Set a material region (first argument) in one cell "+ "by the index of the cell (last three arguments)") } // stores the region index for each cell type Regions struct { gpuCache *cuda.Bytes // TODO: rename: buffer hist []func(x, y, z float64) int // history of region set operations info } func (r *Regions) alloc() { mesh := r.Mesh() r.gpuCache = cuda.NewBytes(mesh.NCell()) DefRegion(0, universe) } func (r *Regions) resize() { newSize := Mesh().Size() r.gpuCache.Free() r.gpuCache = cuda.NewBytes(prod(newSize)) for _, f := range r.hist { r.render(f) } } // Define a region with id (0-255) to be inside the Shape. func DefRegion(id int, s Shape) { defRegionId(id) f := func(x, y, z float64) int { if s(x, y, z) { return id } else { return -1 } } regions.render(f) regions.hist = append(regions.hist, f) } // renders (rasterizes) shape, filling it with region number #id, between x1 and x2 // TODO: a tidbit expensive func (r *Regions) render(f func(x, y, z float64) int) { n := Mesh().Size() l := r.HostList() // need to start from previous state arr := reshapeBytes(l, r.Mesh().Size()) for iz := 0; iz < n[Z]; iz++ { for iy := 0; iy < n[Y]; iy++ { for ix := 0; ix < n[X]; ix++ { r := Index2Coord(ix, iy, iz) region := f(r[X], r[Y], r[Z]) if region >= 0 { arr[iz][iy][ix] = byte(region) } } } } //log.Print("regions.upload") r.gpuCache.Upload(l) } // get the region for position R based on the history func (r *Regions) get(R data.Vector) int { // reverse order, last one set wins. 
for i := len(r.hist) - 1; i >= 0; i-- { f := r.hist[i] region := f(R[X], R[Y], R[Z]) if region >= 0 { return region } } return 0 } func (r *Regions) HostArray() [][][]byte { return reshapeBytes(r.HostList(), r.Mesh().Size()) } func (r *Regions) HostList() []byte { regionsList := make([]byte, r.Mesh().NCell()) regions.gpuCache.Download(regionsList) return regionsList } func DefRegionCell(id int, x, y, z int) { defRegionId(id) index := data.Index(Mesh().Size(), x, y, z) regions.gpuCache.Set(index, byte(id)) } // Load regions from ovf file, use first component. // Regions should be between 0 and 256 func (r *Regions) LoadFile(fname string) { inSlice := LoadFile(fname) n := r.Mesh().Size() inSlice = data.Resample(inSlice, n) inArr := inSlice.Tensors()[0] l := r.HostList() arr := reshapeBytes(l, n) for iz := 0; iz < n[Z]; iz++ { for iy := 0; iy < n[Y]; iy++ { for ix := 0; ix < n[X]; ix++ { val := inArr[iz][iy][ix] if val < 0 || val > 256 { util.Fatal("regions.LoadFile(", fname, "): all values should be between 0 & 256, have: ", val) } arr[iz][iy][ix] = byte(val) } } } r.gpuCache.Upload(l) } func (r *Regions) average() []float64 { s, recycle := r.Slice() if recycle { defer cuda.Recycle(s) } return sAverageUniverse(s) } func (r *Regions) Average() float64 { return r.average()[0] } // Set the region of one cell func (r *Regions) SetCell(ix, iy, iz int, region int) { size := Mesh().Size() i := data.Index(size, ix, iy, iz) r.gpuCache.Set(i, byte(region)) } func (r *Regions) GetCell(ix, iy, iz int) int { size := Mesh().Size() i := data.Index(size, ix, iy, iz) return int(r.gpuCache.Get(i)) } func defRegionId(id int) { if id < 0 || id > NREGION { util.Fatalf("region id should be 0 -%v, have: %v", NREGION, id) } checkMesh() } // normalized volume (0..1) of region. 
// TODO: a tidbit too expensive func (r *Regions) volume(region_ int) float64 { region := byte(region_) vol := 0 list := r.HostList() for _, reg := range list { if reg == region { vol++ } } V := float64(vol) / float64(r.Mesh().NCell()) return V } // Get the region data on GPU func (r *Regions) Gpu() *cuda.Bytes { return r.gpuCache } var unitMap regionwise // unit map used to output regions quantity func init() { unitMap.init(1, "unit", "", nil) for r := 0; r < NREGION; r++ { unitMap.setRegion(r, []float64{float64(r)}) } } // Get returns the regions as a slice of floats, so it can be output. func (r *Regions) Slice() (*data.Slice, bool) { buf := cuda.Buffer(1, r.Mesh().Size()) cuda.RegionDecode(buf, unitMap.gpuLUT1(), regions.Gpu()) return buf, true } func (r *Regions) EvalTo(dst *data.Slice) { EvalTo(r, dst) } var _ Quantity = ®ions // Re-interpret a contiguous array as a multi-dimensional array of given size. func reshapeBytes(array []byte, size [3]int) [][][]byte { Nx, Ny, Nz := size[X], size[Y], size[Z] util.Argument(Nx*Ny*Nz == len(array)) sliced := make([][][]byte, Nz) for i := range sliced { sliced[i] = make([][]byte, Ny) } for i := range sliced { for j := range sliced[i] { sliced[i][j] = array[(i*Ny+j)*Nx+0 : (i*Ny+j)*Nx+Nx] } } return sliced } func (b *Regions) shift(dx int) { // TODO: return if no regions defined r1 := b.Gpu() r2 := cuda.NewBytes(b.Mesh().NCell()) // TODO: somehow recycle defer r2.Free() newreg := byte(0) // new region at edge cuda.ShiftBytes(r2, r1, b.Mesh(), dx, newreg) r1.Copy(r2) n := Mesh().Size() x1, x2 := shiftDirtyRange(dx) for iz := 0; iz < n[Z]; iz++ { for iy := 0; iy < n[Y]; iy++ { for ix := x1; ix < x2; ix++ { r := Index2Coord(ix, iy, iz) // includes shift reg := b.get(r) if reg != 0 { b.SetCell(ix, iy, iz, reg) // a bit slowish, but hardly reached } } } } } func (b *Regions) shiftY(dy int) { // TODO: return if no regions defined r1 := b.Gpu() r2 := cuda.NewBytes(b.Mesh().NCell()) // TODO: somehow recycle defer r2.Free() newreg 
:= byte(0) // new region at edge cuda.ShiftBytesY(r2, r1, b.Mesh(), dy, newreg) r1.Copy(r2) n := Mesh().Size() y1, y2 := shiftDirtyRange(dy) for iz := 0; iz < n[Z]; iz++ { for ix := 0; ix < n[X]; ix++ { for iy := y1; iy < y2; iy++ { r := Index2Coord(ix, iy, iz) // includes shift reg := b.get(r) if reg != 0 { b.SetCell(ix, iy, iz, reg) // a bit slowish, but hardly reached } } } } } func (r *Regions) Mesh() *data.Mesh { return Mesh() } func prod(s [3]int) int { return s[0] * s[1] * s[2] } mumax3-3.10/engine/relax.go000066400000000000000000000055701371432437400155750ustar00rootroot00000000000000package engine // Relax tries to find the minimum energy state. import ( "github.com/mumax/3/cuda" "math" ) //Stopping relax Maxtorque in T. The user can check MaxTorque for sane values (e.g. 1e-3). // If set to 0, relax() will stop when the average torque is steady or increasing. var RelaxTorqueThreshold float64 = -1. func init() { DeclFunc("Relax", Relax, "Try to minimize the total energy") DeclVar("RelaxTorqueThreshold", &RelaxTorqueThreshold, "MaxTorque threshold for relax(). If set to -1 (default), relax() will stop when the average torque is steady or increasing.") } // are we relaxing? var relaxing = false func Relax() { SanityCheck() pause = false // Save the settings we are changing... prevType := solvertype prevErr := MaxErr prevFixDt := FixDt prevPrecess := Precess // ...to restore them later defer func() { SetSolver(prevType) MaxErr = prevErr FixDt = prevFixDt Precess = prevPrecess relaxing = false // Temp.upd_reg = prevTemp // Temp.invalidate() // Temp.update() }() // Set good solver for relax SetSolver(BOGAKISHAMPINE) FixDt = 0 Precess = false relaxing = true // Minimize energy: take steps as long as energy goes down. // This stops when energy reaches the numerical noise floor. 
const N = 3 // evaluate energy (expensive) every N steps relaxSteps(N) E0 := GetTotalEnergy() relaxSteps(N) E1 := GetTotalEnergy() for E1 < E0 && !pause { relaxSteps(N) E0, E1 = E1, GetTotalEnergy() } // Now we are already close to equilibrium, but energy is too noisy to be used any further. // So now we minimize the torque which is less noisy. solver := stepper.(*RK23) defer stepper.Free() // purge previous rk.k1 because FSAL will be dead wrong. maxTorque := func() float64 { return cuda.MaxVecNorm(solver.k1) } avgTorque := func() float32 { return cuda.Dot(solver.k1, solver.k1) } if RelaxTorqueThreshold > 0 { // run as long as the max torque is above threshold. Then increase the accuracy and step more. for !pause { for maxTorque() > RelaxTorqueThreshold && !pause { relaxSteps(N) } MaxErr /= math.Sqrt2 if MaxErr < 1e-9 { break } } } else { // previous ( 1e-9 && !pause { MaxErr /= math.Sqrt2 relaxSteps(N) // TODO: Play with other values T0, T1 = T1, avgTorque() for T1 < T0 && !pause { relaxSteps(N) // TODO: Play with other values T0, T1 = T1, avgTorque() } } } pause = true } // take n steps without setting pause when done or advancing time func relaxSteps(n int) { t0 := Time stop := NSteps + n cond := func() bool { return NSteps < stop } const output = false runWhile(cond, output) Time = t0 } mumax3-3.10/engine/render.go000066400000000000000000000063401371432437400157350ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/draw" "image" "image/jpeg" "math" "net/http" "sync" ) type render struct { mutex sync.Mutex quant Quantity comp string layer, scale int saveCount int // previous max slider value of time rescaleBuf *data.Slice // GPU imgBuf *data.Slice // CPU img_ *image.RGBA } const ( maxScale = 32 // maximum zoom-out setting maxImgSize = 512 // maximum render image size ) // Render image of quantity. 
func (g *guistate) ServeRender(w http.ResponseWriter, r *http.Request) { g.render.mutex.Lock() defer g.render.mutex.Unlock() g.render.render() jpeg.Encode(w, g.render.img_, &jpeg.Options{Quality: 100}) } // rescale and download quantity, save in rescaleBuf func (ren *render) download() { InjectAndWait(func() { if ren.quant == nil { // not yet set, default = m ren.quant = &M } quant := ren.quant size := MeshOf(quant).Size() // don't slice out of bounds renderLayer := ren.layer if renderLayer >= size[Z] { renderLayer = size[Z] - 1 } if renderLayer < 0 { renderLayer = 0 } // scaling sanity check if ren.scale < 1 { ren.scale = 1 } if ren.scale > maxScale { ren.scale = maxScale } // Don't render too large images or we choke for size[X]/ren.scale > maxImgSize { ren.scale++ } for size[Y]/ren.scale > maxImgSize { ren.scale++ } for i := range size { size[i] /= ren.scale if size[i] == 0 { size[i] = 1 } } size[Z] = 1 // selects one layer // make sure buffers are there if ren.imgBuf.Size() != size { ren.imgBuf = data.NewSlice(3, size) // always 3-comp, may be re-used } buf := ValueOf(quant) defer cuda.Recycle(buf) if !buf.GPUAccess() { ren.imgBuf = Download(quant) // fallback (no zoom) return } // make sure buffers are there (in CUDA context) if ren.rescaleBuf.Size() != size { ren.rescaleBuf.Free() ren.rescaleBuf = cuda.NewSlice(1, size) } for c := 0; c < quant.NComp(); c++ { cuda.Resize(ren.rescaleBuf, buf.Comp(c), renderLayer) data.Copy(ren.imgBuf.Comp(c), ren.rescaleBuf) } }) } var arrowSize = 16 func (ren *render) render() { ren.download() // imgBuf always has 3 components, we may need just one... d := ren.imgBuf comp := ren.comp quant := ren.quant if comp == "" { normalize(d) } if comp != "" && quant.NComp() > 1 { // ... if one has been selected by gui d = d.Comp(compstr[comp]) } if quant.NComp() == 1 { // ...or if the original data only had one (!) 
d = d.Comp(0) } if ren.img_ == nil { ren.img_ = new(image.RGBA) } draw.On(ren.img_, d, "auto", "auto", arrowSize) } var compstr = map[string]int{"x": 0, "y": 1, "z": 2} func normalize(f *data.Slice) { a := f.Vectors() maxnorm := 0. for i := range a[0] { for j := range a[0][i] { for k := range a[0][i][j] { x, y, z := a[0][i][j][k], a[1][i][j][k], a[2][i][j][k] norm := math.Sqrt(float64(x*x + y*y + z*z)) if norm > maxnorm { maxnorm = norm } } } } factor := float32(1 / maxnorm) for i := range a[0] { for j := range a[0][i] { for k := range a[0][i][j] { a[0][i][j][k] *= factor a[1][i][j][k] *= factor a[2][i][j][k] *= factor } } } } mumax3-3.10/engine/rk23.go000066400000000000000000000050631371432437400152400ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/util" "math" ) // Bogacki-Shampine solver. 3rd order, 3 evaluations per step, adaptive step. // http://en.wikipedia.org/wiki/Bogacki-Shampine_method // // k1 = f(tn, yn) // k2 = f(tn + 1/2 h, yn + 1/2 h k1) // k3 = f(tn + 3/4 h, yn + 3/4 h k2) // y{n+1} = yn + 2/9 h k1 + 1/3 h k2 + 4/9 h k3 // 3rd order // k4 = f(tn + h, y{n+1}) // z{n+1} = yn + 7/24 h k1 + 1/4 h k2 + 1/3 h k3 + 1/8 h k4 // 2nd order type RK23 struct { k1 *data.Slice // torque at end of step is kept for beginning of next step } func (rk *RK23) Step() { m := M.Buffer() size := m.Size() if FixDt != 0 { Dt_si = FixDt } // upon resize: remove wrongly sized k1 if rk.k1.Size() != m.Size() { rk.Free() } // first step ever: one-time k1 init and eval if rk.k1 == nil { rk.k1 = cuda.NewSlice(3, size) torqueFn(rk.k1) } // FSAL cannot be used with temperature if !Temp.isZero() { torqueFn(rk.k1) } t0 := Time // backup magnetization m0 := cuda.Buffer(3, size) defer cuda.Recycle(m0) data.Copy(m0, m) k2, k3, k4 := cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size) defer cuda.Recycle(k2) defer cuda.Recycle(k3) defer cuda.Recycle(k4) h := float32(Dt_si * GammaLL) // internal time 
// Free releases the torque buffer k1 kept from the previous step and resets
// it to nil, so that the next call to Step allocates and evaluates it afresh.
func (rk *RK23) Free() {
	rk.k1.Free()
	rk.k1 = nil
}
// Step takes one classical 4th-order Runge-Kutta step using the solver
// globals (Time, Dt_si, FixDt, MaxErr, MinDt).
//
// The four torques k1..k4 are evaluated in sequence, each after moving the
// magnetization m to the corresponding intermediate state. The error estimate
// is the maximum difference between k1 and k4, scaled by the internal time
// step. If the step is accepted, the 4th-order solution is written to m and
// NSteps is incremented; otherwise m and Time are restored, NUndone is
// incremented and the time step is reduced for a retry.
func (rk *RK4) Step() {
	m := M.Buffer()
	size := m.Size()

	if FixDt != 0 {
		Dt_si = FixDt
	}

	t0 := Time
	// backup magnetization
	m0 := cuda.Buffer(3, size)
	defer cuda.Recycle(m0)
	data.Copy(m0, m)

	// scratch buffers for the four torque evaluations, recycled on return
	k1, k2, k3, k4 := cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size)

	defer cuda.Recycle(k1)
	defer cuda.Recycle(k2)
	defer cuda.Recycle(k3)
	defer cuda.Recycle(k4)

	h := float32(Dt_si * GammaLL) // internal time step = Dt * gammaLL

	// stage 1
	torqueFn(k1)

	// stage 2
	Time = t0 + (1./2.)*Dt_si
	cuda.Madd2(m, m, k1, 1, (1./2.)*h) // m = m*1 + k1*h/2
	M.normalize()
	torqueFn(k2)

	// stage 3 (Time stays at t0 + Dt_si/2, as set in stage 2)
	cuda.Madd2(m, m0, k2, 1, (1./2.)*h) // m = m0*1 + k2*h/2
	M.normalize()
	torqueFn(k3)

	// stage 4
	Time = t0 + Dt_si
	cuda.Madd2(m, m0, k3, 1, 1.*h) // m = m0*1 + k3*h
	M.normalize()
	torqueFn(k4)

	// error estimate: largest difference between first and last torque
	err := cuda.MaxVecDiff(k1, k4) * float64(h)

	// adjust next time step
	if err < MaxErr || Dt_si <= MinDt || FixDt != 0 { // mindt check to avoid infinite loop
		// step OK
		// 4th order solution
		cuda.Madd5(m, m0, k1, k2, k3, k4, 1, (1./6.)*h, (1./3.)*h, (1./3.)*h, (1./6.)*h)
		M.normalize()
		NSteps++
		adaptDt(math.Pow(MaxErr/err, 1./4.))
		setLastErr(err)
		setMaxTorque(k4)
	} else {
		// undo bad step
		util.Assert(FixDt == 0)
		Time = t0
		data.Copy(m, m0)
		NUndone++
		adaptDt(math.Pow(MaxErr/err, 1./5.))
	}
}
!Temp.isZero() { torqueFn(rk.k1) } t0 := Time // backup magnetization m0 := cuda.Buffer(3, size) defer cuda.Recycle(m0) data.Copy(m0, m) k2, k3, k4, k5, k6 := cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size) defer cuda.Recycle(k2) defer cuda.Recycle(k3) defer cuda.Recycle(k4) defer cuda.Recycle(k5) defer cuda.Recycle(k6) // k2 will be re-used as k7 h := float32(Dt_si * GammaLL) // internal time step = Dt * gammaLL // there is no explicit stage 1: k1 from previous step // stage 2 Time = t0 + (1./5.)*Dt_si cuda.Madd2(m, m, rk.k1, 1, (1./5.)*h) // m = m*1 + k1*h/5 M.normalize() torqueFn(k2) // stage 3 Time = t0 + (3./10.)*Dt_si cuda.Madd3(m, m0, rk.k1, k2, 1, (3./40.)*h, (9./40.)*h) M.normalize() torqueFn(k3) // stage 4 Time = t0 + (4./5.)*Dt_si cuda.Madd4(m, m0, rk.k1, k2, k3, 1, (44./45.)*h, (-56./15.)*h, (32./9.)*h) M.normalize() torqueFn(k4) // stage 5 Time = t0 + (8./9.)*Dt_si cuda.Madd5(m, m0, rk.k1, k2, k3, k4, 1, (19372./6561.)*h, (-25360./2187.)*h, (64448./6561.)*h, (-212./729.)*h) M.normalize() torqueFn(k5) // stage 6 Time = t0 + (1.)*Dt_si cuda.Madd6(m, m0, rk.k1, k2, k3, k4, k5, 1, (9017./3168.)*h, (-355./33.)*h, (46732./5247.)*h, (49./176.)*h, (-5103./18656.)*h) M.normalize() torqueFn(k6) // stage 7: 5th order solution Time = t0 + (1.)*Dt_si // no k2 cuda.Madd6(m, m0, rk.k1, k3, k4, k5, k6, 1, (35./384.)*h, (500./1113.)*h, (125./192.)*h, (-2187./6784.)*h, (11./84.)*h) // 5th M.normalize() k7 := k2 // re-use k2 torqueFn(k7) // next torque if OK // error estimate Err := cuda.Buffer(3, size) //k3 // re-use k3 as error estimate defer cuda.Recycle(Err) cuda.Madd6(Err, rk.k1, k3, k4, k5, k6, k7, (35./384.)-(5179./57600.), (500./1113.)-(7571./16695.), (125./192.)-(393./640.), (-2187./6784.)-(-92097./339200.), (11./84.)-(187./2100.), (0.)-(1./40.)) // determine error err := cuda.MaxVecNorm(Err) * float64(h) // adjust next time step if err < MaxErr || Dt_si <= MinDt || FixDt != 0 { // mindt check to avoid 
// Step takes one adaptive step with the embedded 5th/6th-order Runge-Kutta
// scheme (the solver selected by the FEHLBERG constant in SetSolver), using
// the solver globals (Time, Dt_si, FixDt, MaxErr, MinDt).
//
// Eight torques k1..k8 are evaluated at intermediate states of m; the final
// solution is then assembled and its torque stored in k2 (re-used as a ninth
// buffer). The error estimate is a linear combination of k1, k6, k7 and k8.
// If the step is accepted, NSteps and Time advance; otherwise m and Time are
// restored, NUndone is incremented and the time step is reduced for a retry.
func (rk *RK56) Step() {

	m := M.Buffer()
	size := m.Size()

	if FixDt != 0 {
		Dt_si = FixDt
	}

	t0 := Time
	// backup magnetization
	m0 := cuda.Buffer(3, size)
	defer cuda.Recycle(m0)
	data.Copy(m0, m)

	// scratch buffers for the torque evaluations, recycled on return
	k1, k2, k3, k4, k5, k6, k7, k8 := cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size), cuda.Buffer(3, size)
	defer cuda.Recycle(k1)
	defer cuda.Recycle(k2)
	defer cuda.Recycle(k3)
	defer cuda.Recycle(k4)
	defer cuda.Recycle(k5)
	defer cuda.Recycle(k6)
	defer cuda.Recycle(k7)
	defer cuda.Recycle(k8)
	// k2 will be recycled as k9

	h := float32(Dt_si * GammaLL) // internal time step = Dt * gammaLL

	// stage 1
	torqueFn(k1)

	// stage 2
	Time = t0 + (1./6.)*Dt_si
	cuda.Madd2(m, m, k1, 1, (1./6.)*h) // m = m*1 + k1*h/6
	M.normalize()
	torqueFn(k2)

	// stage 3
	Time = t0 + (4./15.)*Dt_si
	cuda.Madd3(m, m0, k1, k2, 1, (4./75.)*h, (16./75.)*h)
	M.normalize()
	torqueFn(k3)

	// stage 4
	Time = t0 + (2./3.)*Dt_si
	cuda.Madd4(m, m0, k1, k2, k3, 1, (5./6.)*h, (-8./3.)*h, (5./2.)*h)
	M.normalize()
	torqueFn(k4)

	// stage 5
	Time = t0 + (4./5.)*Dt_si
	cuda.Madd5(m, m0, k1, k2, k3, k4, 1, (-8./5.)*h, (144./25.)*h, (-4.)*h, (16./25.)*h)
	M.normalize()
	torqueFn(k5)

	// stage 6
	Time = t0 + (1.)*Dt_si
	cuda.Madd6(m, m0, k1, k2, k3, k4, k5, 1, (361./320.)*h, (-18./5.)*h, (407./128.)*h, (-11./80.)*h, (55./128.)*h)
	M.normalize()
	torqueFn(k6)

	// stage 7 (evaluated at the start time t0)
	Time = t0
	cuda.Madd5(m, m0, k1, k3, k4, k5, 1, (-11./640.)*h, (11./256.)*h, (-11/160.)*h, (11./256.)*h)
	M.normalize()
	torqueFn(k7)

	// stage 8
	Time = t0 + (1.)*Dt_si
	cuda.Madd7(m, m0, k1, k2, k3, k4, k5, k7, 1, (93./640.)*h, (-18./5.)*h, (803./256.)*h, (-11./160.)*h, (99./256.)*h, (1.)*h)
	M.normalize()
	torqueFn(k8)

	// stage 9: 6th order solution
	Time = t0 + (1.)*Dt_si
	//madd6(m, m0, k1, k3, k4, k5, k6, 1, (31./384.)*h, (1125./2816.)*h, (9./32.)*h, (125./768.)*h, (5./66.)*h)
	cuda.Madd7(m, m0, k1, k3, k4, k5, k7, k8, 1, (7./1408.)*h, (1125./2816.)*h, (9./32.)*h, (125./768.)*h, (5./66.)*h, (5./66.)*h)
	M.normalize()
	torqueFn(k2) // re-use k2

	// error estimate
	Err := cuda.Buffer(3, size)
	defer cuda.Recycle(Err)
	cuda.Madd4(Err, k1, k6, k7, k8, (-5. / 66.), (-5. / 66.), (5. / 66.), (5. / 66.))

	// determine error
	err := cuda.MaxVecNorm(Err) * float64(h)

	// adjust next time step
	if err < MaxErr || Dt_si <= MinDt || FixDt != 0 { // mindt check to avoid infinite loop
		// step OK
		setLastErr(err)
		setMaxTorque(k2) // k2 holds the torque of the accepted solution
		NSteps++
		Time = t0 + Dt_si
		adaptDt(math.Pow(MaxErr/err, 1./6.))
	} else {
		// undo bad step
		//util.Println("Bad step at t=", t0, ", err=", err)
		util.Assert(FixDt == 0)
		Time = t0
		data.Copy(m, m0)
		NUndone++
		adaptDt(math.Pow(MaxErr/err, 1./7.))
	}
}
Dt_si float64 = 1e-15 // time step = dt_si (seconds) *dt_mul, which should be nice float32 MinDt, MaxDt float64 // minimum and maximum time step MaxErr float64 = 1e-5 // maximum error/step Headroom float64 = 0.8 // solver headroom, (Gustafsson, 1992, Control of Error and Convergence in ODE Solvers) LastErr, PeakErr float64 // error of last step, highest error ever LastTorque float64 // maxTorque of last time step NSteps, NUndone, NEvals int // number of good steps, undone steps FixDt float64 // fixed time step? stepper Stepper // generic step, can be EulerStep, HeunStep, etc solvertype int ) func init() { DeclFunc("Run", Run, "Run the simulation for a time in seconds") DeclFunc("Steps", Steps, "Run the simulation for a number of time steps") DeclFunc("RunWhile", RunWhile, "Run while condition function is true") DeclFunc("SetSolver", SetSolver, "Set solver type. 1:Euler, 2:Heun, 3:Bogaki-Shampine, 4: Runge-Kutta (RK45), 5: Dormand-Prince, 6: Fehlberg, -1: Backward Euler") DeclTVar("t", &Time, "Total simulated time (s)") DeclVar("step", &NSteps, "Total number of time steps taken") DeclVar("MinDt", &MinDt, "Minimum time step the solver can take (s)") DeclVar("MaxDt", &MaxDt, "Maximum time step the solver can take (s)") DeclVar("MaxErr", &MaxErr, "Maximum error per step the solver can tolerate (default = 1e-5)") DeclVar("Headroom", &Headroom, "Solver headroom (default = 0.8)") DeclVar("FixDt", &FixDt, "Set a fixed time step, 0 disables fixed step (which is the default)") DeclFunc("Exit", Exit, "Exit from the program") SetSolver(DORMANDPRINCE) _ = NewScalarValue("dt", "s", "Time Step", func() float64 { return Dt_si }) _ = NewScalarValue("LastErr", "", "Error of last step", func() float64 { return LastErr }) _ = NewScalarValue("PeakErr", "", "Overall maxium error per step", func() float64 { return PeakErr }) _ = NewScalarValue("NEval", "", "Total number of torque evaluations", func() float64 { return float64(NEvals) }) } // Time stepper like Euler, Heun, RK23 type 
Stepper interface { Step() // take time step using solver globals Free() // free resources, if any (e.g.: RK23 previous torque) } // Arguments for SetSolver const ( BACKWARD_EULER = -1 EULER = 1 HEUN = 2 BOGAKISHAMPINE = 3 RUNGEKUTTA = 4 DORMANDPRINCE = 5 FEHLBERG = 6 ) func SetSolver(typ int) { // free previous solver, if any if stepper != nil { stepper.Free() } switch typ { default: util.Fatalf("SetSolver: unknown solver type: %v", typ) case BACKWARD_EULER: stepper = new(BackwardEuler) case EULER: stepper = new(Euler) case HEUN: stepper = new(Heun) case BOGAKISHAMPINE: stepper = new(RK23) case RUNGEKUTTA: stepper = new(RK4) case DORMANDPRINCE: stepper = new(RK45DP) case FEHLBERG: stepper = new(RK56) } solvertype = typ } // write torque to dst and increment NEvals func torqueFn(dst *data.Slice) { SetTorque(dst) NEvals++ } // returns number of torque evaluations func getNEval() int { return NEvals } // update lastErr and peakErr func setLastErr(err float64) { LastErr = err if err > PeakErr { PeakErr = err } } func setMaxTorque(τ *data.Slice) { LastTorque = cuda.MaxVecNorm(τ) } // adapt time step: dt *= corr, but limited to sensible values. func adaptDt(corr float64) { if FixDt != 0 { Dt_si = FixDt return } // corner case triggered by err = 0: just keep time step. // see test/regression017.mx3 if math.IsNaN(corr) { corr = 1 } util.AssertMsg(corr != 0, "Time step too small, check if parameters are sensible") corr *= Headroom if corr > 2 { corr = 2 } if corr < 1./2. { corr = 1. / 2. } Dt_si *= corr if MinDt != 0 && Dt_si < MinDt { Dt_si = MinDt } if MaxDt != 0 && Dt_si > MaxDt { Dt_si = MaxDt } if Dt_si == 0 { util.Fatal("time step too small") } // do not cross alarm time if Time < alarm && Time+Dt_si > alarm { Dt_si = alarm - Time } util.AssertMsg(Dt_si > 0, fmt.Sprint("Time step too small: ", Dt_si)) } // Run the simulation for a number of seconds. 
func Run(seconds float64) { stop := Time + seconds alarm = stop // don't have dt adapt to go over alarm RunWhile(func() bool { return Time < stop }) } // Run the simulation for a number of steps. func Steps(n int) { stop := NSteps + n RunWhile(func() bool { return NSteps < stop }) } // Runs as long as condition returns true, saves output. func RunWhile(condition func() bool) { SanityCheck() pause = false // may be set by <-Inject const output = true stepper.Free() // start from a clean state runWhile(condition, output) pause = true } func runWhile(condition func() bool, output bool) { DoOutput() // allow t=0 output for condition() && !pause { select { default: step(output) // accept tasks form Inject channel case f := <-Inject: f() } } } // Runs as long as browser is connected to gui. func RunInteractive() { gui_.RunInteractive() } // take one time step func step(output bool) { stepper.Step() for _, f := range postStep { f() } if output { DoOutput() } } // Register function f to be called after every time step. // Typically used, e.g., to manipulate the magnetization. func PostStep(f func()) { postStep = append(postStep, f) } // inject code into engine and wait for it to complete. 
func InjectAndWait(task func()) { ready := make(chan int) Inject <- func() { task(); ready <- 1 } <-ready } func SanityCheck() { if Msat.isZero() { util.Log("Note: Msat = 0") } if Aex.isZero() { util.Log("Note: Aex = 0") } } func Exit() { Close() os.Exit(0) } mumax3-3.10/engine/save.go000066400000000000000000000106701371432437400154150ustar00rootroot00000000000000package engine import ( "fmt" "path" "reflect" "strings" "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/draw" "github.com/mumax/3/dump" "github.com/mumax/3/httpfs" "github.com/mumax/3/oommf" "github.com/mumax/3/util" ) func init() { DeclFunc("Save", Save, "Save space-dependent quantity once, with auto filename") DeclFunc("SaveAs", SaveAs, "Save space-dependent quantity with custom filename") DeclLValue("FilenameFormat", &fformat{}, "printf formatting string for output filenames.") DeclLValue("OutputFormat", &oformat{}, "Format for data files: OVF1_TEXT, OVF1_BINARY, OVF2_TEXT or OVF2_BINARY") DeclROnly("OVF1_BINARY", OVF1_BINARY, "OutputFormat = OVF1_BINARY sets binary OVF1 output") DeclROnly("OVF2_BINARY", OVF2_BINARY, "OutputFormat = OVF2_BINARY sets binary OVF2 output") DeclROnly("OVF1_TEXT", OVF1_TEXT, "OutputFormat = OVF1_TEXT sets text OVF1 output") DeclROnly("OVF2_TEXT", OVF2_TEXT, "OutputFormat = OVF2_TEXT sets text OVF2 output") DeclROnly("DUMP", DUMP, "OutputFormat = DUMP sets text DUMP output") DeclFunc("Snapshot", Snapshot, "Save image of quantity") DeclFunc("SnapshotAs", SnapshotAs, "Save image of quantity with custom filename") DeclVar("SnapshotFormat", &SnapshotFormat, "Image format for snapshots: jpg, png or gif.") } var ( FilenameFormat = "%s%06d" // formatting string for auto filenames. 
SnapshotFormat = "jpg" // user-settable snapshot format outputFormat = OVF2_BINARY // user-settable output format ) type fformat struct{} func (*fformat) Eval() interface{} { return FilenameFormat } func (*fformat) SetValue(v interface{}) { drainOutput(); FilenameFormat = v.(string) } func (*fformat) Type() reflect.Type { return reflect.TypeOf("") } type oformat struct{} func (*oformat) Eval() interface{} { return outputFormat } func (*oformat) SetValue(v interface{}) { drainOutput(); outputFormat = v.(OutputFormat) } func (*oformat) Type() reflect.Type { return reflect.TypeOf(OutputFormat(OVF2_BINARY)) } // Save once, with auto file name func Save(q Quantity) { qname := NameOf(q) fname := autoFname(NameOf(q), outputFormat, autonum[qname]) SaveAs(q, fname) autonum[qname]++ } // Save under given file name (transparent async I/O). func SaveAs(q Quantity, fname string) { if !strings.HasPrefix(fname, OD()) { fname = OD() + fname // don't clean, turns http:// in http:/ } if path.Ext(fname) == "" { fname += ("." + StringFromOutputFormat[outputFormat]) } buffer := ValueOf(q) // TODO: check and optimize for Buffer() defer cuda.Recycle(buffer) info := data.Meta{Time: Time, Name: NameOf(q), Unit: UnitOf(q), CellSize: MeshOf(q).CellSize()} data := buffer.HostCopy() // must be copy (async io) queOutput(func() { saveAs_sync(fname, data, info, outputFormat) }) } // Save image once, with auto file name func Snapshot(q Quantity) { qname := NameOf(q) fname := fmt.Sprintf(OD()+FilenameFormat+"."+SnapshotFormat, qname, autonum[qname]) s := ValueOf(q) defer cuda.Recycle(s) data := s.HostCopy() // must be copy (asyncio) queOutput(func() { snapshot_sync(fname, data) }) autonum[qname]++ } func SnapshotAs(q Quantity, fname string) { if !strings.HasPrefix(fname, OD()) { fname = OD() + fname // don't clean, turns http:// in http:/ } if path.Ext(fname) == "" { fname += ("." 
+ StringFromOutputFormat[outputFormat]) } s := ValueOf(q) defer cuda.Recycle(s) data := s.HostCopy() // must be copy (asyncio) queOutput(func() { snapshot_sync(fname, data) }) } // synchronous snapshot func snapshot_sync(fname string, output *data.Slice) { f, err := httpfs.Create(fname) util.FatalErr(err) defer f.Close() draw.RenderFormat(f, output, "auto", "auto", arrowSize, path.Ext(fname)) } // synchronous save func saveAs_sync(fname string, s *data.Slice, info data.Meta, format OutputFormat) { f, err := httpfs.Create(fname) util.FatalErr(err) defer f.Close() switch format { case OVF1_TEXT: oommf.WriteOVF1(f, s, info, "text") case OVF1_BINARY: oommf.WriteOVF1(f, s, info, "binary 4") case OVF2_TEXT: oommf.WriteOVF2(f, s, info, "text") case OVF2_BINARY: oommf.WriteOVF2(f, s, info, "binary 4") case DUMP: dump.Write(f, s, info) default: panic("invalid output format") } } type OutputFormat int const ( OVF1_TEXT OutputFormat = iota + 1 OVF1_BINARY OVF2_TEXT OVF2_BINARY DUMP ) var ( StringFromOutputFormat = map[OutputFormat]string{ OVF1_TEXT: "ovf", OVF1_BINARY: "ovf", OVF2_TEXT: "ovf", OVF2_BINARY: "ovf", DUMP: "dump"} ) mumax3-3.10/engine/scalar_excitation.go000066400000000000000000000077151371432437400201610ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/script" "github.com/mumax/3/util" "reflect" ) // An excitation, typically field or current, // can be defined region-wise plus extra mask*multiplier terms. 
type ScalarExcitation struct { name string perRegion RegionwiseScalar // Region-based excitation extraTerms []mulmask // add extra mask*multiplier terms } func NewScalarExcitation(name, unit, desc string) *ScalarExcitation { e := new(ScalarExcitation) e.name = name e.perRegion.init("_"+name+"_perRegion", unit, desc, nil) // name starts with underscore: unexported DeclLValue(name, e, cat(desc, unit)) return e } func (p *ScalarExcitation) MSlice() cuda.MSlice { buf, r := p.Slice() util.Assert(r == true) return cuda.ToMSlice(buf) } func (e *ScalarExcitation) AddTo(dst *data.Slice) { if !e.perRegion.isZero() { cuda.RegionAddS(dst, e.perRegion.gpuLUT1(), regions.Gpu()) } for _, t := range e.extraTerms { var mul float32 = 1 if t.mul != nil { mul = float32(t.mul()) } cuda.Madd2(dst, dst, t.mask, 1, mul) } } func (e *ScalarExcitation) isZero() bool { return e.perRegion.isZero() && len(e.extraTerms) == 0 } func (e *ScalarExcitation) Slice() (*data.Slice, bool) { buf := cuda.Buffer(e.NComp(), e.Mesh().Size()) cuda.Zero(buf) e.AddTo(buf) return buf, true } // After resizing the mesh, the extra terms don't fit the grid anymore // and there is no reasonable way to resize them. So remove them and have // the user re-add them. func (e *ScalarExcitation) RemoveExtraTerms() { if len(e.extraTerms) == 0 { return } LogOut("REMOVING EXTRA TERMS FROM", e.Name()) for _, m := range e.extraTerms { m.mask.Free() } e.extraTerms = nil } // Add an extra mask*multiplier term to the excitation. 
func (e *ScalarExcitation) Add(mask *data.Slice, f script.ScalarFunction) { var mul func() float64 if f != nil { if IsConst(f) { val := f.Float() mul = func() float64 { return val } } else { mul = func() float64 { return f.Float() } } } e.AddGo(mask, mul) } // An Add(mask, f) equivalent for Go use func (e *ScalarExcitation) AddGo(mask *data.Slice, mul func() float64) { if mask != nil { checkNaN(mask, e.Name()+".add()") // TODO: in more places mask = data.Resample(mask, e.Mesh().Size()) mask = assureGPU(mask) } e.extraTerms = append(e.extraTerms, mulmask{mul, mask}) } func (e *ScalarExcitation) SetRegion(region int, f script.ScalarFunction) { e.perRegion.SetRegion(region, f) } func (e *ScalarExcitation) SetValue(v interface{}) { e.perRegion.SetValue(v) } func (e *ScalarExcitation) Set(v float64) { e.perRegion.setRegions(0, NREGION, []float64{v}) } func (e *ScalarExcitation) getRegion(region int) []float64 { return e.perRegion.getRegion(region) } // for gui func (e *ScalarExcitation) SetRegionFn(region int, f func() [3]float64) { e.perRegion.setFunc(region, region+1, func() []float64 { return slice(f()) }) } func (e *ScalarExcitation) average() float64 { return qAverageUniverse(e)[0] } func (e *ScalarExcitation) Average() float64 { return e.average() } func (e *ScalarExcitation) IsUniform() bool { return e.perRegion.IsUniform() } func (e *ScalarExcitation) Name() string { return e.name } func (e *ScalarExcitation) Unit() string { return e.perRegion.Unit() } func (e *ScalarExcitation) NComp() int { return e.perRegion.NComp() } func (e *ScalarExcitation) Mesh() *data.Mesh { return Mesh() } func (e *ScalarExcitation) Region(r int) *vOneReg { return vOneRegion(e, r) } func (e *ScalarExcitation) Comp(c int) ScalarField { return Comp(e, c) } func (e *ScalarExcitation) Eval() interface{} { return e } func (e *ScalarExcitation) Type() reflect.Type { return reflect.TypeOf(new(ScalarExcitation)) } func (e *ScalarExcitation) InputType() reflect.Type { return 
script.ScalarFunction_t } func (e *ScalarExcitation) EvalTo(dst *data.Slice) { EvalTo(e, dst) } mumax3-3.10/engine/script.go000066400000000000000000000055771371432437400157750ustar00rootroot00000000000000package engine // declare functionality for interpreted input scripts import ( "github.com/mumax/3/httpfs" "github.com/mumax/3/script" "reflect" ) func CompileFile(fname string) (*script.BlockStmt, error) { bytes, err := httpfs.Read(fname) if err != nil { return nil, err } return World.Compile(string(bytes)) } func Eval(code string) { tree, err := World.Compile(code) if err != nil { LogIn(code) LogErr(err.Error()) return } LogIn(rmln(tree.Format())) tree.Eval() } func Eval1Line(code string) interface{} { tree, err := World.Compile(code) if err != nil { LogErr(err.Error()) return nil } if len(tree.Children) != 1 { LogErr("expected single statement:" + code) return nil } return tree.Children[0].Eval() } // holds the script state (variables etc) var World = script.NewWorld() // Add a function to the script world func DeclFunc(name string, f interface{}, doc string) { World.Func(name, f, doc) } // Add a constant to the script world func DeclConst(name string, value float64, doc string) { World.Const(name, value, doc) } // Add a read-only variable to the script world. // It can be changed, but not by the user. func DeclROnly(name string, value interface{}, doc string) { World.ROnly(name, value, doc) GUIAdd(name, value) } func Export(q interface { Name() string Unit() string }, doc string) { DeclROnly(q.Name(), q, cat(doc, q.Unit())) } // Add a (pointer to) variable to the script world func DeclVar(name string, value interface{}, doc string) { World.Var(name, value, doc) GUIAdd(name, value) } // Hack for fixing the closure caveat: // Defines "t", the time variable, handled specially by Fix() func DeclTVar(name string, value interface{}, doc string) { World.TVar(name, value, doc) GUIAdd(name, value) } // Add an LValue to the script world. 
// Assign to LValue invokes SetValue() func DeclLValue(name string, value LValue, doc string) { World.LValue(name, newLValueWrapper(value), doc) GUIAdd(name, value) } // LValue is settable type LValue interface { SetValue(interface{}) // assigns a new value Eval() interface{} // evaluate and return result (nil for void) Type() reflect.Type // type that can be assigned and will be returned by Eval } // evaluate code, exit on error (behavior for input files) func EvalFile(code *script.BlockStmt) { for i := range code.Children { formatted := rmln(script.Format(code.Node[i])) LogIn(formatted) code.Children[i].Eval() } } // wraps LValue and provides empty Child() type lValueWrapper struct { LValue } func newLValueWrapper(lv LValue) script.LValue { return &lValueWrapper{lv} } func (w *lValueWrapper) Child() []script.Expr { return nil } func (w *lValueWrapper) Fix() script.Expr { return script.NewConst(w) } func (w *lValueWrapper) InputType() reflect.Type { if i, ok := w.LValue.(interface { InputType() reflect.Type }); ok { return i.InputType() } else { return w.Type() } } mumax3-3.10/engine/shape.go000066400000000000000000000176441371432437400155670ustar00rootroot00000000000000package engine import ( "image" _ "image/jpeg" _ "image/png" "math" "github.com/mumax/3/httpfs" "github.com/mumax/3/util" ) func init() { DeclFunc("Ellipsoid", Ellipsoid, "3D Ellipsoid with axes in meter") DeclFunc("Ellipse", Ellipse, "2D Ellipse with axes in meter") DeclFunc("Cone", Cone, "3D Cone with diameter and height in meter. 
The top of the cone points in the +z direction.") DeclFunc("Cylinder", Cylinder, "3D Cylinder with diameter and height in meter") DeclFunc("Circle", Circle, "2D Circle with diameter in meter") DeclFunc("Cuboid", Cuboid, "Cuboid with sides in meter") DeclFunc("Rect", Rect, "2D rectangle with size in meter") DeclFunc("Square", Square, "2D square with size in meter") DeclFunc("XRange", XRange, "Part of space between x1 (inclusive) and x2 (exclusive), in meter") DeclFunc("YRange", YRange, "Part of space between y1 (inclusive) and y2 (exclusive), in meter") DeclFunc("ZRange", ZRange, "Part of space between z1 (inclusive) and z2 (exclusive), in meter") DeclFunc("Layers", Layers, "Part of space between cell layer1 (inclusive) and layer2 (exclusive), in integer indices") DeclFunc("Layer", Layer, "Single layer (along z), by integer index starting from 0") DeclFunc("Universe", Universe, "Entire space") DeclFunc("Cell", Cell, "Single cell with given integer index (i, j, k)") DeclFunc("ImageShape", ImageShape, "Use black/white image as shape") DeclFunc("GrainRoughness", GrainRoughness, "Grainy surface with different heights per grain "+ "with a typical grain size (first argument), minimal height (second argument), and maximal "+ "height (third argument). The last argument is a seed for the random number generator.") } // geometrical shape for setting sample geometry type Shape func(x, y, z float64) bool // Ellipsoid with given diameters func Ellipsoid(diamx, diamy, diamz float64) Shape { return func(x, y, z float64) bool { return sqr64(x/diamx)+sqr64(y/diamy)+sqr64(z/diamz) <= 0.25 } } func Ellipse(diamx, diamy float64) Shape { return Ellipsoid(diamx, diamy, math.Inf(1)) } // 3D Cone with the vertex down. func Cone(diam, height float64) Shape { return func(x, y, z float64) bool { return z >= 0 && sqr64(x/diam)+sqr64(y/diam)+0.25*sqr64(z/height) <= 0.25 } } func Circle(diam float64) Shape { return Cylinder(diam, math.Inf(1)) } // cylinder along z. 
func Cylinder(diam, height float64) Shape {
	// Centered on the origin: |z| <= height/2 and inside the circle of
	// diameter diam in the xy-plane.
	return func(x, y, z float64) bool {
		withinHeight := z >= -height/2 && z <= height/2
		return withinHeight && sqr64(x/diam)+sqr64(y/diam) <= 0.25
	}
}

// 3D Rectangular slab with given sides.
// Centered on the origin; the faces themselves are excluded.
func Cuboid(sidex, sidey, sidez float64) Shape {
	return func(x, y, z float64) bool {
		hx, hy, hz := sidex/2, sidey/2, sidez/2
		return -hx < x && x < hx &&
			-hy < y && y < hy &&
			-hz < z && z < hz
	}
}

// 2D Rectangle with given sides.
// Centered on the origin, unbounded along z; the edges are excluded.
func Rect(sidex, sidey float64) Shape {
	return func(x, y, z float64) bool {
		hx, hy := sidex/2, sidey/2
		return -hx < x && x < hx &&
			-hy < y && y < hy
	}
}

// 2D square with given side.
func Square(side float64) Shape {
	return Rect(side, side)
}

// All cells with x-coordinate between a and b
// (a inclusive, b exclusive).
func XRange(a, b float64) Shape {
	return func(x, y, z float64) bool {
		return a <= x && x < b
	}
}

// All cells with y-coordinate between a and b
// (a inclusive, b exclusive).
func YRange(a, b float64) Shape {
	return func(x, y, z float64) bool {
		return a <= y && y < b
	}
}

// All cells with z-coordinate between a and b
// (a inclusive, b exclusive).
func ZRange(a, b float64) Shape {
	return func(x, y, z float64) bool {
		return a <= z && z < b
	}
}

// Cell layers #a (inclusive) up to #b (exclusive).
// Exits via util.Fatal when a is outside 0..Nz, or b is negative
// or smaller than a.
func Layers(a, b int) Shape {
	nz := Mesh().Size()[Z]
	if a < 0 || a > nz || b < 0 || b < a {
		util.Fatal("layers ", a, ":", b, " out of bounds (0 - ", nz, ")")
	}

	// Index2Coord gives cell centers; shift by half a cell to get the
	// lower face of each layer.
	dz := Mesh().CellSize()[Z]
	bottom := Index2Coord(0, 0, a)[Z] - dz/2
	top := Index2Coord(0, 0, b)[Z] - dz/2
	return ZRange(bottom, top)
}

// Layer is the single cell layer with the given index.
func Layer(index int) Shape {
	return Layers(index, index+1)
}

// Single cell with given index
// (the open box of one cell size around the cell center).
func Cell(ix, iy, iz int) Shape {
	size := Mesh().CellSize()
	center := Index2Coord(ix, iy, iz)

	xlo, xhi := center[X]-size[X]/2, center[X]+size[X]/2
	ylo, yhi := center[Y]-size[Y]/2, center[Y]+size[Y]/2
	zlo, zhi := center[Z]-size[Z]/2, center[Z]+size[Z]/2

	return func(x, y, z float64) bool {
		return xlo < x && x < xhi &&
			ylo < y && y < yhi &&
			zlo < z && z < zhi
	}
}

// Universe returns the shape that contains every point.
func Universe() Shape {
	return universe
}

// The entire space.
func universe(x, y, z float64) bool { return true } func ImageShape(fname string) Shape { r, err1 := httpfs.Open(fname) CheckRecoverable(err1) defer r.Close() img, _, err2 := image.Decode(r) CheckRecoverable(err2) width := img.Bounds().Max.X height := img.Bounds().Max.Y // decode image into bool matrix for fast pixel lookup inside := make([][]bool, height) for iy := range inside { inside[iy] = make([]bool, width) } for iy := 0; iy < height; iy++ { for ix := 0; ix < width; ix++ { r, g, b, a := img.At(ix, height-1-iy).RGBA() if a > 128 && r+g+b < (0xFFFF*3)/2 { inside[iy][ix] = true } } } // stretch the image onto the gridsize c := Mesh().CellSize() cx, cy := c[X], c[Y] N := Mesh().Size() nx, ny := float64(N[X]), float64(N[Y]) w, h := float64(width), float64(height) return func(x, y, z float64) bool { ix := int((w/nx)*(x/cx) + 0.5*w) iy := int((h/ny)*(y/cy) + 0.5*h) if ix < 0 || ix >= width || iy < 0 || iy >= height { return false } else { return inside[iy][ix] } } } func GrainRoughness(grainsize, zmin, zmax float64, seed int) Shape { t := newTesselation(grainsize, 256, int64(seed)) return func(x, y, z float64) bool { if z <= zmin { return true } if z >= zmax { return false } r := t.RegionOf(x, y, z) return (z-zmin)/(zmax-zmin) < (float64(r) / 256) } } // Transl returns a translated copy of the shape. func (s Shape) Transl(dx, dy, dz float64) Shape { return func(x, y, z float64) bool { return s(x-dx, y-dy, z-dz) } } // Infinitely repeats the shape with given period in x, y, z. // A period of 0 or infinity means no repetition. func (s Shape) Repeat(periodX, periodY, periodZ float64) Shape { return func(x, y, z float64) bool { return s(fmod(x, periodX), fmod(y, periodY), fmod(z, periodZ)) } } func fmod(a, b float64) float64 { if b == 0 || math.IsInf(b, 1) { return a } if math.Abs(a) > b/2 { return sign(a) * (math.Mod(math.Abs(a+b/2), b) - b/2) } else { return a } } // Scale returns a scaled copy of the shape. 
func (s Shape) Scale(sx, sy, sz float64) Shape {
	// A point belongs to the scaled shape when the un-scaled point
	// belongs to the original.
	return func(x, y, z float64) bool {
		return s(x/sx, y/sy, z/sz)
	}
}

// Rotates the shape around the Z-axis, over θ radians.
func (s Shape) RotZ(θ float64) Shape {
	sinθ, cosθ := math.Sin(θ), math.Cos(θ)
	return func(x, y, z float64) bool {
		xr := x*cosθ + y*sinθ
		yr := -x*sinθ + y*cosθ
		return s(xr, yr, z)
	}
}

// Rotates the shape around the Y-axis, over θ radians.
func (s Shape) RotY(θ float64) Shape {
	sinθ, cosθ := math.Sin(θ), math.Cos(θ)
	return func(x, y, z float64) bool {
		xr := x*cosθ - z*sinθ
		zr := x*sinθ + z*cosθ
		return s(xr, y, zr)
	}
}

// Rotates the shape around the X-axis, over θ radians.
func (s Shape) RotX(θ float64) Shape {
	sinθ, cosθ := math.Sin(θ), math.Cos(θ)
	return func(x, y, z float64) bool {
		yr := y*cosθ + z*sinθ
		zr := -y*sinθ + z*cosθ
		return s(x, yr, zr)
	}
}

// Union of shapes a and b (logical OR).
func (a Shape) Add(b Shape) Shape {
	return func(x, y, z float64) bool {
		if a(x, y, z) {
			return true
		}
		return b(x, y, z)
	}
}

// Intersection of shapes a and b (logical AND).
func (a Shape) Intersect(b Shape) Shape {
	return func(x, y, z float64) bool {
		if !a(x, y, z) {
			return false
		}
		return b(x, y, z)
	}
}

// Inverse (outside) of shape (logical NOT).
func (s Shape) Inverse() Shape { return func(x, y, z float64) bool { return !s(x, y, z) } } // Removes b from a (logical a AND NOT b) func (a Shape) Sub(b Shape) Shape { return func(x, y, z float64) bool { return a(x, y, z) && !b(x, y, z) } } // Logical XOR of shapes a and b func (a Shape) Xor(b Shape) Shape { return func(x, y, z float64) bool { A, B := a(x, y, z), b(x, y, z) return (A || B) && !(A && B) } } func sqr64(x float64) float64 { return x * x } mumax3-3.10/engine/shift.go000066400000000000000000000047521371432437400156000ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/data" ) var ( TotalShift, TotalYShift float64 // accumulated window shift (X and Y) in meter ShiftMagL, ShiftMagR, ShiftMagU, ShiftMagD data.Vector // when shifting m, put these value at the left/right edge. ShiftM, ShiftGeom, ShiftRegions bool = true, true, true // should shift act on magnetization, geometry, regions? ) func init() { DeclFunc("Shift", Shift, "Shifts the simulation by +1/-1 cells along X") DeclVar("ShiftMagL", &ShiftMagL, "Upon shift, insert this magnetization from the left") DeclVar("ShiftMagR", &ShiftMagR, "Upon shift, insert this magnetization from the right") DeclVar("ShiftMagU", &ShiftMagU, "Upon shift, insert this magnetization from the top") DeclVar("ShiftMagD", &ShiftMagD, "Upon shift, insert this magnetization from the bottom") DeclVar("ShiftM", &ShiftM, "Whether Shift() acts on magnetization") DeclVar("ShiftGeom", &ShiftGeom, "Whether Shift() acts on geometry") DeclVar("ShiftRegions", &ShiftRegions, "Whether Shift() acts on regions") DeclVar("TotalShift", &TotalShift, "Amount by which the simulation has been shifted (m).") } // position of the window lab frame func GetShiftPos() float64 { return -TotalShift } func GetShiftYPos() float64 { return -TotalYShift } // shift the simulation window over dx cells in X direction func Shift(dx int) { TotalShift += float64(dx) * Mesh().CellSize()[X] // needed to re-init geom, 
regions if ShiftM { shiftMag(M.Buffer(), dx) // TODO: M.shift? } if ShiftRegions { regions.shift(dx) } if ShiftGeom { geometry.shift(dx) } M.normalize() } func shiftMag(m *data.Slice, dx int) { m2 := cuda.Buffer(1, m.Size()) defer cuda.Recycle(m2) for c := 0; c < m.NComp(); c++ { comp := m.Comp(c) cuda.ShiftX(m2, comp, dx, float32(ShiftMagL[c]), float32(ShiftMagR[c])) data.Copy(comp, m2) // str0 ? } } // shift the simulation window over dy cells in Y direction func YShift(dy int) { TotalYShift += float64(dy) * Mesh().CellSize()[Y] // needed to re-init geom, regions if ShiftM { shiftMagY(M.Buffer(), dy) } if ShiftRegions { regions.shiftY(dy) } if ShiftGeom { geometry.shiftY(dy) } M.normalize() } func shiftMagY(m *data.Slice, dy int) { m2 := cuda.Buffer(1, m.Size()) defer cuda.Recycle(m2) for c := 0; c < m.NComp(); c++ { comp := m.Comp(c) cuda.ShiftY(m2, comp, dy, float32(ShiftMagU[c]), float32(ShiftMagD[c])) data.Copy(comp, m2) // str0 ? } } mumax3-3.10/engine/table.go000066400000000000000000000102061371432437400155410ustar00rootroot00000000000000package engine import ( "fmt" "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/httpfs" "github.com/mumax/3/script" "github.com/mumax/3/timer" "github.com/mumax/3/util" "io" "sync" "time" ) var Table = *newTable("table") // output handle for tabular data (average magnetization etc.) const TableAutoflushRate = 5 // auto-flush table every X seconds func init() { DeclFunc("TableAdd", TableAdd, "Add quantity as a column to the data table.") DeclFunc("TableAddVar", TableAddVariable, "Add user-defined variable + name + unit to data table.") DeclFunc("TableSave", TableSave, "Save the data table right now (appends one line).") DeclFunc("TableAutoSave", TableAutoSave, "Auto-save the data table every period (s). 
Zero disables save.") DeclFunc("TablePrint", TablePrint, "Print anyting in the data table") Table.Add(&M) } type DataTable struct { output interface { io.Writer Flush() error } info outputs []Quantity autosave flushlock sync.Mutex } func (t *DataTable) Write(p []byte) (int, error) { n, err := t.output.Write(p) util.FatalErr(err) return n, err } func (t *DataTable) Flush() error { if t.output == nil { return nil } if cuda.Synchronous { timer.Start("io") } err := t.output.Flush() if cuda.Synchronous { timer.Stop("io") } util.FatalErr(err) return err } func newTable(name string) *DataTable { t := new(DataTable) t.name = name return t } func TableAdd(col Quantity) { Table.Add(col) } func TableAddVariable(x script.ScalarFunction, name, unit string) { Table.AddVariable(x, name, unit) } func (t *DataTable) AddVariable(x script.ScalarFunction, name, unit string) { TableAdd(&userVar{x, name, unit}) } type userVar struct { value script.ScalarFunction name, unit string } func (x *userVar) Name() string { return x.name } func (x *userVar) NComp() int { return 1 } func (x *userVar) Unit() string { return x.unit } func (x *userVar) average() []float64 { return []float64{x.value.Float()} } func (x *userVar) EvalTo(dst *data.Slice) { avg := x.average() for c := 0; c < x.NComp(); c++ { cuda.Memset(dst.Comp(c), float32(avg[c])) } } func TableSave() { Table.Save() } func TableAutoSave(period float64) { Table.autosave = autosave{period, Time, -1, nil} // count -1 allows output on t=0 } func (t *DataTable) Add(output Quantity) { if t.inited() { util.Fatal("data table add ", NameOf(output), ": need to add quantity before table is output the first time") } t.outputs = append(t.outputs, output) } func (t *DataTable) Save() { t.flushlock.Lock() // flush during write gives errShortWrite defer t.flushlock.Unlock() if cuda.Synchronous { timer.Start("io") } t.init() fprint(t, Time) for _, o := range t.outputs { vec := AverageOf(o) for _, v := range vec { fprint(t, "\t", float32(v)) } } 
fprintln(t) //t.flush() t.count++ if cuda.Synchronous { timer.Stop("io") } } func (t *DataTable) Println(msg ...interface{}) { t.init() fprintln(t, msg...) } func TablePrint(msg ...interface{}) { Table.Println(msg...) } // open writer and write header func (t *DataTable) init() { if t.inited() { return } f, err := httpfs.Create(OD() + t.name + ".txt") util.FatalErr(err) t.output = f // write header fprint(t, "# t (s)") for _, o := range t.outputs { if o.NComp() == 1 { fprint(t, "\t", NameOf(o), " (", UnitOf(o), ")") } else { for c := 0; c < o.NComp(); c++ { fprint(t, "\t", NameOf(o)+string('x'+c), " (", UnitOf(o), ")") } } } fprintln(t) t.Flush() // periodically flush so GUI shows graph, // but don't flush after every output for performance // (httpfs flush is expensive) go func() { for { time.Sleep(TableAutoflushRate * time.Second) Table.flush() } }() } func (t *DataTable) inited() bool { return t.output != nil } func (t *DataTable) flush() { t.flushlock.Lock() defer t.flushlock.Unlock() t.Flush() } // Safe fmt.Fprint, will fail on error func fprint(out io.Writer, x ...interface{}) { _, err := fmt.Fprint(out, x...) util.FatalErr(err) } // Safe fmt.Fprintln, will fail on error func fprintln(out io.Writer, x ...interface{}) { _, err := fmt.Fprintln(out, x...) util.FatalErr(err) } mumax3-3.10/engine/temperature.go000066400000000000000000000073351371432437400170200ustar00rootroot00000000000000package engine import ( "github.com/mumax/3/cuda" "github.com/mumax/3/cuda/curand" "github.com/mumax/3/data" "github.com/mumax/3/mag" //"github.com/mumax/3/util" "math" ) var ( Temp = NewScalarParam("Temp", "K", "Temperature") E_therm = NewScalarValue("E_therm", "J", "Thermal energy", GetThermalEnergy) Edens_therm = NewScalarField("Edens_therm", "J/m3", "Thermal energy density", AddThermalEnergyDensity) B_therm thermField // Thermal effective field (T) ) var AddThermalEnergyDensity = makeEdensAdder(&B_therm, -1) // thermField calculates and caches thermal noise. 
type thermField struct { seed int64 // seed for generator generator curand.Generator // noise *data.Slice // noise buffer step int // solver step corresponding to noise dt float64 // solver timestep corresponding to noise } func init() { DeclFunc("ThermSeed", ThermSeed, "Set a random seed for thermal noise") registerEnergy(GetThermalEnergy, AddThermalEnergyDensity) B_therm.step = -1 // invalidate noise cache DeclROnly("B_therm", &B_therm, "Thermal field (T)") } func (b *thermField) AddTo(dst *data.Slice) { if !Temp.isZero() { b.update() cuda.Add(dst, dst, b.noise) } } func (b *thermField) update() { // we need to fix the time step here because solver will not yet have done it before the first step. // FixDt as an lvalue that sets Dt_si on change might be cleaner. if FixDt != 0 { Dt_si = FixDt } if b.generator == 0 { b.generator = curand.CreateGenerator(curand.PSEUDO_DEFAULT) b.generator.SetSeed(b.seed) } if b.noise == nil { b.noise = cuda.NewSlice(b.NComp(), b.Mesh().Size()) // when noise was (re-)allocated it's invalid for sure. B_therm.step = -1 B_therm.dt = -1 } if Temp.isZero() { cuda.Memset(b.noise, 0, 0, 0) b.step = NSteps b.dt = Dt_si return } // keep constant during time step if NSteps == b.step && Dt_si == b.dt { return } // after a bad step the timestep is rescaled and the noise should be rescaled accordingly, instead of redrawing the random numbers if NSteps == b.step && Dt_si != b.dt { for c := 0; c < 3; c++ { cuda.Madd2(b.noise.Comp(c), b.noise.Comp(c), b.noise.Comp(c), float32(math.Sqrt(b.dt/Dt_si)), 0.) } b.dt = Dt_si return } if FixDt == 0 { Refer("leliaert2017") //uncomment to not allow adaptive step //util.Fatal("Finite temperature requires fixed time step. 
Set FixDt != 0.") } N := Mesh().NCell() k2_VgammaDt := 2 * mag.Kb / (GammaLL * cellVolume() * Dt_si) noise := cuda.Buffer(1, Mesh().Size()) defer cuda.Recycle(noise) const mean = 0 const stddev = 1 dst := b.noise ms := Msat.MSlice() defer ms.Recycle() temp := Temp.MSlice() defer temp.Recycle() alpha := Alpha.MSlice() defer alpha.Recycle() for i := 0; i < 3; i++ { b.generator.GenerateNormal(uintptr(noise.DevPtr(0)), int64(N), mean, stddev) cuda.SetTemperature(dst.Comp(i), noise, k2_VgammaDt, ms, temp, alpha) } b.step = NSteps b.dt = Dt_si } func GetThermalEnergy() float64 { if Temp.isZero() || relaxing { return 0 } else { return -cellVolume() * dot(&M_full, &B_therm) } } // Seeds the thermal noise generator func ThermSeed(seed int) { B_therm.seed = int64(seed) if B_therm.generator != 0 { B_therm.generator.SetSeed(B_therm.seed) } } func (b *thermField) Mesh() *data.Mesh { return Mesh() } func (b *thermField) NComp() int { return 3 } func (b *thermField) Name() string { return "Thermal field" } func (b *thermField) Unit() string { return "T" } func (b *thermField) average() []float64 { return qAverageUniverse(b) } func (b *thermField) EvalTo(dst *data.Slice) { EvalTo(b, dst) } func (b *thermField) Slice() (*data.Slice, bool) { b.update() return b.noise, false } mumax3-3.10/engine/torque.go000066400000000000000000000125331371432437400157760ustar00rootroot00000000000000package engine import ( "reflect" "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/util" ) var ( Alpha = NewScalarParam("alpha", "", "Landau-Lifshitz damping constant") Xi = NewScalarParam("xi", "", "Non-adiabaticity of spin-transfer-torque") Pol = NewScalarParam("Pol", "", "Electrical current polarization") Lambda = NewScalarParam("Lambda", "", "Slonczewski Λ parameter") EpsilonPrime = NewScalarParam("EpsilonPrime", "", "Slonczewski secondairy STT term ε'") FrozenSpins = NewScalarParam("frozenspins", "", "Defines spins that should be fixed") // 1 - frozen, 0 - free. 
TODO: check if it only contains 0/1 values FreeLayerThickness = NewScalarParam("FreeLayerThickness", "m", "Slonczewski free layer thickness (if set to zero (default), then the thickness will be deduced from the mesh size)") FixedLayer = NewExcitation("FixedLayer", "", "Slonczewski fixed layer polarization") Torque = NewVectorField("torque", "T", "Total torque/γ0", SetTorque) LLTorque = NewVectorField("LLtorque", "T", "Landau-Lifshitz torque/γ0", SetLLTorque) STTorque = NewVectorField("STTorque", "T", "Spin-transfer torque/γ0", AddSTTorque) J = NewExcitation("J", "A/m2", "Electrical current density") MaxTorque = NewScalarValue("maxTorque", "T", "Maximum torque/γ0, over all cells", GetMaxTorque) GammaLL float64 = 1.7595e11 // Gyromagnetic ratio of spins, in rad/Ts Precess = true DisableZhangLiTorque = false DisableSlonczewskiTorque = false fixedLayerPosition = FIXEDLAYER_TOP // instructs mumax3 how free and fixed layers are stacked along +z direction ) func init() { Pol.setUniform([]float64{1}) // default spin polarization Lambda.Set(1) // sensible default value (?). 
DeclVar("GammaLL", &GammaLL, "Gyromagnetic ratio in rad/Ts") DeclVar("DisableZhangLiTorque", &DisableZhangLiTorque, "Disables Zhang-Li torque (default=false)") DeclVar("DisableSlonczewskiTorque", &DisableSlonczewskiTorque, "Disables Slonczewski torque (default=false)") DeclVar("DoPrecess", &Precess, "Enables LL precession (default=true)") DeclLValue("FixedLayerPosition", &flposition{}, "Position of the fixed layer: FIXEDLAYER_TOP, FIXEDLAYER_BOTTOM (default=FIXEDLAYER_TOP)") DeclROnly("FIXEDLAYER_TOP", FIXEDLAYER_TOP, "FixedLayerPosition = FIXEDLAYER_TOP instructs mumax3 that fixed layer is on top of the free layer") DeclROnly("FIXEDLAYER_BOTTOM", FIXEDLAYER_BOTTOM, "FixedLayerPosition = FIXEDLAYER_BOTTOM instructs mumax3 that fixed layer is underneath of the free layer") } // Sets dst to the current total torque func SetTorque(dst *data.Slice) { SetLLTorque(dst) AddSTTorque(dst) FreezeSpins(dst) } // Sets dst to the current Landau-Lifshitz torque func SetLLTorque(dst *data.Slice) { SetEffectiveField(dst) // calc and store B_eff alpha := Alpha.MSlice() defer alpha.Recycle() if Precess { cuda.LLTorque(dst, M.Buffer(), dst, alpha) // overwrite dst with torque } else { cuda.LLNoPrecess(dst, M.Buffer(), dst) } } // Adds the current spin transfer torque to dst func AddSTTorque(dst *data.Slice) { if J.isZero() { return } util.AssertMsg(!Pol.isZero(), "spin polarization should not be 0") jspin, rec := J.Slice() if rec { defer cuda.Recycle(jspin) } fl, rec := FixedLayer.Slice() if rec { defer cuda.Recycle(fl) } if !DisableZhangLiTorque { msat := Msat.MSlice() defer msat.Recycle() j := J.MSlice() defer j.Recycle() alpha := Alpha.MSlice() defer alpha.Recycle() xi := Xi.MSlice() defer xi.Recycle() pol := Pol.MSlice() defer pol.Recycle() cuda.AddZhangLiTorque(dst, M.Buffer(), msat, j, alpha, xi, pol, Mesh()) } if !DisableSlonczewskiTorque && !FixedLayer.isZero() { msat := Msat.MSlice() defer msat.Recycle() j := J.MSlice() defer j.Recycle() fixedP := FixedLayer.MSlice() defer 
fixedP.Recycle() alpha := Alpha.MSlice() defer alpha.Recycle() pol := Pol.MSlice() defer pol.Recycle() lambda := Lambda.MSlice() defer lambda.Recycle() epsPrime := EpsilonPrime.MSlice() defer epsPrime.Recycle() thickness := FreeLayerThickness.MSlice() defer thickness.Recycle() cuda.AddSlonczewskiTorque2(dst, M.Buffer(), msat, j, fixedP, alpha, pol, lambda, epsPrime, thickness, CurrentSignFromFixedLayerPosition[fixedLayerPosition], Mesh()) } } func FreezeSpins(dst *data.Slice) { if !FrozenSpins.isZero() { cuda.ZeroMask(dst, FrozenSpins.gpuLUT1(), regions.Gpu()) } } func GetMaxTorque() float64 { torque := ValueOf(Torque) defer cuda.Recycle(torque) return cuda.MaxVecNorm(torque) } type FixedLayerPosition int const ( FIXEDLAYER_TOP FixedLayerPosition = iota + 1 FIXEDLAYER_BOTTOM ) var ( CurrentSignFromFixedLayerPosition = map[FixedLayerPosition]float64{ FIXEDLAYER_TOP: 1.0, FIXEDLAYER_BOTTOM: -1.0, } ) type flposition struct{} func (*flposition) Eval() interface{} { return fixedLayerPosition } func (*flposition) SetValue(v interface{}) { drainOutput() fixedLayerPosition = v.(FixedLayerPosition) } func (*flposition) Type() reflect.Type { return reflect.TypeOf(FixedLayerPosition(FIXEDLAYER_TOP)) } mumax3-3.10/engine/unsafe.go000066400000000000000000000004311371432437400157320ustar00rootroot00000000000000package engine func init() { // There are no unsafe features since version 3.10, but we want maximal backwards compatibility DeclFunc("ext_EnableUnsafe", EnableUnsafe, "Deprecated. 
Only here to ensure maximal backwards compatibility with mumax3.9c.") } func EnableUnsafe() { } mumax3-3.10/engine/util.go000066400000000000000000000124611371432437400154340ustar00rootroot00000000000000package engine import ( "fmt" "math" "os" "path" "sort" "strings" "github.com/mumax/3/cuda" "github.com/mumax/3/data" "github.com/mumax/3/dump" "github.com/mumax/3/httpfs" "github.com/mumax/3/mag" "github.com/mumax/3/oommf" "github.com/mumax/3/util" ) func init() { DeclFunc("Expect", Expect, "Used for automated tests: checks if a value is close enough to the expected value") DeclFunc("ExpectV", ExpectV, "Used for automated tests: checks if a vector is close enough to the expected value") DeclFunc("Fprintln", Fprintln, "Print to file") DeclFunc("Sign", sign, "Signum function") DeclFunc("Vector", Vector, "Constructs a vector with given components") DeclConst("Mu0", mag.Mu0, "Permittivity of vaccum (Tm/A)") DeclFunc("Print", myprint, "Print to standard output") DeclFunc("LoadFile", LoadFile, "Load a data file (ovf or dump)") DeclFunc("Index2Coord", Index2Coord, "Convert cell index to x,y,z coordinate in meter") DeclFunc("NewSlice", NewSlice, "Makes a 4D array with a specified number of components (first argument) "+ "and a specified size nx,ny,nz (remaining arguments)") DeclFunc("NewVectorMask", NewVectorMask, "Makes a 3D array of vectors") DeclFunc("NewScalarMask", NewScalarMask, "Makes a 3D array of scalars") } // Returns a new new slice (3D array) with given number of components and size. func NewSlice(ncomp, Nx, Ny, Nz int) *data.Slice { return data.NewSlice(ncomp, [3]int{Nx, Ny, Nz}) } func NewVectorMask(Nx, Ny, Nz int) *data.Slice { return data.NewSlice(3, [3]int{Nx, Ny, Nz}) } func NewScalarMask(Nx, Ny, Nz int) *data.Slice { return data.NewSlice(1, [3]int{Nx, Ny, Nz}) } // Constructs a vector func Vector(x, y, z float64) data.Vector { return data.Vector{x, y, z} } // Test if have lies within want +/- maxError, // and print suited message. 
func Expect(msg string, have, want, maxError float64) { if math.IsNaN(have) || math.IsNaN(want) || math.Abs(have-want) > maxError { LogOut(msg, ":", " have: ", have, " want: ", want, "±", maxError) Close() os.Exit(1) } else { LogOut(msg, ":", have, "OK") } // note: we also check "want" for NaN in case "have" and "want" are switched. } func ExpectV(msg string, have, want data.Vector, maxErr float64) { for c := 0; c < 3; c++ { Expect(fmt.Sprintf("%v[%v]", msg, c), have[c], want[c], maxErr) } } // Append msg to file. Used to write aggregated output of many simulations in one file. func Fprintln(filename string, msg ...interface{}) { if !path.IsAbs(filename) { filename = OD() + filename } httpfs.Touch(filename) err := httpfs.Append(filename, []byte(fmt.Sprintln(myFmt(msg)...))) util.FatalErr(err) } // Read a magnetization state from .dump file. func LoadFile(fname string) *data.Slice { in, err := httpfs.Open(fname) util.FatalErr(err) var s *data.Slice if path.Ext(fname) == ".dump" { s, _, err = dump.Read(in) } else { s, _, err = oommf.Read(in) } util.FatalErr(err) return s } // Download a quantity to host, // or just return its data when already on host. func Download(q Quantity) *data.Slice { // TODO: optimize for Buffer() buf := ValueOf(q) defer cuda.Recycle(buf) if buf.CPUAccess() { return buf } else { return buf.HostCopy() } } // print with special formatting for some known types func myprint(msg ...interface{}) { LogOut(myFmt(msg)...) } // mumax specific formatting (Slice -> average, etc). 
func myFmt(msg []interface{}) []interface{} { for i, m := range msg { if e, ok := m.(*float64); ok { msg[i] = *e } // Tabledata: print average if m, ok := m.(Quantity); ok { str := fmt.Sprint(AverageOf(m)) msg[i] = str[1 : len(str)-1] // remove [ ] continue } } return msg } // converts cell index to coordinate, internal coordinates func Index2Coord(ix, iy, iz int) data.Vector { m := Mesh() n := m.Size() c := m.CellSize() x := c[X]*(float64(ix)-0.5*float64(n[X]-1)) - TotalShift y := c[Y]*(float64(iy)-0.5*float64(n[Y]-1)) - TotalYShift z := c[Z] * (float64(iz) - 0.5*float64(n[Z]-1)) return data.Vector{x, y, z} } func sign(x float64) float64 { switch { case x > 0: return 1 case x < 0: return -1 default: return 0 } } // returns a/b, or 0 when b == 0 func safediv(a, b float32) float32 { if b == 0 { return 0 } else { return a / b } } // dst = a/b, unless b == 0 func paramDiv(dst, a, b [][NREGION]float32) { util.Assert(len(dst) == 1 && len(a) == 1 && len(b) == 1) for i := 0; i < NREGION; i++ { // not regions.maxreg dst[0][i] = safediv(a[0][i], b[0][i]) } } // shortcut for slicing unaddressable_vector()[:] func slice(v [3]float64) []float64 { return v[:] } func unslice(v []float64) [3]float64 { util.Assert(len(v) == 3) return [3]float64{v[0], v[1], v[2]} } func assureGPU(s *data.Slice) *data.Slice { if s.GPUAccess() { return s } else { return cuda.GPUCopy(s) } } type caseIndep []string func (s *caseIndep) Len() int { return len(*s) } func (s *caseIndep) Less(i, j int) bool { return strings.ToLower((*s)[i]) < strings.ToLower((*s)[j]) } func (s *caseIndep) Swap(i, j int) { (*s)[i], (*s)[j] = (*s)[j], (*s)[i] } func sortNoCase(s []string) { i := caseIndep(s) sort.Sort(&i) } func checkNaN1(x float64) { if math.IsNaN(x) { panic("NaN") } } // trim trailing newlines func rmln(a string) string { for strings.HasSuffix(a, "\n") { a = a[:len(a)-1] } return a } const ( X = 0 Y = 1 Z = 2 ) const ( SCALAR = 1 VECTOR = 3 ) 
mumax3-3.10/engine/zeeman.go000066400000000000000000000007331371432437400157350ustar00rootroot00000000000000package engine var ( B_ext = NewExcitation("B_ext", "T", "Externally applied field") Edens_zeeman = NewScalarField("Edens_Zeeman", "J/m3", "Zeeman energy density", AddEdens_zeeman) E_Zeeman = NewScalarValue("E_Zeeman", "J", "Zeeman energy", GetZeemanEnergy) ) var AddEdens_zeeman = makeEdensAdder(B_ext, -1) func init() { registerEnergy(GetZeemanEnergy, AddEdens_zeeman) } func GetZeemanEnergy() float64 { return -1 * cellVolume() * dot(&M_full, B_ext) } mumax3-3.10/freetype/000077500000000000000000000000001371432437400145025ustar00rootroot00000000000000mumax3-3.10/freetype/AUTHORS000066400000000000000000000011401371432437400155460ustar00rootroot00000000000000# This is the official list of Freetype-Go authors for copyright purposes. # This file is distinct from the CONTRIBUTORS files. # See the latter for an explanation. # # Freetype-Go is derived from Freetype, which is written in C. The latter # is copyright 1996-2010 David Turner, Robert Wilhelm, and Werner Lemberg. # Names should be added to this file as # Name or Organization # The email address is not required for organizations. # Please keep the list sorted. Google Inc. Jeff R. Allen Rémy Oudompheng Roger Peppe mumax3-3.10/freetype/CONTRIBUTORS000066400000000000000000000025771371432437400163750ustar00rootroot00000000000000# This is the official list of people who can contribute # (and typically have contributed) code to the Freetype-Go repository. # The AUTHORS file lists the copyright holders; this file # lists people. For example, Google employees are listed here # but not in AUTHORS, because Google holds the copyright. # # The submission process automatically checks to make sure # that people submitting code are listed in this file (by email address). 
# # Names should be added to this file only after verifying that # the individual or the individual's organization has agreed to # the appropriate Contributor License Agreement, found here: # # http://code.google.com/legal/individual-cla-v1.0.html # http://code.google.com/legal/corporate-cla-v1.0.html # # The agreement for individuals can be filled out on the web. # # When adding J Random Contributor's name to this file, # either J's name or J's organization's name should be # added to the AUTHORS file, depending on whether the # individual or corporate CLA was used. # Names should be added to this file like so: # Name # Please keep the list sorted. Andrew Gerrand Jeff R. Allen Nigel Tao Rémy Oudompheng Rob Pike Roger Peppe Russ Cox mumax3-3.10/freetype/LICENSE000066400000000000000000000010651371432437400155110ustar00rootroot00000000000000Use of the Freetype-Go software is subject to your choice of exactly one of the following two licenses: * The FreeType License, which is similar to the original BSD license with an advertising clause, or * The GNU General Public License (GPL), version 2 or later. The text of these licenses are available in the licenses/ftl.txt and the licenses/gpl.txt files respectively. They are also available at http://freetype.sourceforge.net/license.html The Luxi fonts in the testdata directory are licensed separately. See the testdata/COPYING file for details. mumax3-3.10/freetype/README000066400000000000000000000016041371432437400153630ustar00rootroot00000000000000This is a port of the Freetype font rasterizer (www.freetype.org) to the Go programming language (golang.org). To download and install from source: $ go get code.google.com/p/freetype-go/freetype It is an incomplete port: * It only supports TrueType fonts, and not Type 1 fonts nor bitmap fonts. * It only supports the Unicode encoding. 
There are also some implementation differences: * It uses a 24.8 fixed point co-ordinate system everywhere internally, as opposed to the original Freetype's mix of 26.6 (or 10.6 for 16-bit systems) in some places, and 24.8 in the "smooth" rasterizer. Freetype-Go is derived from Freetype, which is written in C. Freetype is copyright 1996-2010 David Turner, Robert Wilhelm, and Werner Lemberg. Freetype-Go is copyright The Freetype-Go Authors, who are listed in the AUTHORS file. The Freetype-Go homepage is http://code.google.com/p/freetype-go/ mumax3-3.10/freetype/raster/000077500000000000000000000000001371432437400160025ustar00rootroot00000000000000mumax3-3.10/freetype/raster/geom.go000066400000000000000000000173531371432437400172710ustar00rootroot00000000000000// Copyright 2010 The Freetype-Go Authors. All rights reserved. // Use of this source code is governed by your choice of either the // FreeType License or the GNU General Public License version 2 (or // any later version), both of which can be found in the LICENSE file. package raster import ( "fmt" "math" ) // A Fix32 is a 24.8 fixed point number. type Fix32 int32 // A Fix64 is a 48.16 fixed point number. type Fix64 int64 // String returns a human-readable representation of a 24.8 fixed point number. // For example, the number one-and-a-quarter becomes "1:064". func (x Fix32) String() string { if x < 0 { x = -x return fmt.Sprintf("-%d:%03d", int32(x/256), int32(x%256)) } return fmt.Sprintf("%d:%03d", int32(x/256), int32(x%256)) } // String returns a human-readable representation of a 48.16 fixed point number. // For example, the number one-and-a-quarter becomes "1:16384". func (x Fix64) String() string { if x < 0 { x = -x return fmt.Sprintf("-%d:%05d", int64(x/65536), int64(x%65536)) } return fmt.Sprintf("%d:%05d", int64(x/65536), int64(x%65536)) } // maxAbs returns the maximum of abs(a) and abs(b). 
func maxAbs(a, b Fix32) Fix32 { if a < 0 { a = -a } if b < 0 { b = -b } if a < b { return b } return a } // A Point represents a two-dimensional point or vector, in 24.8 fixed point // format. type Point struct { X, Y Fix32 } // String returns a human-readable representation of a Point. func (p Point) String() string { return "(" + p.X.String() + ", " + p.Y.String() + ")" } // Add returns the vector p + q. func (p Point) Add(q Point) Point { return Point{p.X + q.X, p.Y + q.Y} } // Sub returns the vector p - q. func (p Point) Sub(q Point) Point { return Point{p.X - q.X, p.Y - q.Y} } // Mul returns the vector k * p. func (p Point) Mul(k Fix32) Point { return Point{p.X * k / 256, p.Y * k / 256} } // Neg returns the vector -p, or equivalently p rotated by 180 degrees. func (p Point) Neg() Point { return Point{-p.X, -p.Y} } // Dot returns the dot product p·q. func (p Point) Dot(q Point) Fix64 { px, py := int64(p.X), int64(p.Y) qx, qy := int64(q.X), int64(q.Y) return Fix64(px*qx + py*qy) } // Len returns the length of the vector p. func (p Point) Len() Fix32 { // TODO(nigeltao): use fixed point math. x := float64(p.X) y := float64(p.Y) return Fix32(math.Sqrt(x*x + y*y)) } // Norm returns the vector p normalized to the given length, or the zero Point // if p is degenerate. func (p Point) Norm(length Fix32) Point { d := p.Len() if d == 0 { return Point{0, 0} } s, t := int64(length), int64(d) x := int64(p.X) * s / t y := int64(p.Y) * s / t return Point{Fix32(x), Fix32(y)} } // Rot45CW returns the vector p rotated clockwise by 45 degrees. // Note that the Y-axis grows downwards, so {1, 0}.Rot45CW is {1/√2, 1/√2}. func (p Point) Rot45CW() Point { // 181/256 is approximately 1/√2, or sin(π/4). px, py := int64(p.X), int64(p.Y) qx := (+px - py) * 181 / 256 qy := (+px + py) * 181 / 256 return Point{Fix32(qx), Fix32(qy)} } // Rot90CW returns the vector p rotated clockwise by 90 degrees. // Note that the Y-axis grows downwards, so {1, 0}.Rot90CW is {0, 1}. 
func (p Point) Rot90CW() Point {
	return Point{X: -p.Y, Y: p.X}
}

// Rot135CW returns the vector p rotated clockwise by 135 degrees.
// Note that the Y-axis grows downwards, so {1, 0}.Rot135CW is {-1/√2, 1/√2}.
func (p Point) Rot135CW() Point {
	// 181/256 is approximately 1/√2, or sin(π/4).
	x, y := int64(p.X), int64(p.Y)
	return Point{
		X: Fix32((-x - y) * 181 / 256),
		Y: Fix32((x - y) * 181 / 256),
	}
}

// Rot45CCW returns the vector p rotated counter-clockwise by 45 degrees.
// Note that the Y-axis grows downwards, so {1, 0}.Rot45CCW is {1/√2, -1/√2}.
func (p Point) Rot45CCW() Point {
	// 181/256 is approximately 1/√2, or sin(π/4).
	x, y := int64(p.X), int64(p.Y)
	return Point{
		X: Fix32((x + y) * 181 / 256),
		Y: Fix32((-x + y) * 181 / 256),
	}
}

// Rot90CCW returns the vector p rotated counter-clockwise by 90 degrees.
// Note that the Y-axis grows downwards, so {1, 0}.Rot90CCW is {0, -1}.
func (p Point) Rot90CCW() Point {
	return Point{X: p.Y, Y: -p.X}
}

// Rot135CCW returns the vector p rotated counter-clockwise by 135 degrees.
// Note that the Y-axis grows downwards, so {1, 0}.Rot135CCW is {-1/√2, -1/√2}.
func (p Point) Rot135CCW() Point {
	// 181/256 is approximately 1/√2, or sin(π/4).
	x, y := int64(p.X), int64(p.Y)
	return Point{
		X: Fix32((-x + y) * 181 / 256),
		Y: Fix32((-x - y) * 181 / 256),
	}
}

// An Adder accumulates points on a curve.
type Adder interface {
	// Start starts a new curve at the given point.
	Start(a Point)
	// Add1 adds a linear segment to the current curve.
	Add1(b Point)
	// Add2 adds a quadratic segment to the current curve.
	Add2(b, c Point)
	// Add3 adds a cubic segment to the current curve.
	Add3(b, c, d Point)
}

// A Path is a sequence of curves, and a curve is a start point followed by a
// sequence of linear, quadratic or cubic segments.
type Path []Fix32

// String returns a human-readable representation of a Path.
func (p Path) String() string { s := "" for i := 0; i < len(p); { if i != 0 { s += " " } switch p[i] { case 0: s += "S0" + fmt.Sprint([]Fix32(p[i+1:i+3])) i += 4 case 1: s += "A1" + fmt.Sprint([]Fix32(p[i+1:i+3])) i += 4 case 2: s += "A2" + fmt.Sprint([]Fix32(p[i+1:i+5])) i += 6 case 3: s += "A3" + fmt.Sprint([]Fix32(p[i+1:i+7])) i += 8 default: panic("freetype/raster: bad path") } } return s } // grow adds n elements to p. func (p *Path) grow(n int) { n += len(*p) if n > cap(*p) { old := *p *p = make([]Fix32, n, 2*n+8) copy(*p, old) return } *p = (*p)[0:n] } // Clear cancels any previous calls to p.Start or p.AddXxx. func (p *Path) Clear() { *p = (*p)[0:0] } // Start starts a new curve at the given point. func (p *Path) Start(a Point) { n := len(*p) p.grow(4) (*p)[n] = 0 (*p)[n+1] = a.X (*p)[n+2] = a.Y (*p)[n+3] = 0 } // Add1 adds a linear segment to the current curve. func (p *Path) Add1(b Point) { n := len(*p) p.grow(4) (*p)[n] = 1 (*p)[n+1] = b.X (*p)[n+2] = b.Y (*p)[n+3] = 1 } // Add2 adds a quadratic segment to the current curve. func (p *Path) Add2(b, c Point) { n := len(*p) p.grow(6) (*p)[n] = 2 (*p)[n+1] = b.X (*p)[n+2] = b.Y (*p)[n+3] = c.X (*p)[n+4] = c.Y (*p)[n+5] = 2 } // Add3 adds a cubic segment to the current curve. func (p *Path) Add3(b, c, d Point) { n := len(*p) p.grow(8) (*p)[n] = 3 (*p)[n+1] = b.X (*p)[n+2] = b.Y (*p)[n+3] = c.X (*p)[n+4] = c.Y (*p)[n+5] = d.X (*p)[n+6] = d.Y (*p)[n+7] = 3 } // AddPath adds the Path q to p. func (p *Path) AddPath(q Path) { n, m := len(*p), len(q) p.grow(m) copy((*p)[n:n+m], q) } // AddStroke adds a stroked Path. func (p *Path) AddStroke(q Path, width Fix32, cr Capper, jr Joiner) { Stroke(p, q, width, cr, jr) } // firstPoint returns the first point in a non-empty Path. func (p Path) firstPoint() Point { return Point{p[1], p[2]} } // lastPoint returns the last point in a non-empty Path. func (p Path) lastPoint() Point { return Point{p[len(p)-3], p[len(p)-2]} } // addPathReversed adds q reversed to p. 
// For example, if q consists of a linear segment from A to B followed by a // quadratic segment from B to C to D, then the values of q looks like: // index: 01234567890123 // value: 0AA01BB12CCDD2 // So, when adding q backwards to p, we want to Add2(C, B) followed by Add1(A). func addPathReversed(p Adder, q Path) { if len(q) == 0 { return } i := len(q) - 1 for { switch q[i] { case 0: return case 1: i -= 4 p.Add1(Point{q[i-2], q[i-1]}) case 2: i -= 6 p.Add2(Point{q[i+2], q[i+3]}, Point{q[i-2], q[i-1]}) case 3: i -= 8 p.Add3(Point{q[i+4], q[i+5]}, Point{q[i+2], q[i+3]}, Point{q[i-2], q[i-1]}) default: panic("freetype/raster: bad path") } } } mumax3-3.10/freetype/raster/paint.go000066400000000000000000000171451371432437400174540ustar00rootroot00000000000000// Copyright 2010 The Freetype-Go Authors. All rights reserved. // Use of this source code is governed by your choice of either the // FreeType License or the GNU General Public License version 2 (or // any later version), both of which can be found in the LICENSE file. package raster import ( "image" "image/color" "image/draw" "math" ) // A Span is a horizontal segment of pixels with constant alpha. X0 is an // inclusive bound and X1 is exclusive, the same as for slices. A fully // opaque Span has A == 1<<32 - 1. type Span struct { Y, X0, X1 int A uint32 } // A Painter knows how to paint a batch of Spans. Rasterization may involve // Painting multiple batches, and done will be true for the final batch. // The Spans' Y values are monotonically increasing during a rasterization. // Paint may use all of ss as scratch space during the call. type Painter interface { Paint(ss []Span, done bool) } // The PainterFunc type adapts an ordinary function to the Painter interface. type PainterFunc func(ss []Span, done bool) // Paint just delegates the call to f. 
func (f PainterFunc) Paint(ss []Span, done bool) { f(ss, done) } // An AlphaOverPainter is a Painter that paints Spans onto an image.Alpha // using the Over Porter-Duff composition operator. type AlphaOverPainter struct { Image *image.Alpha } // Paint satisfies the Painter interface by painting ss onto an image.Alpha. func (r AlphaOverPainter) Paint(ss []Span, done bool) { b := r.Image.Bounds() for _, s := range ss { if s.Y < b.Min.Y { continue } if s.Y >= b.Max.Y { return } if s.X0 < b.Min.X { s.X0 = b.Min.X } if s.X1 > b.Max.X { s.X1 = b.Max.X } if s.X0 >= s.X1 { continue } base := (s.Y-r.Image.Rect.Min.Y)*r.Image.Stride - r.Image.Rect.Min.X p := r.Image.Pix[base+s.X0 : base+s.X1] a := int(s.A >> 24) for i, c := range p { v := int(c) p[i] = uint8((v*255 + (255-v)*a) / 255) } } } // NewAlphaOverPainter creates a new AlphaOverPainter for the given image. func NewAlphaOverPainter(m *image.Alpha) AlphaOverPainter { return AlphaOverPainter{m} } // An AlphaSrcPainter is a Painter that paints Spans onto an image.Alpha // using the Src Porter-Duff composition operator. type AlphaSrcPainter struct { Image *image.Alpha } // Paint satisfies the Painter interface by painting ss onto an image.Alpha. func (r AlphaSrcPainter) Paint(ss []Span, done bool) { b := r.Image.Bounds() for _, s := range ss { if s.Y < b.Min.Y { continue } if s.Y >= b.Max.Y { return } if s.X0 < b.Min.X { s.X0 = b.Min.X } if s.X1 > b.Max.X { s.X1 = b.Max.X } if s.X0 >= s.X1 { continue } base := (s.Y-r.Image.Rect.Min.Y)*r.Image.Stride - r.Image.Rect.Min.X p := r.Image.Pix[base+s.X0 : base+s.X1] color := uint8(s.A >> 24) for i := range p { p[i] = color } } } // NewAlphaSrcPainter creates a new AlphaSrcPainter for the given image. func NewAlphaSrcPainter(m *image.Alpha) AlphaSrcPainter { return AlphaSrcPainter{m} } type RGBAPainter struct { // The image to compose onto. Image *image.RGBA // The Porter-Duff composition operator. Op draw.Op // The 16-bit color to paint the spans. 
cr, cg, cb, ca uint32 } // Paint satisfies the Painter interface by painting ss onto an image.RGBA. func (r *RGBAPainter) Paint(ss []Span, done bool) { b := r.Image.Bounds() for _, s := range ss { if s.Y < b.Min.Y { continue } if s.Y >= b.Max.Y { return } if s.X0 < b.Min.X { s.X0 = b.Min.X } if s.X1 > b.Max.X { s.X1 = b.Max.X } if s.X0 >= s.X1 { continue } // This code is similar to drawGlyphOver in $GOROOT/src/pkg/image/draw/draw.go. ma := s.A >> 16 const m = 1<<16 - 1 i0 := (s.Y-r.Image.Rect.Min.Y)*r.Image.Stride + (s.X0-r.Image.Rect.Min.X)*4 i1 := i0 + (s.X1-s.X0)*4 if r.Op == draw.Over { for i := i0; i < i1; i += 4 { dr := uint32(r.Image.Pix[i+0]) dg := uint32(r.Image.Pix[i+1]) db := uint32(r.Image.Pix[i+2]) da := uint32(r.Image.Pix[i+3]) a := (m - (r.ca * ma / m)) * 0x101 r.Image.Pix[i+0] = uint8((dr*a + r.cr*ma) / m >> 8) r.Image.Pix[i+1] = uint8((dg*a + r.cg*ma) / m >> 8) r.Image.Pix[i+2] = uint8((db*a + r.cb*ma) / m >> 8) r.Image.Pix[i+3] = uint8((da*a + r.ca*ma) / m >> 8) } } else { for i := i0; i < i1; i += 4 { r.Image.Pix[i+0] = uint8(r.cr * ma / m >> 8) r.Image.Pix[i+1] = uint8(r.cg * ma / m >> 8) r.Image.Pix[i+2] = uint8(r.cb * ma / m >> 8) r.Image.Pix[i+3] = uint8(r.ca * ma / m >> 8) } } } } // SetColor sets the color to paint the spans. func (r *RGBAPainter) SetColor(c color.Color) { r.cr, r.cg, r.cb, r.ca = c.RGBA() } // NewRGBAPainter creates a new RGBAPainter for the given image. func NewRGBAPainter(m *image.RGBA) *RGBAPainter { return &RGBAPainter{Image: m} } // A MonochromePainter wraps another Painter, quantizing each Span's alpha to // be either fully opaque or fully transparent. type MonochromePainter struct { Painter Painter y, x0, x1 int } // Paint delegates to the wrapped Painter after quantizing each Span's alpha // value and merging adjacent fully opaque Spans. func (m *MonochromePainter) Paint(ss []Span, done bool) { // We compact the ss slice, discarding any Spans whose alpha quantizes to zero. 
j := 0 for _, s := range ss { if s.A >= 1<<31 { if m.y == s.Y && m.x1 == s.X0 { m.x1 = s.X1 } else { ss[j] = Span{m.y, m.x0, m.x1, 1<<32 - 1} j++ m.y, m.x0, m.x1 = s.Y, s.X0, s.X1 } } } if done { // Flush the accumulated Span. finalSpan := Span{m.y, m.x0, m.x1, 1<<32 - 1} if j < len(ss) { ss[j] = finalSpan j++ m.Painter.Paint(ss[0:j], true) } else if j == len(ss) { m.Painter.Paint(ss, false) if cap(ss) > 0 { ss = ss[0:1] } else { ss = make([]Span, 1) } ss[0] = finalSpan m.Painter.Paint(ss, true) } else { panic("unreachable") } // Reset the accumulator, so that this Painter can be re-used. m.y, m.x0, m.x1 = 0, 0, 0 } else { m.Painter.Paint(ss[0:j], false) } } // NewMonochromePainter creates a new MonochromePainter that wraps the given // Painter. func NewMonochromePainter(p Painter) *MonochromePainter { return &MonochromePainter{Painter: p} } // A GammaCorrectionPainter wraps another Painter, performing gamma-correction // on each Span's alpha value. type GammaCorrectionPainter struct { // The wrapped Painter. Painter Painter // Precomputed alpha values for linear interpolation, with fully opaque == 1<<16-1. a [256]uint16 // Whether gamma correction is a no-op. gammaIsOne bool } // Paint delegates to the wrapped Painter after performing gamma-correction // on each Span. func (g *GammaCorrectionPainter) Paint(ss []Span, done bool) { if !g.gammaIsOne { const ( M = 0x1010101 // 255*M == 1<<32-1 N = 0x8080 // N = M>>9, and N < 1<<16-1 ) for i, _ := range ss { if ss[i].A == 0 || ss[i].A == 1<<32-1 { continue } p, q := ss[i].A/M, (ss[i].A%M)>>9 // The resultant alpha is a linear interpolation of g.a[p] and g.a[p+1]. a := uint32(g.a[p])*(N-q) + uint32(g.a[p+1])*q a = (a + N/2) / N // Convert the alpha from 16-bit (which is g.a's range) to 32-bit. a |= a << 16 ss[i].A = a } } g.Painter.Paint(ss, done) } // SetGamma sets the gamma value. 
func (g *GammaCorrectionPainter) SetGamma(gamma float64) { if gamma == 1.0 { g.gammaIsOne = true return } g.gammaIsOne = false for i := 0; i < 256; i++ { a := float64(i) / 0xff a = math.Pow(a, gamma) g.a[i] = uint16(0xffff * a) } } // NewGammaCorrectionPainter creates a new GammaCorrectionPainter that wraps // the given Painter. func NewGammaCorrectionPainter(p Painter, gamma float64) *GammaCorrectionPainter { g := &GammaCorrectionPainter{Painter: p} g.SetGamma(gamma) return g } mumax3-3.10/freetype/raster/raster.go000066400000000000000000000351321371432437400176350ustar00rootroot00000000000000// Copyright 2010 The Freetype-Go Authors. All rights reserved. // Use of this source code is governed by your choice of either the // FreeType License or the GNU General Public License version 2 (or // any later version), both of which can be found in the LICENSE file. // The raster package provides an anti-aliasing 2-D rasterizer. // // It is part of the larger Freetype-Go suite of font-related packages, // but the raster package is not specific to font rasterization, and can // be used standalone without any other Freetype-Go package. // // Rasterization is done by the same area/coverage accumulation algorithm // as the Freetype "smooth" module, and the Anti-Grain Geometry library. // A description of the area/coverage algorithm is at // http://projects.tuxee.net/cl-vectors/section-the-cl-aa-algorithm package raster import ( "strconv" ) // A cell is part of a linked list (for a given yi co-ordinate) of accumulated // area/coverage for the pixel at (xi, yi). type cell struct { xi int area, cover int next int } type Rasterizer struct { // If false, the default behavior is to use the even-odd winding fill // rule during Rasterize. UseNonZeroWinding bool // An offset (in pixels) to the painted spans. Dx, Dy int // The width of the Rasterizer. The height is implicit in len(cellIndex). 
width int // splitScaleN is the scaling factor used to determine how many times // to decompose a quadratic or cubic segment into a linear approximation. splitScale2, splitScale3 int // The current pen position. a Point // The current cell and its area/coverage being accumulated. xi, yi int area, cover int // Saved cells. cell []cell // Linked list of cells, one per row. cellIndex []int // Buffers. cellBuf [256]cell cellIndexBuf [64]int spanBuf [64]Span } // findCell returns the index in r.cell for the cell corresponding to // (r.xi, r.yi). The cell is created if necessary. func (r *Rasterizer) findCell() int { if r.yi < 0 || r.yi >= len(r.cellIndex) { return -1 } xi := r.xi if xi < 0 { xi = -1 } else if xi > r.width { xi = r.width } i, prev := r.cellIndex[r.yi], -1 for i != -1 && r.cell[i].xi <= xi { if r.cell[i].xi == xi { return i } i, prev = r.cell[i].next, i } c := len(r.cell) if c == cap(r.cell) { buf := make([]cell, c, 4*c) copy(buf, r.cell) r.cell = buf[0 : c+1] } else { r.cell = r.cell[0 : c+1] } r.cell[c] = cell{xi, 0, 0, i} if prev == -1 { r.cellIndex[r.yi] = c } else { r.cell[prev].next = c } return c } // saveCell saves any accumulated r.area/r.cover for (r.xi, r.yi). func (r *Rasterizer) saveCell() { if r.area != 0 || r.cover != 0 { i := r.findCell() if i != -1 { r.cell[i].area += r.area r.cell[i].cover += r.cover } r.area = 0 r.cover = 0 } } // setCell sets the (xi, yi) cell that r is accumulating area/coverage for. func (r *Rasterizer) setCell(xi, yi int) { if r.xi != xi || r.yi != yi { r.saveCell() r.xi, r.yi = xi, yi } } // scan accumulates area/coverage for the yi'th scanline, going from // x0 to x1 in the horizontal direction (in 24.8 fixed point co-ordinates) // and from y0f to y1f fractional vertical units within that scanline. func (r *Rasterizer) scan(yi int, x0, y0f, x1, y1f Fix32) { // Break the 24.8 fixed point X co-ordinates into integral and fractional parts. 
x0i := int(x0) / 256 x0f := x0 - Fix32(256*x0i) x1i := int(x1) / 256 x1f := x1 - Fix32(256*x1i) // A perfectly horizontal scan. if y0f == y1f { r.setCell(x1i, yi) return } dx, dy := x1-x0, y1f-y0f // A single cell scan. if x0i == x1i { r.area += int((x0f + x1f) * dy) r.cover += int(dy) return } // There are at least two cells. Apart from the first and last cells, // all intermediate cells go through the full width of the cell, // or 256 units in 24.8 fixed point format. var ( p, q, edge0, edge1 Fix32 xiDelta int ) if dx > 0 { p, q = (256-x0f)*dy, dx edge0, edge1, xiDelta = 0, 256, 1 } else { p, q = x0f*dy, -dx edge0, edge1, xiDelta = 256, 0, -1 } yDelta, yRem := p/q, p%q if yRem < 0 { yDelta -= 1 yRem += q } // Do the first cell. xi, y := x0i, y0f r.area += int((x0f + edge1) * yDelta) r.cover += int(yDelta) xi, y = xi+xiDelta, y+yDelta r.setCell(xi, yi) if xi != x1i { // Do all the intermediate cells. p = 256 * (y1f - y + yDelta) fullDelta, fullRem := p/q, p%q if fullRem < 0 { fullDelta -= 1 fullRem += q } yRem -= q for xi != x1i { yDelta = fullDelta yRem += fullRem if yRem >= 0 { yDelta += 1 yRem -= q } r.area += int(256 * yDelta) r.cover += int(yDelta) xi, y = xi+xiDelta, y+yDelta r.setCell(xi, yi) } } // Do the last cell. yDelta = y1f - y r.area += int((edge0 + x1f) * yDelta) r.cover += int(yDelta) } // Start starts a new curve at the given point. func (r *Rasterizer) Start(a Point) { r.setCell(int(a.X/256), int(a.Y/256)) r.a = a } // Add1 adds a linear segment to the current curve. func (r *Rasterizer) Add1(b Point) { x0, y0 := r.a.X, r.a.Y x1, y1 := b.X, b.Y dx, dy := x1-x0, y1-y0 // Break the 24.8 fixed point Y co-ordinates into integral and fractional parts. y0i := int(y0) / 256 y0f := y0 - Fix32(256*y0i) y1i := int(y1) / 256 y1f := y1 - Fix32(256*y1i) if y0i == y1i { // There is only one scanline. r.scan(y0i, x0, y0f, x1, y1f) } else if dx == 0 { // This is a vertical line segment. 
We avoid calling r.scan and instead // manipulate r.area and r.cover directly. var ( edge0, edge1 Fix32 yiDelta int ) if dy > 0 { edge0, edge1, yiDelta = 0, 256, 1 } else { edge0, edge1, yiDelta = 256, 0, -1 } x0i, yi := int(x0)/256, y0i x0fTimes2 := (int(x0) - (256 * x0i)) * 2 // Do the first pixel. dcover := int(edge1 - y0f) darea := int(x0fTimes2 * dcover) r.area += darea r.cover += dcover yi += yiDelta r.setCell(x0i, yi) // Do all the intermediate pixels. dcover = int(edge1 - edge0) darea = int(x0fTimes2 * dcover) for yi != y1i { r.area += darea r.cover += dcover yi += yiDelta r.setCell(x0i, yi) } // Do the last pixel. dcover = int(y1f - edge0) darea = int(x0fTimes2 * dcover) r.area += darea r.cover += dcover } else { // There are at least two scanlines. Apart from the first and last scanlines, // all intermediate scanlines go through the full height of the row, or 256 // units in 24.8 fixed point format. var ( p, q, edge0, edge1 Fix32 yiDelta int ) if dy > 0 { p, q = (256-y0f)*dx, dy edge0, edge1, yiDelta = 0, 256, 1 } else { p, q = y0f*dx, -dy edge0, edge1, yiDelta = 256, 0, -1 } xDelta, xRem := p/q, p%q if xRem < 0 { xDelta -= 1 xRem += q } // Do the first scanline. x, yi := x0, y0i r.scan(yi, x, y0f, x+xDelta, edge1) x, yi = x+xDelta, yi+yiDelta r.setCell(int(x)/256, yi) if yi != y1i { // Do all the intermediate scanlines. p = 256 * dx fullDelta, fullRem := p/q, p%q if fullRem < 0 { fullDelta -= 1 fullRem += q } xRem -= q for yi != y1i { xDelta = fullDelta xRem += fullRem if xRem >= 0 { xDelta += 1 xRem -= q } r.scan(yi, x, edge0, x+xDelta, edge1) x, yi = x+xDelta, yi+yiDelta r.setCell(int(x)/256, yi) } } // Do the last scanline. r.scan(yi, x, edge0, x1, y1f) } // The next lineTo starts from b. r.a = b } // Add2 adds a quadratic segment to the current curve. func (r *Rasterizer) Add2(b, c Point) { // Calculate nSplit (the number of recursive decompositions) based on how `curvy' it is. // Specifically, how much the middle point b deviates from (a+c)/2. 
dev := maxAbs(r.a.X-2*b.X+c.X, r.a.Y-2*b.Y+c.Y) / Fix32(r.splitScale2) nsplit := 0 for dev > 0 { dev /= 4 nsplit++ } // dev is 32-bit, and nsplit++ every time we shift off 2 bits, so maxNsplit is 16. const maxNsplit = 16 if nsplit > maxNsplit { panic("freetype/raster: Add2 nsplit too large: " + strconv.Itoa(nsplit)) } // Recursively decompose the curve nSplit levels deep. var ( pStack [2*maxNsplit + 3]Point sStack [maxNsplit + 1]int i int ) sStack[0] = nsplit pStack[0] = c pStack[1] = b pStack[2] = r.a for i >= 0 { s := sStack[i] p := pStack[2*i:] if s > 0 { // Split the quadratic curve p[0:3] into an equivalent set of two shorter curves: // p[0:3] and p[2:5]. The new p[4] is the old p[2], and p[0] is unchanged. mx := p[1].X p[4].X = p[2].X p[3].X = (p[4].X + mx) / 2 p[1].X = (p[0].X + mx) / 2 p[2].X = (p[1].X + p[3].X) / 2 my := p[1].Y p[4].Y = p[2].Y p[3].Y = (p[4].Y + my) / 2 p[1].Y = (p[0].Y + my) / 2 p[2].Y = (p[1].Y + p[3].Y) / 2 // The two shorter curves have one less split to do. sStack[i] = s - 1 sStack[i+1] = s - 1 i++ } else { // Replace the level-0 quadratic with a two-linear-piece approximation. midx := (p[0].X + 2*p[1].X + p[2].X) / 4 midy := (p[0].Y + 2*p[1].Y + p[2].Y) / 4 r.Add1(Point{midx, midy}) r.Add1(p[0]) i-- } } } // Add3 adds a cubic segment to the current curve. func (r *Rasterizer) Add3(b, c, d Point) { // Calculate nSplit (the number of recursive decompositions) based on how `curvy' it is. dev2 := maxAbs(r.a.X-3*(b.X+c.X)+d.X, r.a.Y-3*(b.Y+c.Y)+d.Y) / Fix32(r.splitScale2) dev3 := maxAbs(r.a.X-2*b.X+d.X, r.a.Y-2*b.Y+d.Y) / Fix32(r.splitScale3) nsplit := 0 for dev2 > 0 || dev3 > 0 { dev2 /= 8 dev3 /= 4 nsplit++ } // devN is 32-bit, and nsplit++ every time we shift off 2 bits, so maxNsplit is 16. const maxNsplit = 16 if nsplit > maxNsplit { panic("freetype/raster: Add3 nsplit too large: " + strconv.Itoa(nsplit)) } // Recursively decompose the curve nSplit levels deep. 
var ( pStack [3*maxNsplit + 4]Point sStack [maxNsplit + 1]int i int ) sStack[0] = nsplit pStack[0] = d pStack[1] = c pStack[2] = b pStack[3] = r.a for i >= 0 { s := sStack[i] p := pStack[3*i:] if s > 0 { // Split the cubic curve p[0:4] into an equivalent set of two shorter curves: // p[0:4] and p[3:7]. The new p[6] is the old p[3], and p[0] is unchanged. m01x := (p[0].X + p[1].X) / 2 m12x := (p[1].X + p[2].X) / 2 m23x := (p[2].X + p[3].X) / 2 p[6].X = p[3].X p[5].X = m23x p[1].X = m01x p[2].X = (m01x + m12x) / 2 p[4].X = (m12x + m23x) / 2 p[3].X = (p[2].X + p[4].X) / 2 m01y := (p[0].Y + p[1].Y) / 2 m12y := (p[1].Y + p[2].Y) / 2 m23y := (p[2].Y + p[3].Y) / 2 p[6].Y = p[3].Y p[5].Y = m23y p[1].Y = m01y p[2].Y = (m01y + m12y) / 2 p[4].Y = (m12y + m23y) / 2 p[3].Y = (p[2].Y + p[4].Y) / 2 // The two shorter curves have one less split to do. sStack[i] = s - 1 sStack[i+1] = s - 1 i++ } else { // Replace the level-0 cubic with a two-linear-piece approximation. midx := (p[0].X + 3*(p[1].X+p[2].X) + p[3].X) / 8 midy := (p[0].Y + 3*(p[1].Y+p[2].Y) + p[3].Y) / 8 r.Add1(Point{midx, midy}) r.Add1(p[0]) i-- } } } // AddPath adds the given Path. func (r *Rasterizer) AddPath(p Path) { for i := 0; i < len(p); { switch p[i] { case 0: r.Start(Point{p[i+1], p[i+2]}) i += 4 case 1: r.Add1(Point{p[i+1], p[i+2]}) i += 4 case 2: r.Add2(Point{p[i+1], p[i+2]}, Point{p[i+3], p[i+4]}) i += 6 case 3: r.Add3(Point{p[i+1], p[i+2]}, Point{p[i+3], p[i+4]}, Point{p[i+5], p[i+6]}) i += 8 default: panic("freetype/raster: bad path") } } } // AddStroke adds a stroked Path. func (r *Rasterizer) AddStroke(q Path, width Fix32, cr Capper, jr Joiner) { Stroke(r, q, width, cr, jr) } // Converts an area value to a uint32 alpha value. A completely filled pixel // corresponds to an area of 256*256*2, and an alpha of 1<<32-1. The // conversion of area values greater than this depends on the winding rule: // even-odd or non-zero. 
func (r *Rasterizer) areaToAlpha(area int) uint32 { // The C Freetype implementation (version 2.3.12) does "alpha := area>>1" without // the +1. Round-to-nearest gives a more symmetric result than round-down. // The C implementation also returns 8-bit alpha, not 32-bit alpha. a := (area + 1) >> 1 if a < 0 { a = -a } alpha := uint32(a) if r.UseNonZeroWinding { if alpha > 0xffff { alpha = 0xffff } } else { alpha &= 0x1ffff if alpha > 0x10000 { alpha = 0x20000 - alpha } else if alpha == 0x10000 { alpha = 0x0ffff } } alpha |= alpha << 16 return alpha } // Rasterize converts r's accumulated curves into Spans for p. The Spans // passed to p are non-overlapping, and sorted by Y and then X. They all // have non-zero width (and 0 <= X0 < X1 <= r.width) and non-zero A, except // for the final Span, which has Y, X0, X1 and A all equal to zero. func (r *Rasterizer) Rasterize(p Painter) { r.saveCell() s := 0 for yi := 0; yi < len(r.cellIndex); yi++ { xi, cover := 0, 0 for c := r.cellIndex[yi]; c != -1; c = r.cell[c].next { if cover != 0 && r.cell[c].xi > xi { alpha := r.areaToAlpha(cover * 256 * 2) if alpha != 0 { xi0, xi1 := xi, r.cell[c].xi if xi0 < 0 { xi0 = 0 } if xi1 >= r.width { xi1 = r.width } if xi0 < xi1 { r.spanBuf[s] = Span{yi + r.Dy, xi0 + r.Dx, xi1 + r.Dx, alpha} s++ } } } cover += r.cell[c].cover alpha := r.areaToAlpha(cover*256*2 - r.cell[c].area) xi = r.cell[c].xi + 1 if alpha != 0 { xi0, xi1 := r.cell[c].xi, xi if xi0 < 0 { xi0 = 0 } if xi1 >= r.width { xi1 = r.width } if xi0 < xi1 { r.spanBuf[s] = Span{yi + r.Dy, xi0 + r.Dx, xi1 + r.Dx, alpha} s++ } } if s > len(r.spanBuf)-2 { p.Paint(r.spanBuf[0:s], false) s = 0 } } } p.Paint(r.spanBuf[0:s], true) } // Clear cancels any previous calls to r.Start or r.AddXxx. 
func (r *Rasterizer) Clear() {
	r.a = Point{0, 0}
	r.xi = 0
	r.yi = 0
	r.area = 0
	r.cover = 0
	// Truncate instead of reallocating so the cell storage is reused.
	r.cell = r.cell[0:0]
	// -1 marks a scanline without cells; Rasterize stops a row at c == -1.
	for i := 0; i < len(r.cellIndex); i++ {
		r.cellIndex[i] = -1
	}
}

// SetBounds sets the maximum width and height of the rasterized image and
// calls Clear. The width and height are in pixels, not Fix32 units.
// Negative values are treated as zero.
func (r *Rasterizer) SetBounds(width, height int) {
	if width < 0 {
		width = 0
	}
	if height < 0 {
		height = 0
	}
	// Use the same ssN heuristic as the C Freetype implementation.
	// The C implementation uses the values 32, 16, but those are in
	// 26.6 fixed point units, and we use 24.8 fixed point everywhere.
	ss2, ss3 := 128, 64
	// The split scales double past 24 pixels and again past 120 pixels.
	if width > 24 || height > 24 {
		ss2, ss3 = 2*ss2, 2*ss3
		if width > 120 || height > 120 {
			ss2, ss3 = 2*ss2, 2*ss3
		}
	}
	r.width = width
	r.splitScale2 = ss2
	r.splitScale3 = ss3
	r.cell = r.cellBuf[0:0]
	// Use the embedded index buffer when it is large enough; allocate otherwise.
	if height > len(r.cellIndexBuf) {
		r.cellIndex = make([]int, height)
	} else {
		r.cellIndex = r.cellIndexBuf[0:height]
	}
	r.Clear()
}

// NewRasterizer creates a new Rasterizer with the given bounds.
func NewRasterizer(width, height int) *Rasterizer {
	r := new(Rasterizer)
	r.SetBounds(width, height)
	return r
}
mumax3-3.10/freetype/raster/stroke.go000066400000000000000000000341711371432437400176460ustar00rootroot00000000000000
// Copyright 2010 The Freetype-Go Authors. All rights reserved.
// Use of this source code is governed by your choice of either the
// FreeType License or the GNU General Public License version 2 (or
// any later version), both of which can be found in the LICENSE file.

package raster

// Two points are considered practically equal if the square of the distance
// between them is less than one quarter (i.e. 16384 / 65536 in Fix64).
const epsilon = 16384

// A Capper signifies how to begin or end a stroked path.
type Capper interface {
	// Cap adds a cap to p given a pivot point and the normal vector of a
	// terminal segment. The normal's length is half of the stroke width.
	Cap(p Adder, halfWidth Fix32, pivot, n1 Point)
}

// The CapperFunc type adapts an ordinary function to be a Capper.
type CapperFunc func(Adder, Fix32, Point, Point)

// Cap calls f with the same arguments.
func (f CapperFunc) Cap(p Adder, halfWidth Fix32, pivot, n1 Point) {
	f(p, halfWidth, pivot, n1)
}

// A Joiner signifies how to join interior nodes of a stroked path.
type Joiner interface {
	// Join adds a join to the two sides of a stroked path given a pivot
	// point and the normal vectors of the trailing and leading segments.
	// Both normals have length equal to half of the stroke width.
	Join(lhs, rhs Adder, halfWidth Fix32, pivot, n0, n1 Point)
}

// The JoinerFunc type adapts an ordinary function to be a Joiner.
type JoinerFunc func(lhs, rhs Adder, halfWidth Fix32, pivot, n0, n1 Point)

// Join calls f with the same arguments.
func (f JoinerFunc) Join(lhs, rhs Adder, halfWidth Fix32, pivot, n0, n1 Point) {
	f(lhs, rhs, halfWidth, pivot, n0, n1)
}

// RoundCapper adds round caps to a stroked path.
var RoundCapper Capper = CapperFunc(roundCapper)

// roundCapper draws the cap as two cubic segments: from pivot-n1 to the
// side point pivot+e, and from there to pivot+n1, where e is n1 rotated by
// Rot90CCW. halfWidth is not used.
func roundCapper(p Adder, halfWidth Fix32, pivot, n1 Point) {
	// The cubic Bézier approximation to a circle involves the magic number
	// (√2 - 1) * 4/3, which is approximately 141/256.
	const k = 141
	e := n1.Rot90CCW()
	side := pivot.Add(e)
	start, end := pivot.Sub(n1), pivot.Add(n1)
	// d is new here; e is reassigned to its scaled value (side was already
	// computed from the unscaled e above).
	d, e := n1.Mul(k), e.Mul(k)
	p.Add3(start.Add(e), side.Sub(d), side)
	p.Add3(side.Add(d), end.Add(e), end)
}

// ButtCapper adds butt caps to a stroked path.
var ButtCapper Capper = CapperFunc(buttCapper)

// buttCapper adds a single straight segment to pivot+n1. halfWidth is not used.
func buttCapper(p Adder, halfWidth Fix32, pivot, n1 Point) {
	p.Add1(pivot.Add(n1))
}

// SquareCapper adds square caps to a stroked path.
var SquareCapper Capper = CapperFunc(squareCapper)

// squareCapper adds three straight segments through side-n1 and side+n1 and
// ending at pivot+n1, where side is the pivot displaced by n1 rotated by
// Rot90CCW. halfWidth is not used.
func squareCapper(p Adder, halfWidth Fix32, pivot, n1 Point) {
	e := n1.Rot90CCW()
	side := pivot.Add(e)
	p.Add1(side.Sub(n1))
	p.Add1(side.Add(n1))
	p.Add1(pivot.Add(n1))
}

// RoundJoiner adds round joins to a stroked path.
var RoundJoiner Joiner = JoinerFunc(roundJoiner) func roundJoiner(lhs, rhs Adder, haflWidth Fix32, pivot, n0, n1 Point) { dot := n0.Rot90CW().Dot(n1) if dot >= 0 { addArc(lhs, pivot, n0, n1) rhs.Add1(pivot.Sub(n1)) } else { lhs.Add1(pivot.Add(n1)) addArc(rhs, pivot, n0.Neg(), n1.Neg()) } } // BevelJoiner adds bevel joins to a stroked path. var BevelJoiner Joiner = JoinerFunc(bevelJoiner) func bevelJoiner(lhs, rhs Adder, haflWidth Fix32, pivot, n0, n1 Point) { lhs.Add1(pivot.Add(n1)) rhs.Add1(pivot.Sub(n1)) } // addArc adds a circular arc from pivot+n0 to pivot+n1 to p. The shorter of // the two possible arcs is taken, i.e. the one spanning <= 180 degrees. // The two vectors n0 and n1 must be of equal length. func addArc(p Adder, pivot, n0, n1 Point) { // r2 is the square of the length of n0. r2 := n0.Dot(n0) if r2 < epsilon { // The arc radius is so small that we collapse to a straight line. p.Add1(pivot.Add(n1)) return } // We approximate the arc by 0, 1, 2 or 3 45-degree quadratic segments plus // a final quadratic segment from s to n1. Each 45-degree segment has control // points {1, 0}, {1, tan(π/8)} and {1/√2, 1/√2} suitably scaled, rotated and // translated. tan(π/8) is approximately 106/256. const tpo8 = 106 var s Point // We determine which octant the angle between n0 and n1 is in via three dot products. // m0, m1 and m2 are n0 rotated clockwise by 45, 90 and 135 degrees. m0 := n0.Rot45CW() m1 := n0.Rot90CW() m2 := m0.Rot90CW() if m1.Dot(n1) >= 0 { if n0.Dot(n1) >= 0 { if m2.Dot(n1) <= 0 { // n1 is between 0 and 45 degrees clockwise of n0. s = n0 } else { // n1 is between 45 and 90 degrees clockwise of n0. p.Add2(pivot.Add(n0).Add(m1.Mul(tpo8)), pivot.Add(m0)) s = m0 } } else { pm1, n0t := pivot.Add(m1), n0.Mul(tpo8) p.Add2(pivot.Add(n0).Add(m1.Mul(tpo8)), pivot.Add(m0)) p.Add2(pm1.Add(n0t), pm1) if m0.Dot(n1) >= 0 { // n1 is between 90 and 135 degrees clockwise of n0. s = m1 } else { // n1 is between 135 and 180 degrees clockwise of n0. 
p.Add2(pm1.Sub(n0t), pivot.Add(m2)) s = m2 } } } else { if n0.Dot(n1) >= 0 { if m0.Dot(n1) >= 0 { // n1 is between 0 and 45 degrees counter-clockwise of n0. s = n0 } else { // n1 is between 45 and 90 degrees counter-clockwise of n0. p.Add2(pivot.Add(n0).Sub(m1.Mul(tpo8)), pivot.Sub(m2)) s = m2.Neg() } } else { pm1, n0t := pivot.Sub(m1), n0.Mul(tpo8) p.Add2(pivot.Add(n0).Sub(m1.Mul(tpo8)), pivot.Sub(m2)) p.Add2(pm1.Add(n0t), pm1) if m2.Dot(n1) <= 0 { // n1 is between 90 and 135 degrees counter-clockwise of n0. s = m1.Neg() } else { // n1 is between 135 and 180 degrees counter-clockwise of n0. p.Add2(pm1.Sub(n0t), pivot.Sub(m0)) s = m0.Neg() } } } // The final quadratic segment has two endpoints s and n1 and the middle // control point is a multiple of s.Add(n1), i.e. it is on the angle bisector // of those two points. The multiple ranges between 128/256 and 150/256 as // the angle between s and n1 ranges between 0 and 45 degrees. // When the angle is 0 degrees (i.e. s and n1 are coincident) then s.Add(n1) // is twice s and so the middle control point of the degenerate quadratic // segment should be half s.Add(n1), and half = 128/256. // When the angle is 45 degrees then 150/256 is the ratio of the lengths of // the two vectors {1, tan(π/8)} and {1 + 1/√2, 1/√2}. // d is the normalized dot product between s and n1. Since the angle ranges // between 0 and 45 degrees then d ranges between 256/256 and 181/256. d := 256 * s.Dot(n1) / r2 multiple := Fix32(150 - 22*(d-181)/(256-181)) p.Add2(pivot.Add(s.Add(n1).Mul(multiple)), pivot.Add(n1)) } // midpoint returns the midpoint of two Points. func midpoint(a, b Point) Point { return Point{(a.X + b.X) / 2, (a.Y + b.Y) / 2} } // angleGreaterThan45 returns whether the angle between two vectors is more // than 45 degrees. func angleGreaterThan45(v0, v1 Point) bool { v := v0.Rot45CCW() return v.Dot(v1) < 0 || v.Rot90CW().Dot(v1) < 0 } // interpolate returns the point (1-t)*a + t*b. 
func interpolate(a, b Point, t Fix64) Point { s := 65536 - t x := s*Fix64(a.X) + t*Fix64(b.X) y := s*Fix64(a.Y) + t*Fix64(b.Y) return Point{Fix32(x >> 16), Fix32(y >> 16)} } // curviest2 returns the value of t for which the quadratic parametric curve // (1-t)²*a + 2*t*(1-t).b + t²*c has maximum curvature. // // The curvature of the parametric curve f(t) = (x(t), y(t)) is // |x′y″-y′x″| / (x′²+y′²)^(3/2). // // Let d = b-a and e = c-2*b+a, so that f′(t) = 2*d+2*e*t and f″(t) = 2*e. // The curvature's numerator is (2*dx+2*ex*t)*(2*ey)-(2*dy+2*ey*t)*(2*ex), // which simplifies to 4*dx*ey-4*dy*ex, which is constant with respect to t. // // Thus, curvature is extreme where the denominator is extreme, i.e. where // (x′²+y′²) is extreme. The first order condition is that // 2*x′*x″+2*y′*y″ = 0, or (dx+ex*t)*ex + (dy+ey*t)*ey = 0. // Solving for t gives t = -(dx*ex+dy*ey) / (ex*ex+ey*ey). func curviest2(a, b, c Point) Fix64 { dx := int64(b.X - a.X) dy := int64(b.Y - a.Y) ex := int64(c.X - 2*b.X + a.X) ey := int64(c.Y - 2*b.Y + a.Y) if ex == 0 && ey == 0 { return 32768 } return Fix64(-65536 * (dx*ex + dy*ey) / (ex*ex + ey*ey)) } // A stroker holds state for stroking a path. type stroker struct { // p is the destination that records the stroked path. p Adder // u is the half-width of the stroke. u Fix32 // cr and jr specify how to end and connect path segments. cr Capper jr Joiner // r is the reverse path. Stroking a path involves constructing two // parallel paths 2*u apart. The first path is added immediately to p, // the second path is accumulated in r and eventually added in reverse. r Path // a is the most recent segment point. anorm is the segment normal of // length u at that point. a, anorm Point } // addNonCurvy2 adds a quadratic segment to the stroker, where the segment // defined by (k.a, b, c) achieves maximum curvature at either k.a or c. 
func (k *stroker) addNonCurvy2(b, c Point) {
	// We repeatedly divide the segment at its middle until it is straight
	// enough to approximate the stroke by just translating the control points.
	// ds and ps are stacks of depths and points. t is the top of the stack.
	const maxDepth = 5
	var (
		ds [maxDepth + 1]int
		ps [2*maxDepth + 3]Point
		t  int
	)
	// Initially the ps stack has one quadratic segment of depth zero.
	// Points are stored in reverse: the segment at stack slot t has its
	// start at ps[2*t+2], control point at ps[2*t+1] and end at ps[2*t+0].
	ds[0] = 0
	ps[2] = k.a
	ps[1] = b
	ps[0] = c
	// anorm is the normal at the start of the segment being processed;
	// cnorm becomes the normal at its end.
	anorm := k.anorm
	var cnorm Point
	for {
		depth := ds[t]
		// These shadow the parameters b and c for the rest of the iteration.
		a := ps[2*t+2]
		b := ps[2*t+1]
		c := ps[2*t+0]
		ab := b.Sub(a)
		bc := c.Sub(b)
		abIsSmall := ab.Dot(ab) < Fix64(1<<16)
		bcIsSmall := bc.Dot(bc) < Fix64(1<<16)
		if abIsSmall && bcIsSmall {
			// Approximate the segment by a circular arc.
			cnorm = bc.Norm(k.u).Rot90CCW()
			mac := midpoint(a, c)
			addArc(k.p, mac, anorm, cnorm)
			addArc(&k.r, mac, anorm.Neg(), cnorm.Neg())
		} else if depth < maxDepth && angleGreaterThan45(ab, bc) {
			// Divide the segment in two and push both halves on the stack.
			// The first half (a, mab, mid) ends up on top at slot t and is
			// processed next; the second half (mid, mbc, c) replaces the
			// current slot t-1 and shares the point ps[2*t] with it.
			mab := midpoint(a, b)
			mbc := midpoint(b, c)
			t++
			ds[t+0] = depth + 1
			ds[t-1] = depth + 1
			ps[2*t+2] = a
			ps[2*t+1] = mab
			ps[2*t+0] = midpoint(mab, mbc)
			ps[2*t-1] = mbc
			continue
		} else {
			// Translate the control points.
			bnorm := c.Sub(a).Norm(k.u).Rot90CCW()
			cnorm = bc.Norm(k.u).Rot90CCW()
			k.p.Add2(b.Add(bnorm), c.Add(cnorm))
			k.r.Add2(b.Sub(bnorm), c.Sub(cnorm))
		}
		if t == 0 {
			// Stack exhausted: record the end point and its normal.
			k.a, k.anorm = c, cnorm
			return
		}
		// Pop; the end normal of this piece is the start normal of the next.
		t--
		anorm = cnorm
	}
	panic("unreachable")
}

// Add1 adds a linear segment to the stroker.
// The first segment of a curve (k.r still empty) starts both offset paths;
// later segments are connected to the previous one through the Joiner.
func (k *stroker) Add1(b Point) {
	bnorm := b.Sub(k.a).Norm(k.u).Rot90CCW()
	if len(k.r) == 0 {
		k.p.Start(k.a.Add(bnorm))
		k.r.Start(k.a.Sub(bnorm))
	} else {
		k.jr.Join(k.p, &k.r, k.u, k.a, k.anorm, bnorm)
	}
	k.p.Add1(b.Add(bnorm))
	k.r.Add1(b.Sub(bnorm))
	k.a, k.anorm = b, bnorm
}

// Add2 adds a quadratic segment to the stroker.
func (k *stroker) Add2(b, c Point) {
	ab := b.Sub(k.a)
	bc := c.Sub(b)
	abnorm := ab.Norm(k.u).Rot90CCW()
	// As in Add1: start both offset paths on the first segment, otherwise
	// join onto the previous segment.
	if len(k.r) == 0 {
		k.p.Start(k.a.Add(abnorm))
		k.r.Start(k.a.Sub(abnorm))
	} else {
		k.jr.Join(k.p, &k.r, k.u, k.a, k.anorm, abnorm)
	}

	// Approximate nearly-degenerate quadratics by linear segments.
	abIsSmall := ab.Dot(ab) < epsilon
	bcIsSmall := bc.Dot(bc) < epsilon
	if abIsSmall || bcIsSmall {
		acnorm := c.Sub(k.a).Norm(k.u).Rot90CCW()
		k.p.Add1(c.Add(acnorm))
		k.r.Add1(c.Sub(acnorm))
		k.a, k.anorm = c, acnorm
		return
	}

	// The quadratic segment (k.a, b, c) has a point of maximum curvature.
	// If this occurs at an end point, we process the segment as a whole.
	// t is in 16.16 fixed point, so 65536 corresponds to t = 1.
	t := curviest2(k.a, b, c)
	if t <= 0 || t >= 65536 {
		k.addNonCurvy2(b, c)
		return
	}

	// Otherwise, we perform a de Casteljau decomposition at the point of
	// maximum curvature and process the two straighter parts.
	mab := interpolate(k.a, b, t)
	mbc := interpolate(b, c, t)
	mabc := interpolate(mab, mbc, t)

	// If the vectors ab and bc are close to being in opposite directions,
	// then the decomposition can become unstable, so we approximate the
	// quadratic segment by two linear segments joined by an arc.
	bcnorm := bc.Norm(k.u).Rot90CCW()
	if abnorm.Dot(bcnorm) < -Fix64(k.u)*Fix64(k.u)*2047/2048 {
		// pArc selects which side gets the arc: k.p when true, k.r otherwise.
		pArc := abnorm.Dot(bc) < 0

		k.p.Add1(mabc.Add(abnorm))
		if pArc {
			// Two arcs via the intermediate normal z, each passed to addArc
			// separately.
			z := abnorm.Rot90CW()
			addArc(k.p, mabc, abnorm, z)
			addArc(k.p, mabc, z, bcnorm)
		}
		k.p.Add1(mabc.Add(bcnorm))
		k.p.Add1(c.Add(bcnorm))

		k.r.Add1(mabc.Sub(abnorm))
		if !pArc {
			z := abnorm.Rot90CW()
			addArc(&k.r, mabc, abnorm.Neg(), z)
			addArc(&k.r, mabc, z, bcnorm.Neg())
		}
		k.r.Add1(mabc.Sub(bcnorm))
		k.r.Add1(c.Sub(bcnorm))

		k.a, k.anorm = c, bcnorm
		return
	}

	// Process the decomposed parts.
	k.addNonCurvy2(mab, mabc)
	k.addNonCurvy2(mbc, c)
}

// Add3 adds a cubic segment to the stroker.
// Stroking cubic segments is not implemented: this always panics.
func (k *stroker) Add3(b, c, d Point) {
	panic("freetype/raster: stroke unimplemented for cubic segments")
}

// stroke adds the stroked Path q to p, where q consists of exactly one curve.
func (k *stroker) stroke(q Path) { // Stroking is implemented by deriving two paths each k.u apart from q. // The left-hand-side path is added immediately to k.p; the right-hand-side // path is accumulated in k.r. Once we've finished adding the LHS to k.p, // we add the RHS in reverse order. k.r = Path(make([]Fix32, 0, len(q))) k.a = Point{q[1], q[2]} for i := 4; i < len(q); { switch q[i] { case 1: k.Add1(Point{q[i+1], q[i+2]}) i += 4 case 2: k.Add2(Point{q[i+1], q[i+2]}, Point{q[i+3], q[i+4]}) i += 6 case 3: k.Add3(Point{q[i+1], q[i+2]}, Point{q[i+3], q[i+4]}, Point{q[i+5], q[i+6]}) i += 8 default: panic("freetype/raster: bad path") } } if len(k.r) == 0 { return } // TODO(nigeltao): if q is a closed curve then we should join the first and // last segments instead of capping them. k.cr.Cap(k.p, k.u, q.lastPoint(), k.anorm.Neg()) addPathReversed(k.p, k.r) pivot := q.firstPoint() k.cr.Cap(k.p, k.u, pivot, pivot.Sub(Point{k.r[1], k.r[2]})) } // Stroke adds q stroked with the given width to p. The result is typically // self-intersecting and should be rasterized with UseNonZeroWinding. // cr and jr may be nil, which defaults to a RoundCapper or RoundJoiner. 
func Stroke(p Adder, q Path, width Fix32, cr Capper, jr Joiner) {
	if len(q) == 0 {
		return
	}
	if cr == nil {
		cr = RoundCapper
	}
	if jr == nil {
		jr = RoundJoiner
	}
	// A path must begin with a Start record (opcode 0).
	if q[0] != 0 {
		panic("freetype/raster: bad path")
	}
	s := stroker{p: p, u: width / 2, cr: cr, jr: jr}
	// Split q at every Start opcode and stroke each curve on its own:
	// i is the start of the current curve, j scans ahead record by record.
	i := 0
	for j := 4; j < len(q); {
		switch q[j] {
		case 0:
			s.stroke(q[i:j])
			i, j = j, j+4
		case 1:
			j += 4
		case 2:
			j += 6
		case 3:
			j += 8
		default:
			panic("freetype/raster: bad path")
		}
	}
	// Stroke the last (or only) curve.
	s.stroke(q[i:])
}
mumax3-3.10/gui/000077500000000000000000000000001371432437400134435ustar00rootroot00000000000000
mumax3-3.10/gui/Makefile000066400000000000000000000000211371432437400150740ustar00rootroot00000000000000
all:
	go install
mumax3-3.10/gui/button.go000066400000000000000000000006511371432437400153070ustar00rootroot00000000000000
package gui

import "fmt"

// button is the element behind Page.Button; its value is the button label.
type button struct {
	data
}

// update returns the JS call that sets the element's innerHTML to the
// current value.
func (e *button) update(id string) []jsCall {
	return []jsCall{{F: "setAttr", Args: []interface{}{id, "innerHTML", e.value()}}}
}

// Button registers a button element under id and returns its markup.
// NOTE(review): the template literal below is empty in this copy of the
// file — presumably the HTML was stripped on extraction; verify upstream.
func (d *Page) Button(id string, value interface{}, extra ...string) string {
	e := &button{data: data{value}}
	d.addElem(id, e)
	return fmt.Sprintf(``, id, id)
}
mumax3-3.10/gui/checkbox.go000066400000000000000000000007031371432437400155600ustar00rootroot00000000000000
package gui

import "fmt"

// checkbox is the element behind Page.Checkbox; its value is the checked state.
type checkbox struct {
	data
}

// update returns the JS call that sets the element's "checked" attribute to
// the current value.
func (e *checkbox) update(id string) []jsCall {
	return []jsCall{{F: "setAttr", Args: []interface{}{id, "checked", e.value()}}}
}

// Checkbox registers a checkbox element under id and returns its markup.
func (d *Page) Checkbox(id, text string, value bool, extra ...string) string {
	e := &checkbox{data: data{value}}
	d.addElem(id, e)
	return fmt.Sprintf(`%v`, id, id, text)
}
mumax3-3.10/gui/clibox.go000066400000000000000000000007231371432437400152540ustar00rootroot00000000000000
package gui

import "fmt"

// clibox is the element behind Page.CliBox.
type clibox struct {
	data
}

// update never produces JS calls for a CLI box.
func (e *clibox) update(id string) []jsCall {
	return []jsCall{} // We never set the value of the CLI box, only the user does
}

// Command-line interface textbox where user types commands.
func (d *Page) CliBox(id string, value interface{}, extra ...string) string {
	e := &clibox{data: data{value}}
	d.addElem(id, e)
	// NOTE(review): the template literals in this span are empty in this
	// copy of the file — presumably the HTML was stripped on extraction.
	return fmt.Sprintf(``, "text", id, cat(extra))
}
mumax3-3.10/gui/console.go000066400000000000000000000006711371432437400154400ustar00rootroot00000000000000
package gui

import "fmt"

// console is the element behind Page.Console.
type console struct {
	data
}

// update returns the JS call that replaces the console text with the
// current value. The id is not passed to the call.
func (e *console) update(id string) []jsCall {
	return []jsCall{{F: "setConsoleText", Args: []interface{}{e.value()}}}
}

// Console registers a console element under id and returns its markup.
func (d *Page) Console(id string, rows, cols int, value interface{}, extra ...string) string {
	e := &console{data: data{value}}
	d.addElem(id, e)
	return fmt.Sprintf(``, id, rows, cols, cat(extra))
}
mumax3-3.10/gui/data.go000066400000000000000000000002311371432437400146770ustar00rootroot00000000000000
package gui

// data stores an arbitrary value and provides the set/value half of the
// El interface for the element types that embed it.
type data struct {
	val interface{}
}

func (d *data) set(v interface{}) { d.val = v }
func (d *data) value() interface{} { return d.val }
mumax3-3.10/gui/datamodels.go000066400000000000000000000020361371432437400161100ustar00rootroot00000000000000
package gui

import (
	"fmt"
	"log"
	"strconv"
)

// interfaceData stores a value of any type.
type interfaceData struct {
	v interface{}
}

func (d *interfaceData) setValue(v interface{}) { d.v = v }
func (d *interfaceData) value() interface{}     { return d.v }

// boolData only accepts bool values.
type boolData struct{ interfaceData }

// setValue panics if v is not a bool (unchecked type assertion).
func (d *boolData) setValue(v interface{}) { d.v = v.(bool) }
func BoolData(v bool) *boolData            { return &boolData{interfaceData{v}} }

// intData stores an int, converting other inputs when possible.
type intData struct{ interfaceData }

func IntData(v int) *intData { return &intData{interfaceData{v}} }

// setValue stores v if it is an int. Any other value is formatted with
// fmt.Sprint and parsed with strconv.Atoi; on a parse error the error is
// logged and the previous value is kept.
func (d *intData) setValue(v interface{}) {
	switch v := v.(type) {
	case int:
		d.v = v
	default:
		i, err := strconv.Atoi(fmt.Sprint(v))
		if err == nil {
			d.v = i
		} else {
			log.Println(err)
		}
	}
}

// floatData stores a float64, converting other inputs when possible.
type floatData struct{ interfaceData }

func FloatData(v float64) *floatData { return &floatData{interfaceData{v}} }

// setValue stores v if it is a float64. Any other value is formatted with
// fmt.Sprint and parsed with strconv.ParseFloat; on a parse error the error
// is logged and the previous value is kept.
func (d *floatData) setValue(v interface{}) {
	switch v := v.(type) {
	case float64:
		d.v = v
	default:
		i, err := strconv.ParseFloat(fmt.Sprint(v), 64)
		if err == nil {
			d.v = i
		} else {
			log.Println(err)
		}
	}
}
mumax3-3.10/gui/el.go000066400000000000000000000044601371432437400143760ustar00rootroot00000000000000
package gui

import "sync"

// wraps a GUI element (button, textbox, ...),
// stores the dirty flag, extra attributes, lock event handler, ...
type E struct {
	_m      sync.Mutex
	_dirty  bool                   // dirty means the value/attributes need updating in browser
	_attr   map[string]interface{} // extra html attributes (e.g. style, onclick, ...)
	_elem   El                     // the wrapped gui element
	onevent func()                 // called upon value change by user (not by Go code)
}

// newE wraps elem; a new element starts out dirty.
func newE(elem El) *E {
	return &E{_elem: elem, _dirty: true}
}

// atomically pass a new value to the underlying element and mark it dirty.
// The != comparison below is on interface values.
func (e *E) set(v interface{}) {
	e._m.Lock()
	defer e._m.Unlock()
	old := e._elem.value()
	// carefully check if value changed, set/value may do things behind the screens
	e._elem.set(v)
	if e._elem.value() != old {
		e._dirty = true
	}
}

// atomically set an html attribute for the underlying element and mark it dirty
func (e *E) attr(key string, v interface{}) {
	e._m.Lock()
	defer e._m.Unlock()
	// The attribute map is created on first use.
	if e._attr == nil {
		e._attr = make(map[string]interface{})
	}
	old := e._attr[key]
	if v != old {
		e._dirty = true
	}
	e._attr[key] = v
}

// atomically produce a list of javascript calls needed to update the element in the browser,
// and clear dirty flag
func (e *E) update(id string) []jsCall {
	e._m.Lock()
	defer e._m.Unlock()
	if !e._dirty {
		return []jsCall{}
	}
	upd := e._elem.update(id)
	// Every extra attribute is re-sent along with the value.
	for k, v := range e._attr {
		upd = append(upd, jsCall{F: "setAttr", Args: []interface{}{id, k, v}})
	}
	e._dirty = false
	return upd
}

// atomically returns the underlying element's value
// depending its implementation (e.g. textBox's text, checkBox's checked value, etc.)
func (e *E) value() interface{} {
	e._m.Lock()
	defer e._m.Unlock()
	return e._elem.value()
}

// atomically set the dirty flag w/o changing value.
// called, e.g., when a second browser window opens
func (e *E) setDirty() {
	e._m.Lock()
	defer e._m.Unlock()
	e._dirty = true
}

// Atomically set a new onevent function, which is called each time
// the user changes the underlying elements value.
func (e *E) OnEvent(f func()) {
	e._m.Lock()
	defer e._m.Unlock()
	e.onevent = f
}

// Underlying html element like Span, TextBox, etc.
type El interface {
	update(id string) []jsCall // JS calls that refresh the element with the given id
	set(v interface{})         // store a new value
	value() interface{}        // return the current value
}
mumax3-3.10/gui/element.go000066400000000000000000000006151371432437400154250ustar00rootroot00000000000000
package gui

import "fmt"

// element is the generic element behind Page.Element.
type element struct {
	data
}

// update returns the JS call that sets the element's innerHTML to the
// current value.
func (e *element) update(id string) []jsCall {
	return []jsCall{{F: "setAttr", Args: []interface{}{id, "innerHTML", e.value()}}}
}

// Element registers a generic element under id and returns its markup.
// typ and attr are not used in the visible body.
// NOTE(review): the template literals in this span are blank in this copy
// of the file — presumably the HTML was stripped on extraction.
func (d *Page) Element(id, typ, attr string, value interface{}, extra ...string) string {
	e := &element{data: data{value}}
	d.addElem(id, e)
	return fmt.Sprintf(` `, id, cat(extra))
}
mumax3-3.10/gui/img.go000066400000000000000000000005451371432437400145520ustar00rootroot00000000000000
package gui

import "fmt"

// img is the element behind Page.Img; its value is the image source.
type img struct {
	data
}

// update returns the JS call that sets the element's "src" attribute to the
// current value.
func (e *img) update(id string) []jsCall {
	return []jsCall{{F: "setAttr", Args: []interface{}{id, "src", e.value()}}}
}

// Img registers an image element under id and returns its markup.
func (d *Page) Img(id string, value interface{}, extra ...string) string {
	e := &img{data: data{value}}
	d.addElem(id, e)
	return fmt.Sprintf(` `, id, cat(extra))
}
mumax3-3.10/gui/js.go000066400000000000000000000105731371432437400144140ustar00rootroot00000000000000
package gui

// Javascript for the GUI page.
const JS = ``
mumax3-3.10/gui/meter.go000066400000000000000000000006021371432437400151040ustar00rootroot00000000000000
package gui

import "fmt"

// meter is the element behind Page.Meter.
type meter struct {
	data
}

// update returns the JS call that sets the element's "value" attribute to
// the current value.
func (e *meter) update(id string) []jsCall {
	return []jsCall{{F: "setAttr", Args: []interface{}{id, "value", e.value()}}}
}

// Meter registers a meter element under id and returns its markup.
// NOTE(review): the template literals in this span (and the JS constant
// above) are empty in this copy of the file — presumably the markup was
// stripped on extraction; verify upstream.
func (d *Page) Meter(id string, min, max, value int, extra ...string) string {
	e := &meter{data: data{value}}
	d.addElem(id, e)
	return fmt.Sprintf(``, id, min, max)
}
mumax3-3.10/gui/number.go000066400000000000000000000010241371432437400152570ustar00rootroot00000000000000
package gui

import "fmt"

// number is the element behind Page.Number.
type number struct {
	data
}

// update returns the JS call that sets the element's "value" attribute to
// the current value.
func (e *number) update(id string) []jsCall {
	return []jsCall{{F: "setAttr", Args: []interface{}{id, "value", e.value()}}}
}

// Number registers a numeric input element under id and returns its markup.
func (d *Page) Number(id string, min, max, value int, extra ...string) string {
	e := &number{data: data{value}}
	d.addElem(id, e)
	return fmt.Sprintf(``, "number", id, id, id, id, min, max, cat(extra))
}
mumax3-3.10/gui/page.go000066400000000000000000000124431371432437400147120ustar00rootroot00000000000000
package gui

import (
	"bytes"
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"sync"
	"text/template"
)

// Debug enables printing of received events and outgoing JS calls.
var Debug = false

// Page holds the state to serve a single GUI page to the browser
type Page struct {
	elems      map[string]*E // registered elements, by id
	htmlCache  []byte        // static html content, rendered only once
	haveJS     bool          // have called JS()?
	data       interface{}   // any additional data to be passed to template
	onUpdate   func()        // called on every update poll, may be nil
	onAnyEvent func()        // called on every event, may be nil
	httpLock   sync.Mutex    // held for the duration of each ServeHTTP call
	lastPageID string        // page ID sent with the most recent update poll
}

// NewPage constructs a Page based on an HTML template containing
// element tags like {{.Button}}, {{.Textbox}}, etc. data is fed
// to the template as additional arbitrary data, available as {{.Data}}.
func NewPage(htmlTemplate string, data interface{}) *Page { d := &Page{elems: make(map[string]*E), data: data} // exec template (once) t := template.Must(template.New("").Parse(htmlTemplate)) cache := bytes.NewBuffer(nil) check(t.Execute(cache, d)) d.htmlCache = cache.Bytes() // check if template contains {{.JS}} if !d.haveJS { log.Panic("template should call {{.JS}}") } return d } // Value returns the value of the HTML element with given id. // E.g.: the text in a textbox, the checked value of a checkbox, etc. func (d *Page) Value(id string) interface{} { return d.elem(id).value() } // StringValue is like Value but returns the value as string, // converting if necessary. func (d *Page) StringValue(id string) string { v := d.Value(id) if s, ok := v.(string); ok { return s } else { return fmt.Sprint(v) } } func (d *Page) Set(id string, v interface{}) { d.elem(id).set(v) } func (d *Page) Attr(id string, k string, v interface{}) { d.elem(id).attr(k, v) } // OnEvent sets a handler to be called when an event happens // to the HTML element with given id. The event depends on the // element type: click for Button, change for TextBox, etc... func (d *Page) OnEvent(id string, handler func()) { d.elem(id).onevent = handler } // OnEvent sets a handler to be called when an event happens // to any of the page's HTML elements. func (d *Page) OnAnyEvent(handler func()) { d.onAnyEvent = handler } // Set func to be executed each time javascript polls for updates func (d *Page) OnUpdate(f func()) { d.onUpdate = f } // {{.JS}} should always be embedded in the template . // Expands to needed JavaScript code. func (d *Page) JS() string { d.haveJS = true return JS } // {{.ErrorBox}} should be embedded in the template where errors are to be shown. // CSS rules for class ErrorBox may be set, e.g., to render errors in red. 
func (t *Page) ErrorBox() string {
	return ` `
}

// {{.UpdateButton}} inserts a button that updates the page.
// An empty text selects the default label.
func (t *Page) UpdateButton(text string) string {
	if text == "" {
		text = `↻`
	}
	return ``
}

// {{.UpdateBox}} inserts a checkbox that toggles automatic updates.
// An empty text selects the default label.
func (t *Page) UpdateBox(text string) string {
	if text == "" {
		text = "auto update"
	}
	return `` + text + ``
}

// {{.Data}} gives the template access to the extra data passed to NewPage.
func (t *Page) Data() interface{} {
	return t.data
}

// elem looks up the element registered under id and panics when there is none.
func (d *Page) elem(id string) *E {
	e, found := d.elems[id]
	if !found {
		panic("no element with id: " + id)
	}
	return e
}

// addElem registers e under id and panics when the id is already taken.
func (d *Page) addElem(id string, e El) {
	if _, taken := d.elems[id]; taken {
		panic("addElem: already defined: " + id)
	}
	d.elems[id] = newE(e)
}

// ServeHTTP implements http.Handler. Requests are handled one at a time:
// GET serves the page, POST serves element updates and PUT delivers an
// event; any other method is answered with 403 Forbidden.
func (d *Page) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	d.httpLock.Lock()
	defer d.httpLock.Unlock()

	switch r.Method {
	case "GET":
		d.serveContent(w, r)
	case "POST":
		d.serveUpdate(w, r)
	case "PUT":
		d.serveEvent(w, r)
	default:
		http.Error(w, "not allowed: "+r.Method+" "+r.URL.Path, http.StatusForbidden)
	}
}

// serves the html content.
func (d *Page) serveContent(w http.ResponseWriter, r *http.Request) { w.Write(d.htmlCache) } // HTTP handler for event notifications by button clicks etc func (d *Page) serveEvent(w http.ResponseWriter, r *http.Request) { var ev event check(json.NewDecoder(r.Body).Decode(&ev)) if Debug { fmt.Println(ev) } if d.onAnyEvent != nil { d.onAnyEvent() } el := d.elem(ev.ID) el.set(ev.Arg) if el.onevent != nil { el.onevent() } } type event struct { ID string Arg interface{} } // HTTP handler for updating the dynamic elements func (d *Page) serveUpdate(w http.ResponseWriter, r *http.Request) { if d.onUpdate != nil { d.onUpdate() } // read page ID from body buf := make([]byte, 100) r.Body.Read(buf) pageID := string(buf) if pageID != d.lastPageID { for _, e := range d.elems { e.setDirty() } d.lastPageID = pageID } calls := make([]jsCall, 0, len(d.elems)) for id, e := range d.elems { calls = append(calls, e.update(id)...) // update atomically checks dirty and clears it } if Debug && len(calls) != 0 { fmt.Println(calls) // debug } check(json.NewEncoder(w).Encode(calls)) } // javascript call type jsCall struct { F string // function to call Args []interface{} // function arguments } func check(err error) { if err != nil { log.Panic(err) } } mumax3-3.10/gui/page_test.go000066400000000000000000000001031371432437400157370ustar00rootroot00000000000000package gui import "testing" func TestNewPage(t *testing.T) { } mumax3-3.10/gui/progress.go000066400000000000000000000006031371432437400156350ustar00rootroot00000000000000package gui import "fmt" type progress struct { data } func (e *progress) update(id string) []jsCall { return []jsCall{{F: "setAttr", Args: []interface{}{id, "value", e.value()}}} } func (d *Page) Progress(id string, max, value int, extra ...string) string { e := &progress{data: data{value}} d.addElem(id, e) return fmt.Sprintf(``, id, max) } mumax3-3.10/gui/range.go000066400000000000000000000007221371432437400150670ustar00rootroot00000000000000package gui import "fmt" 
// slider is the element behind Page.Range.
type slider struct {
	data
}

// update returns the JS call that sets the element's "value" attribute to
// the current value.
func (e *slider) update(id string) []jsCall {
	return []jsCall{{F: "setAttr", Args: []interface{}{id, "value", e.value()}}}
}

// Range registers a slider element under id and returns its markup.
// NOTE(review): the template literals in this span are empty in this copy
// of the file — presumably the HTML was stripped on extraction.
func (d *Page) Range(id string, min, max, value int, extra ...string) string {
	e := &slider{data: data{value}}
	d.addElem(id, e)
	return fmt.Sprintf(``, id, min, max, id)
}
mumax3-3.10/gui/select.go000066400000000000000000000012751371432437400152560ustar00rootroot00000000000000
package gui

import "fmt"

// sel is the element behind Page.Select; its value is the selected option.
type sel struct {
	data
}

// update returns the JS call that selects the current value in the browser.
func (e *sel) update(id string) []jsCall {
	return []jsCall{{F: "setSelect", Args: []interface{}{id, e.value()}}}
}

// SelectArray is Select with the options given as a slice.
func (d *Page) SelectArray(id string, value string, options []string) string {
	return d.Select(id, value, options...)
}

// Select registers a drop-down element under id and returns its markup.
// NOTE(review): the remainder of this function is visibly truncated in this
// copy of the file (the Sprintf call is never closed); presumably the markup
// and the code between the HTML fragments were stripped on extraction.
func (d *Page) Select(id string, value string, options ...string) string {
	e := &sel{data: data{value}}
	d.addElem(id, e)
	html := fmt.Sprintf(`` return html }
mumax3-3.10/gui/span.go000066400000000000000000000006741371432437400147420ustar00rootroot00000000000000
package gui

import "fmt"

// span is the element behind Page.Span.
type span struct {
	data
}

// update returns the JS call that sets the element's innerHTML to the
// current value.
func (e *span) update(id string) []jsCall {
	return []jsCall{{F: "setAttr", Args: []interface{}{id, "innerHTML", e.value()}}}
}

// {{.Span id value}} adds a piece of text ("label") to the document.
func (d *Page) Span(id string, value interface{}, extra ...string) string {
	e := &span{data: data{value}}
	d.addElem(id, e)
	// NOTE(review): the template literals in this span are blank in this
	// copy of the file — presumably the HTML was stripped on extraction.
	return fmt.Sprintf(` `, id, cat(extra))
}
mumax3-3.10/gui/textbox.go000066400000000000000000000010341371432437400154650ustar00rootroot00000000000000
package gui

import "fmt"

// textbox is the element behind Page.TextBox; its value is the box's text.
type textbox struct {
	data
}

// update returns the JS call that sets the text box content to the current
// value.
func (e *textbox) update(id string) []jsCall {
	return []jsCall{{F: "setTextbox", Args: []interface{}{id, e.value()}}}
}

// TextBox registers a text box element under id and returns its markup.
func (d *Page) TextBox(id string, value interface{}, extra ...string) string {
	e := &textbox{data: data{value}}
	d.addElem(id, e)
	return fmt.Sprintf(``, "text", id, id, id, id, id, cat(extra))
}
mumax3-3.10/gui/util.go000066400000000000000000000002071371432437400147460ustar00rootroot00000000000000
package gui

// concatenate elements
// Each element is followed by a single space, including the last one.
func cat(s []string) string {
	str := ""
	for _, s := range s {
		str += s + " "
	}
	return str
}
mumax3-3.10/httpfs/000077500000000000000000000000001371432437400141675ustar00rootroot00000000000000
mumax3-3.10/httpfs/Makefile000066400000000000000000000000211371432437400156200ustar00rootroot00000000000000
all:
	go install
mumax3-3.10/httpfs/client.go000066400000000000000000000101361371432437400157750ustar00rootroot00000000000000
package httpfs

// client-side API

import (
	"bytes"
	"encoding/json"
	"errors"
	"fmt"
	"io/ioutil"
	"net/http"
	"net/url"
	"path"
	"strings"
)

var wd = "" // working directory, see SetWD

// SetWD sets a "working directory" for the client side,
// prefixed to all relative local paths passed to client functions (Mkdir, Touch, Remove, ...).
// dir may start with "http://", turning local relative client paths into remote paths.
// E.g.:
// 	http://path -> http://path
// 	path/file   -> wd/path/file
// 	/path/file  -> /path/file
//
// A trailing "/" is appended to a non-empty dir when missing, so the prefix
// can be concatenated directly with a relative path.
func SetWD(dir string) {
	if dir != "" && !strings.HasSuffix(dir, "/") {
		dir = dir + "/"
	}
	wd = dir
}

// Mkdir creates a directory at specified URL.
func Mkdir(URL string) error { URL = addWorkDir(URL) if isRemote(URL) { return httpMkdir(URL) } else { return localMkdir(URL) } } // Touch creates an empty file at the specified URL. func Touch(URL string) error { URL = addWorkDir(URL) if isRemote(URL) { return httpTouch(URL) } else { return localTouch(URL) } } // ReadDir reads and returns all file names in the directory at URL. func ReadDir(URL string) ([]string, error) { URL = addWorkDir(URL) if isRemote(URL) { return httpLs(URL) } else { return localLs(URL) } } // Remove removes the file or directory at URL, and all children it may contain. // Similar to os.RemoveAll. func Remove(URL string) error { URL = addWorkDir(URL) if isRemote(URL) { return httpRemove(URL) } else { return localRemove(URL) } } // Read the entire file and return its contents. func Read(URL string) ([]byte, error) { URL = addWorkDir(URL) if isRemote(URL) { return httpRead(URL) } else { return localRead(URL) } } // Append p to the file given by URL, // but first assure that the file had the expected size. // Used to avoid accidental concurrent writes by two processes to the same file. // Size < 0 disables size check. func AppendSize(URL string, p []byte, size int64) error { URL = addWorkDir(URL) if isRemote(URL) { return httpAppend(URL, p, size) } else { return localAppend(URL, p, size) } } // Append p to the file given by URL. func Append(URL string, p []byte) error { return AppendSize(URL, p, -1) } // Create file given by URL and put data from p there. 
func Put(URL string, p []byte) error { URL = addWorkDir(URL) if isRemote(URL) { return httpPut(URL, p) } else { return localPut(URL, p) } } func isRemote(URL string) bool { return strings.HasPrefix(URL, "http://") } // prefix wd to URL if URL is a relative file path // does not start with "/", "http://" func addWorkDir(URL string) string { if isRemote(URL) { return URL } if !path.IsAbs(URL) { return wd + URL } return URL } func httpMkdir(URL string) error { _, err := do(MKDIR, URL, nil, nil) return err } func httpTouch(URL string) error { _, err := do(TOUCH, URL, nil, nil) return err } func httpLs(URL string) (ls []string, err error) { r, errHTTP := do(LS, URL, nil, nil) if errHTTP != nil { return nil, errHTTP } errJSON := json.Unmarshal(r, &ls) if errJSON != nil { return nil, mkErr(LS, URL, errJSON) } return ls, nil } func httpAppend(URL string, data []byte, size int64) error { var query map[string][]string if size >= 0 { query = map[string][]string{"size": {fmt.Sprint(size)}} } _, err := do(APPEND, URL, data, query) return err } func httpPut(URL string, data []byte) error { _, err := do(PUT, URL, data, nil) return err } func httpRead(URL string) ([]byte, error) { return do(READ, URL, nil, nil) } func httpRemove(URL string) error { _, err := do(RM, URL, nil, nil) return err } // do a http request. 
func do(a action, URL string, body []byte, query url.Values) (resp []byte, err error) { u, err := url.Parse(URL) u.Path = string(a) + path.Clean("/"+u.Path) u.RawQuery = query.Encode() response, errR := http.Post(u.String(), "data", bytes.NewReader(body)) if errR != nil { return nil, mkErr(a, URL, errR) } defer response.Body.Close() if response.StatusCode != http.StatusOK { return nil, errors.New("do " + u.String() + ":" + response.Status + ":" + readBody(response.Body)) } resp, err = ioutil.ReadAll(response.Body) err = mkErr(a, URL, err) return } mumax3-3.10/httpfs/httpfs.go000066400000000000000000000044521371432437400160330ustar00rootroot00000000000000/* Package httpfs provides a (userspace) file system API over http. httpfs is used by mumax3-server to proved file system access to the compute nodes. The API is similar to go's os package, but both local file names and URLs may be passed. When the file "name" starts with "http://", it is treated as a remote file, otherwise it is local. Hence, the same API is used for local and remote file access. 
*/ package httpfs import ( "fmt" "io" "io/ioutil" "log" "os" "path" ) var Logging = false // enables logging const ( DirPerm = 0777 // permissions for new directory FilePerm = 0666 // permissions for new files ) func readBody(r io.ReadCloser) string { defer r.Close() b, err := ioutil.ReadAll(r) if err != nil { log.Println("readbody:", err) return "" } return string(b) } func mkErr(a action, URL string, err error) error { if err == nil { return nil } else { return fmt.Errorf("httpfs %v %v: %v", a, URL, err) } } func localMkdir(fname string) error { return os.Mkdir(fname, DirPerm) } func localTouch(fname string) error { f, err := os.OpenFile(fname, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0666) if err == nil { f.Close() } return err } func localLs(fname string) ([]string, error) { f, err := os.Open(fname) if err != nil { return nil, err } defer f.Close() ls, err2 := f.Readdirnames(-1) if err2 != nil { return nil, err2 } return ls, nil } func localAppend(fname string, data []byte, size int64) error { f, err := os.OpenFile(fname, os.O_APPEND|os.O_WRONLY, FilePerm) if err != nil { return err } defer f.Close() if size >= 0 { fi, errFi := f.Stat() if errFi != nil { return errFi } if size != fi.Size() { return fmt.Errorf(`httpfs: file size mismatch, possible concurrent access. size=%v B, expected=%v B`, fi.Size(), size) } } _, err2 := f.Write(data) return err2 } func localPut(fname string, data []byte) error { _ = os.MkdirAll(path.Dir(fname), DirPerm) f, err := os.OpenFile(fname, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, FilePerm) if err != nil { return err } defer f.Close() _, err2 := f.Write(data) return err2 } func localRead(fname string) ([]byte, error) { return ioutil.ReadFile(fname) } func localRemove(fname string) error { return os.RemoveAll(fname) } func Log(msg ...interface{}) { if Logging { log.Println(msg...) 
} } mumax3-3.10/httpfs/httpfs_test.go000066400000000000000000000130321371432437400170640ustar00rootroot00000000000000package httpfs import ( "fmt" "net" "net/http" "testing" ) // leaving this many files open is supposed to trigger os error. const MANYFILES = 1025 // start local httpfs server, and use http://address/ as WD func init() { l, err := net.Listen("tcp", ":12345") if err != nil { panic(err) } addr := "http://" + l.Addr().String() SetWD(addr) RegisterHandlers() fmt.Println("serving httpfs:", addr) go func() { if err := http.Serve(l, nil); err != nil { panic(err) } }() } func TestMkdirRemove(t *testing.T) { Remove("testdata") defer Remove("testdata") mustPass(t, Mkdir("testdata")) mustFail(t, Mkdir("testdata")) // test for closing files (internally) for i := 0; i < MANYFILES; i++ { mustPass(t, Remove("testdata")) mustPass(t, Mkdir("testdata")) } } func TestMkdir(t *testing.T) { Remove("testdata") defer Remove("testdata") mustFail(t, Mkdir("testdata/bla/bla")) mustPass(t, Mkdir("testdata/")) mustPass(t, Mkdir("testdata/bla")) mustPass(t, Mkdir("testdata/bla/bla")) } func TestTouch(t *testing.T) { Remove("testdata") defer Remove("testdata") mustFail(t, Touch("testdata/file")) mustPass(t, Mkdir("testdata/")) mustPass(t, Touch("testdata/file")) // test for closing files (internally) for i := 0; i < MANYFILES; i++ { mustPass(t, Touch("testdata/file")) } } func TestReaddir(t *testing.T) { Remove("testdata") defer Remove("testdata") s := func(s []string, e error) error { return e } mustFail(t, s(ReadDir("testdata"))) // test for closing files (internally) for i := 0; i < MANYFILES; i++ { mustFail(t, s(ReadDir("testdata"))) } mustPass(t, Mkdir("testdata/")) mustPass(t, Touch("testdata/file1")) mustPass(t, Touch("testdata/file2")) mustPass(t, Touch("testdata/file3")) ls, err := ReadDir("testdata") if err != nil { t.Error(err) } if len(ls) != 3 { t.Fail() } // test for closing files (internally) for i := 0; i < MANYFILES; i++ { mustPass(t, s(ReadDir("testdata"))) } } 
func TestRemove(t *testing.T) { Remove("testdata") defer Remove("testdata") mustPass(t, Remove("testdata")) // test for closing files (internally) for i := 0; i < MANYFILES; i++ { mustPass(t, Remove("testdata")) } } func TestAppendRead(t *testing.T) { Remove("testdata") defer Remove("testdata") mustPass(t, Mkdir("testdata")) data := []byte("hello httpfs\n") mustFail(t, Append("testdata/file", data)) // file does not exist yet mustPass(t, Touch("testdata/file")) for i := 0; i < MANYFILES; i++ { mustPass(t, Append("testdata/file", data)) } b, errR := Read("testdata/file") if errR != nil { t.Error(errR) } if len(b) != (MANYFILES)*len(data) { t.Error(len(b), (MANYFILES+1)*len(data)) } } func TestConcurrentWrite(t *testing.T) { Remove("testdata") defer Remove("testdata") mustPass(t, Mkdir("testdata")) mustPass(t, Touch("testdata/file")) f1 := MustCreate("testdata/file") f2 := MustCreate("testdata/file") fmt.Fprintln(f1, "a") mustPass(t, f1.Flush()) fmt.Fprintln(f2, "a") mustFail(t, f2.Flush()) for i := 0; i < MANYFILES; i++ { fmt.Fprintln(f1, "a") mustPass(t, f1.Flush()) fmt.Fprintln(f2, "a") mustFail(t, f2.Flush()) } } func TestAppendSize(t *testing.T) { Remove("testdata") defer Remove("testdata") mustPass(t, Mkdir("testdata")) data := []byte("hello httpfs\n") mustFail(t, AppendSize("testdata/file", data, 0)) // file does not exist yet mustFail(t, AppendSize("testdata/file", data, 1)) // file does not exist yet mustPass(t, Touch("testdata/file")) for i := 0; i < MANYFILES; i++ { mustPass(t, AppendSize("testdata/file", data, int64(i)*int64(len(data)))) } b, errR := Read("testdata/file") if errR != nil { t.Error(errR) } if len(b) != (MANYFILES)*len(data) { t.Error(len(b), (MANYFILES+1)*len(data)) } } func TestAppendSizeBad(t *testing.T) { Remove("testdata") defer Remove("testdata") mustPass(t, Mkdir("testdata")) mustPass(t, Touch("testdata/file")) data := []byte("hello httpfs\n") for i := 0; i < MANYFILES; i++ { mustFail(t, AppendSize("testdata/file", data, 3)) // bad 
size } } func TestPutRead(t *testing.T) { Remove("testdata") defer Remove("testdata") mustPass(t, Mkdir("testdata")) data := []byte("hello httpfs\n") // must pass if file does not yet exist for i := 0; i < MANYFILES; i++ { mustPass(t, Put("testdata/file", data)) } b, errR := Read("testdata/file") if errR != nil { t.Error(errR) } if len(b) != len(data) { t.Error(len(b), (MANYFILES+1)*len(data)) } } func TestReaderWriter(t *testing.T) { Remove("testdata") defer Remove("testdata") mustPass(t, Mkdir("testdata")) // open file for reading when it's not yet there { out, errO := Open("testdata/file") if errO == nil { t.Fail() } if out != nil { t.Fail() } } for i := 0; i < MANYFILES; i++ { // create and write to file { out, errO := Create("testdata/file") if errO != nil { t.Fail() } if out == nil { t.Fail() } _, errW := fmt.Fprintln(out, "hello_httpfs") if errW != nil { t.Fail() } mustPass(t, out.Close()) } // open file for reading and check content { f, errO := Open("testdata/file") if errO != nil { t.Fail() } if f == nil { t.Fail() } var str string _, err := fmt.Fscan(f, &str) if err != nil { t.Error(err) } if str != "hello_httpfs" { t.Error(str) } if i == 0 { mustPass(t, f.Close()) // it's not needed to close the file } } } } func mustPass(t *testing.T, err error) { if err != nil { t.Fatal(err) } } func mustFail(t *testing.T, err error) { if err == nil { t.Fatal("did not get error") } } mumax3-3.10/httpfs/reader.go000066400000000000000000000030171371432437400157610ustar00rootroot00000000000000package httpfs // Utility functions on top of standard httpfs protocol import ( "bufio" "bytes" "io" "io/ioutil" ) const BUFSIZE = 16 * 1024 * 1024 // bufio buffer size // create a file for writing, clobbers previous content if any. 
func Create(URL string) (WriteCloseFlusher, error) { _ = Remove(URL) err := Touch(URL) if err != nil { return nil, err } return &bufWriter{bufio.NewWriterSize(&appendWriter{URL, 0}, BUFSIZE)}, nil } func MustCreate(URL string) WriteCloseFlusher { f, err := Create(URL) if err != nil { panic(err) } return f } type WriteCloseFlusher interface { io.WriteCloser Flush() error } // open a file for reading func Open(URL string) (io.ReadCloser, error) { data, err := Read(URL) if err != nil { return nil, err } return ioutil.NopCloser(bytes.NewReader(data)), nil } func MustOpen(URL string) io.ReadCloser { f, err := Open(URL) if err != nil { panic(err) } return f } type bufWriter struct { buf *bufio.Writer } func (w *bufWriter) Write(p []byte) (int, error) { return w.buf.Write(p) } func (w *bufWriter) Close() error { err := w.buf.Flush() w.buf = nil // Dangling pointer somewhere? if err != nil { return err } return nil } func (w *bufWriter) Flush() error { return w.buf.Flush() } type appendWriter struct { URL string byteCount int64 } func (w *appendWriter) Write(p []byte) (int, error) { err := AppendSize(w.URL, p, w.byteCount) if err != nil { return 0, err // don't know how many bytes written } w.byteCount += int64(len(p)) return len(p), nil } mumax3-3.10/httpfs/server.go000066400000000000000000000054631371432437400160340ustar00rootroot00000000000000package httpfs // server-side httpfs code import ( "encoding/json" "io" "io/ioutil" "net/http" "net/url" "strconv" ) // file action gets its own type to avoid mixing up with other strings type action string // httpfs actions, handled at /actionName/ (e.g. /ls/, /mkdir/, ...) const ( APPEND action = "append" LS action = "ls" MKDIR action = "mkdir" PUT action = "put" READ action = "read" RM action = "rm" TOUCH action = "touch" ) // RegisterHandlers sets up the http handlers needed for the httpfs protocol (calling go's http.Handle). // After RegisterHandlers, http.ListenAndServe may be called. 
func RegisterHandlers() { m := map[action]handlerFunc{ APPEND: handleAppend, LS: handleLs, MKDIR: handleMkdir, PUT: handlePut, READ: handleRead, RM: handleRemove, TOUCH: handleTouch, } for k, v := range m { http.HandleFunc("/"+string(k)+"/", newHandler(k, v)) } http.Handle("/fs/", http.StripPrefix("/fs/", http.FileServer(http.Dir(".")))) } // general handler func for file name, optional URL query, input data and response writer. type handlerFunc func(fname string, data []byte, w io.Writer, query url.Values) error func newHandler(prefix action, f handlerFunc) http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { fname := r.URL.Path[len(prefix)+2:] // strip "/prefix/" query := r.URL.Query() data, err := ioutil.ReadAll(r.Body) Log("httpfs req:", prefix, fname, query.Encode(), len(data), "B payload") if err != nil { Log("httpfs err:", prefix, fname, ":", err) http.Error(w, err.Error(), http.StatusBadRequest) } err2 := f(fname, data, w, query) if err2 != nil { Log("httpfs err:", prefix, fname, ":", err2) http.Error(w, err2.Error(), http.StatusInternalServerError) } } } func handleAppend(fname string, data []byte, w io.Writer, q url.Values) error { size := int64(-1) s := q.Get("size") if s != "" { var err error size, err = strconv.ParseInt(s, 0, 64) if err != nil { return err } } return localAppend(fname, data, size) } func handlePut(fname string, data []byte, w io.Writer, q url.Values) error { return localPut(fname, data) } func handleLs(fname string, data []byte, w io.Writer, q url.Values) error { ls, err := localLs(fname) if err != nil { return err } return json.NewEncoder(w).Encode(ls) } func handleMkdir(fname string, data []byte, w io.Writer, q url.Values) error { return localMkdir(fname) } func handleTouch(fname string, data []byte, w io.Writer, q url.Values) error { return localTouch(fname) } func handleRead(fname string, data []byte, w io.Writer, q url.Values) error { b, err := localRead(fname) if err != nil { return err } _, err2 := w.Write(b) 
return err2 } func handleRemove(fname string, data []byte, w io.Writer, q url.Values) error { return localRemove(fname) } mumax3-3.10/mag/000077500000000000000000000000001371432437400134235ustar00rootroot00000000000000mumax3-3.10/mag/Makefile000066400000000000000000000000251371432437400150600ustar00rootroot00000000000000all: go install -v mumax3-3.10/mag/constants.go000066400000000000000000000005121371432437400157640ustar00rootroot00000000000000// package mag provides magnetism-specific constants and the demag kernel. package mag import "math" const ( Mu0 = 4 * math.Pi * 1e-7 // Permeability of vacuum in Tm/A MuB = 9.2740091523e-24 // Bohr magneton in J/T Kb = 1.380650424e-23 // Boltzmann's constant in J/K Qe = 1.60217646e-19 // Electron charge in C ) mumax3-3.10/mag/demagkernel.go000066400000000000000000000350151371432437400162340ustar00rootroot00000000000000package mag import ( "bufio" "fmt" "github.com/mumax/3/data" "github.com/mumax/3/oommf" "github.com/mumax/3/timer" "github.com/mumax/3/util" "math" "os" ) // Obtains the demag kernel either from cacheDir/ or by calculating (and then storing in cacheDir for next time). // Empty cacheDir disables caching. func DemagKernel(inputSize, pbc [3]int, cellsize [3]float64, accuracy float64, cacheDir string) (kernel [3][3]*data.Slice) { timer.Start("kernel_init") timer.Stop("kernel_init") // warm-up timer.Start("kernel_init") defer timer.Stop("kernel_init") sanityCheck(cellsize, pbc) // Cache disabled if cacheDir == "" { util.Log(`//Not using kernel cache (-cache="")`) return CalcDemagKernel(inputSize, pbc, cellsize, accuracy) } // Error-resilient kernel cache: if anything goes wrong, return calculated kernel. 
defer func() { if err := recover(); err != nil { util.Log("//Unable to use kernel cache:", err) kernel = CalcDemagKernel(inputSize, pbc, cellsize, accuracy) } }() // Try to load kernel basename := fmt.Sprint(cacheDir, "/", "mumax3kernel_", inputSize, "_", pbc, "_", cellsize, "_", accuracy, "_") var errLoad error for i := 0; i < 3; i++ { for j := i; j < 3; j++ { if inputSize[Z] == 1 && ((i == X && j == Z) || (i == Y && j == Z)) { continue // element not needed in 2D } kernel[i][j], errLoad = LoadKernel(fmt.Sprint(basename, i, j, ".ovf")) if errLoad != nil { break } } if errLoad != nil { break } } // make result symmetric for tools that expect it so. kernel[Y][X] = kernel[X][Y] kernel[Z][X] = kernel[X][Z] kernel[Z][Y] = kernel[Y][Z] if errLoad != nil { util.Log("//Did not use cached kernel:", errLoad) } else { util.Log("//Using cached kernel:", basename) return kernel } // Could not load kernel: calculate it and save var errSave error kernel = CalcDemagKernel(inputSize, pbc, cellsize, accuracy) for i := 0; i < 3; i++ { for j := i; j < 3; j++ { if inputSize[Z] == 1 && ((i == X && j == Z) || (i == Y && j == Z)) { continue // element not needed in 2D } compName := fmt.Sprint("N_", i, j) info := data.Meta{Time: float64(0.0), Name: compName, Unit: "1", CellSize: cellsize, MeshUnit: "m"} errSave = SaveKernel(fmt.Sprint(basename, i, j, ".ovf"), kernel[i][j], info) if errSave != nil { break } } if errSave != nil { break } } if errSave != nil { util.Log("//Failed to cache kernel:", errSave) } else { util.Log("//Cached kernel:", basename) } return kernel } func LoadKernel(fname string) (kernel *data.Slice, err error) { kernel, _, err = oommf.ReadFile(fname) return } func SaveKernel(fname string, kernel *data.Slice, info data.Meta) error { f, err := os.OpenFile(fname, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0666) if err != nil { return err } out := bufio.NewWriter(f) defer out.Flush() oommf.WriteOVF2(out, kernel, info, "binary 4") return nil } // Calculates the magnetostatic kernel 
// by brute-force integration
// of magnetic charges over the faces and averages over cell volumes.
//
// The returned kernel holds the upper-triangular components [i][j], j >= i,
// on the padded mesh (see padSize); the lower triangle aliases the same slices.
// For a single layer (padded size 1 along Z) the XZ and YZ components are nil.
// accuracy controls the number of integration points: larger is finer.
func CalcDemagKernel(inputSize, pbc [3]int, cellsize [3]float64, accuracy float64) (kernel [3][3]*data.Slice) {

	// Add zero-padding in non-PBC directions
	size := padSize(inputSize, pbc)

	// Sanity check
	{
		util.Assert(size[Z] > 0 && size[Y] > 0 && size[X] > 0)
		util.Assert(cellsize[X] > 0 && cellsize[Y] > 0 && cellsize[Z] > 0)
		util.Assert(pbc[X] >= 0 && pbc[Y] >= 0 && pbc[Z] >= 0)
		util.Assert(accuracy > 0)
	}

	// Allocate only upper diagonal part. The rest is symmetric due to reciprocity.
	var array [3][3][][][]float32
	for i := 0; i < 3; i++ {
		for j := i; j < 3; j++ {
			kernel[i][j] = data.NewSlice(1, size)
			array[i][j] = kernel[i][j].Scalars()
		}
	}

	// Field (destination) loop ranges
	r1, r2 := kernelRanges(size, pbc)

	// smallest cell dimension is our typical length scale
	L := cellsize[X]
	{
		if cellsize[Y] < L {
			L = cellsize[Y]
		}
		if cellsize[Z] < L {
			L = cellsize[Z]
		}
	}

	progress, progmax := 0, (1+(r2[Y]-r1[Y]))*(1+(r2[Z]-r1[Z])) // progress bar
	done := make(chan struct{}, 3)                              // parallel calculation of one component done?

	// Start brute integration
	// 9 nested loops, does that stress you out?
	// Fortunately, the 5 inner ones usually loop over just one element.

	for s := 0; s < 3; s++ { // source index Ksdxyz (parallelized over)
		// One goroutine per source component. Goroutine s only writes array[s][...],
		// and progress is only touched by the s == 0 goroutine.
		go func(s int) {
			u, v, w := s, (s+1)%3, (s+2)%3 // u = direction of source (s), v & w are the orthogonal directions
			var (
				R, R2  [3]float64 // field and source cell center positions
				pole   [3]float64 // position of point charge on the surface
				points int        // counts used integration points
			)

			for z := r1[Z]; z <= r2[Z]; z++ {
				zw := wrap(z, size[Z])
				// skip one half, reconstruct from symmetry later
				// check on wrapped index instead of loop range so it also works for PBC
				if zw > size[Z]/2 {
					if s == 0 {
						// account for the whole skipped row of y values in the progress bar
						progress += (1 + (r2[Y] - r1[Y]))
					}
					continue
				}
				R[Z] = float64(z) * cellsize[Z]

				for y := r1[Y]; y <= r2[Y]; y++ {

					if s == 0 { // show progress of only one component
						progress++
						util.Progress(progress, progmax, "Calculating demag kernel")
					}

					yw := wrap(y, size[Y])
					if yw > size[Y]/2 {
						continue
					}
					R[Y] = float64(y) * cellsize[Y]

					for x := r1[X]; x <= r2[X]; x++ {
						xw := wrap(x, size[X])
						if xw > size[X]/2 {
							continue
						}
						R[X] = float64(x) * cellsize[X]

						// choose number of integration points depending on how far we are from source.
						dx, dy, dz := delta(x)*cellsize[X], delta(y)*cellsize[Y], delta(z)*cellsize[Z]
						d := math.Sqrt(dx*dx + dy*dy + dz*dz)
						if d == 0 {
							// touching or identical cells: fall back to the smallest cell dimension
							d = L
						}
						maxSize := d / accuracy // maximum acceptable integration size

						// number of sub-divisions per direction, rounded to nearest, at least 1
						nv := int(math.Max(cellsize[v]/maxSize, 1) + 0.5)
						nw := int(math.Max(cellsize[w]/maxSize, 1) + 0.5)
						nx := int(math.Max(cellsize[X]/maxSize, 1) + 0.5)
						ny := int(math.Max(cellsize[Y]/maxSize, 1) + 0.5)
						nz := int(math.Max(cellsize[Z]/maxSize, 1) + 0.5)
						// Stagger source and destination grids.
						// Massively improves accuracy, see note.
						nv *= 2
						nw *= 2

						util.Assert(nv > 0 && nw > 0 && nx > 0 && ny > 0 && nz > 0)

						scale := 1 / float64(nv*nw*nx*ny*nz)
						surface := cellsize[v] * cellsize[w] // the two directions perpendicular to direction s
						charge := surface * scale            // charge carried by a single integration point
						pu1 := cellsize[u] / 2.              // positive pole center
						pu2 := -pu1                          // negative pole center

						// Do surface integral over source cell, accumulate in B
						var B [3]float64
						for i := 0; i < nv; i++ {
							// center of the i-th sub-interval along v
							pv := -(cellsize[v] / 2.) + cellsize[v]/float64(2*nv) + float64(i)*(cellsize[v]/float64(nv))
							pole[v] = pv
							for j := 0; j < nw; j++ {
								// center of the j-th sub-interval along w
								pw := -(cellsize[w] / 2.) + cellsize[w]/float64(2*nw) + float64(j)*(cellsize[w]/float64(nw))
								pole[w] = pw

								// Do volume integral over destination cell
								for α := 0; α < nx; α++ {
									rx := R[X] - cellsize[X]/2 + cellsize[X]/float64(2*nx) + (cellsize[X]/float64(nx))*float64(α)

									for β := 0; β < ny; β++ {
										ry := R[Y] - cellsize[Y]/2 + cellsize[Y]/float64(2*ny) + (cellsize[Y]/float64(ny))*float64(β)

										for γ := 0; γ < nz; γ++ {
											rz := R[Z] - cellsize[Z]/2 + cellsize[Z]/float64(2*nz) + (cellsize[Z]/float64(nz))*float64(γ)
											points++

											// contribution of the positive pole
											pole[u] = pu1
											R2[X], R2[Y], R2[Z] = rx-pole[X], ry-pole[Y], rz-pole[Z]
											r := math.Sqrt(R2[X]*R2[X] + R2[Y]*R2[Y] + R2[Z]*R2[Z])
											qr := charge / (4 * math.Pi * r * r * r)
											bx := R2[X] * qr
											by := R2[Y] * qr
											bz := R2[Z] * qr

											// contribution of the negative pole (opposite charge)
											pole[u] = pu2
											R2[X], R2[Y], R2[Z] = rx-pole[X], ry-pole[Y], rz-pole[Z]
											r = math.Sqrt(R2[X]*R2[X] + R2[Y]*R2[Y] + R2[Z]*R2[Z])
											qr = -charge / (4 * math.Pi * r * r * r)
											B[X] += (bx + R2[X]*qr) // addition ordered for accuracy
											B[Y] += (by + R2[Y]*qr)
											B[Z] += (bz + R2[Z]*qr)

										}
									}
								}
							}
						}
						for d := s; d < 3; d++ { // destination index Ksdxyz
							array[s][d][zw][yw][xw] += float32(B[d]) // += needed in case of PBC
						}
					}
				}
			}
			done <- struct{}{} // notify parallel computation of this component is done
		}(s)
	}
	// wait for all 3 components to finish
	<-done
	<-done
	<-done

	// Reconstruct skipped parts from symmetry (X)
	// Components with exactly one X index change sign under x -> -x.
	for z := 0; z < size[Z]; z++ {
		for y := 0; y < size[Y]; y++ {
			for x := size[X]/2 + 1; x < size[X]; x++ {
				x2 := size[X] - x
				array[X][X][z][y][x] = array[X][X][z][y][x2]
				array[X][Y][z][y][x] = -array[X][Y][z][y][x2]
				array[X][Z][z][y][x] = -array[X][Z][z][y][x2]
				array[Y][Y][z][y][x] = array[Y][Y][z][y][x2]
				array[Y][Z][z][y][x] = array[Y][Z][z][y][x2]
				array[Z][Z][z][y][x] = array[Z][Z][z][y][x2]
			}
		}
	}

	// Reconstruct skipped parts from symmetry (Y)
	// Components with exactly one Y index change sign under y -> -y.
	for z := 0; z < size[Z]; z++ {
		for y := size[Y]/2 + 1; y < size[Y]; y++ {
			y2 := size[Y] - y
			for x := 0; x < size[X]; x++ {
				array[X][X][z][y][x] = array[X][X][z][y2][x]
				array[X][Y][z][y][x] = -array[X][Y][z][y2][x]
				array[X][Z][z][y][x] = array[X][Z][z][y2][x]
				array[Y][Y][z][y][x] = array[Y][Y][z][y2][x]
				array[Y][Z][z][y][x] = -array[Y][Z][z][y2][x]
				array[Z][Z][z][y][x] = array[Z][Z][z][y2][x]
			}
		}
	}

	// Reconstruct skipped parts from symmetry (Z)
	// Components with exactly one Z index change sign under z -> -z.
	for z := size[Z]/2 + 1; z < size[Z]; z++ {
		z2 := size[Z] - z
		for y := 0; y < size[Y]; y++ {
			for x := 0; x < size[X]; x++ {
				array[X][X][z][y][x] = array[X][X][z2][y][x]
				array[X][Y][z][y][x] = array[X][Y][z2][y][x]
				array[X][Z][z][y][x] = -array[X][Z][z2][y][x]
				array[Y][Y][z][y][x] = array[Y][Y][z2][y][x]
				array[Y][Z][z][y][x] = -array[Y][Z][z2][y][x]
				array[Z][Z][z][y][x] = array[Z][Z][z2][y][x]
			}
		}
	}

	// for 2D these elements are zero:
	if size[Z] == 1 {
		kernel[X][Z] = nil
		kernel[Y][Z] = nil
	}

	// make result symmetric for tools that expect it so.
	kernel[Y][X] = kernel[X][Y]
	kernel[Z][X] = kernel[X][Z]
	kernel[Z][Y] = kernel[Y][Z]
	return kernel
}

// integration ranges for kernel. size=kernelsize, so padded for no PBC, not padded for PBC
// Returns the inclusive index ranges [r1[c], r2[c]] to loop over in each direction c.
func kernelRanges(size, pbc [3]int) (r1, r2 [3]int) {
	for c := 0; c < 3; c++ {
		if pbc[c] == 0 {
			r1[c], r2[c] = -(size[c]-1)/2, (size[c]-1)/2
		} else {
			r1[c], r2[c] = -(size[c]*pbc[c] - 1), (size[c]*pbc[c] - 1) // no /2 here, or we would take half right and half left image
		}
	}
	// support for 2D simulations (thickness 1)
	if size[Z] == 1 && pbc[Z] == 0 {
		r2[Z] = 0
	}
	return
}

// Indices of the vector components / mesh directions.
const (
	X = 0
	Y = 1
	Z = 2
)

// closest distance between cells, given center distance d.
// if cells touch by just even a corner, the distance is zero.
// Both d and the result are expressed in units of cells.
func delta(d int) float64 {
	if d < 0 {
		d = -d
	}
	if d > 0 {
		d -= 1
	}
	return float64(d)
}

// Wraps an index to [0, max) by adding/subtracting a multiple of max.
func wrap(number, max int) int { for number < 0 { number += max } for number >= max { number -= max } return number } const maxAspect = 100.0 // maximum sane cell aspect ratio func sanityCheck(cellsize [3]float64, pbc [3]int) { a3 := cellsize[X] / cellsize[Y] a2 := cellsize[Y] / cellsize[Z] a1 := cellsize[Z] / cellsize[X] aMax := math.Max(a1, math.Max(a2, a3)) aMin := math.Min(a1, math.Min(a2, a3)) if aMax > maxAspect || aMin < 1./maxAspect { util.Fatal("Unrealistic cell aspect ratio:", cellsize) } } // Returns the size after zero-padding, taking into account periodic boundary conditions. // In a certain direction, there is no padding in case of PBC (it should wrap around). // Without PBC there should be zero padding up to at least 2*N - 1. In that case there // is a trade-off: for large N, padding up to 2*N can be much more efficient since // power-of-two sized FFT's are ludicrously fast on CUDA. However for very small N, // in particular N=1, we should not over-pad. func padSize(size, periodic [3]int) [3]int { var padded [3]int for i := range size { if periodic[i] != 0 { padded[i] = size[i] continue } if i != Z || size[i] > SMALL_N { // for some reason it only works for Z, perhaps we assume even FFT size elsewhere? // large N: zero pad * 2 for FFT performance padded[i] = size[i] * 2 } else { // small N: minimal zero padding for memory/performance padded[i] = size[i]*2 - 1 } } return padded } // Use 2N-1 padding instead of 2N for sizes up to SMALL_N. // 5 seems a good choice since for all n<=5, 2*n-1 only has // prime factors 2,3,5,7 (good CUFFT performance). // starting from 6 it becomes problematic so we use 2*n. const SMALL_N = 5 // "If brute force doesn't solve your problem, // you're not using enough of it." 
/* Note: error for cubic self-kernel for different stagger decisions:
1 ++--+----+-++---+----+--++---+----+-++---+----+--++---+----+-++--++
+ + + + + + + | | + A +
0.1 ++ A A ++ + A A A A + + C A A A +
0.01 ++ B D E C ++
e + B D E C +
r | F B D BE C |
r + F D BE DC B+
o 0.001 ++ ++
r + F + + F +
0.0001 ++ F +F + F + | F | + + + + + + +
1e-05 ++--+----+-++---+----+--++---+----+-++---+----+--++---+----+-++--++
100 1000 10000 100000 1e+06 1e+07
evaluation points
A: no staggering
B: nv = ((nv + 1) / 2) * 2 nw = ((nw + 1) / 2) * 2 nx = ((nx+1)/2)*2 - 1 ny = ((ny+1)/2)*2 - 1 nz = ((nz+1)/2)*2 - 1
C: nv = ((nv + 1) / 2) * 2 nw = ((nw + 1) / 2) * 2 nx = ((nx+1)/2)*2 + 1 ny = ((ny+1)/2)*2 + 1 nz = ((nz+1)/2)*2 + 1
D: nv += 1 nw += 1
E: nx += 1 ny += 1 nz += 1
F: best with accuracy 6 nv *= 2 nw *= 2
*/
mumax3-3.10/mag/mfmkernel.go000066400000000000000000000107131371432437400157340ustar00rootroot00000000000000package mag

import (
	"bufio"
	"fmt"
	d "github.com/mumax/3/data"
	"github.com/mumax/3/oommf"
	"github.com/mumax/3/util"
	"math"
	"os"
)

// MFMKernel obtains the MFM kernel either from cacheDir/ or by calculating it
// (and then storing it in cacheDir for next time).
// Empty cacheDir disables caching.
func MFMKernel(mesh *d.Mesh, lift, tipsize float64, cacheDir string) (kernel [3]*d.Slice) {

	// Cache disabled
	if cacheDir == "" {
		util.Log(`//Not using kernel cache (-cache="")`)
		return CalcMFMKernel(mesh, lift, tipsize)
	}

	// Error-resilient kernel cache: if anything goes wrong, return calculated kernel.
	defer func() {
		if err := recover(); err != nil {
			util.Log("//Unable to use kernel cache:", err)
			kernel = CalcMFMKernel(mesh, lift, tipsize)
		}
	}()

	// Try to load kernel. The file name encodes every parameter the kernel depends on.
	basename := fmt.Sprint(cacheDir, "/", "mumax3MFMkernel_", mesh.Size(), "_", mesh.PBC(), "_", mesh.CellSize(), "_", lift, "_", tipsize, "_")
	var errLoad error
	for i := 0; i < 3; i++ {
		kernel[i], errLoad = LoadKernel(fmt.Sprint(basename, i, ".ovf"))
		if errLoad != nil {
			break
		}
	}
	if errLoad != nil {
		util.Log("//Did not use cached kernel:", errLoad)
	} else {
		util.Log("//Using cached kernel:", basename)
		return kernel
	}

	// Could not load kernel: calculate it and save
	var errSave error
	kernel = CalcMFMKernel(mesh, lift, tipsize)
	for i := 0; i < 3; i++ {
		compName := fmt.Sprint("Nmfm_", i)
		info := d.Meta{Time: float64(0.0), Name: compName, Unit: "1", CellSize: mesh.CellSize(), MeshUnit: "m"}
		errSave = SaveKernel(fmt.Sprint(basename, i, ".ovf"), kernel[i], info)
		if errSave != nil {
			break
		}
	}
	if errSave != nil {
		util.Log("//Failed to cache kernel:", errSave)
	} else {
		util.Log("//Cached kernel:", basename)
	}

	return kernel
}

// LoadMFMKernel reads one kernel component from the OVF file fname.
func LoadMFMKernel(fname string) (kernel *d.Slice, err error) {
	kernel, _, err = oommf.ReadFile(fname)
	return
}

// SaveMFMKernel writes one kernel component to fname (created or truncated)
// in OVF2 "binary 4" format with empty metadata.
// Flush and close failures are reported instead of being silently dropped.
func SaveMFMKernel(fname string, kernel *d.Slice) (err error) {
	f, err := os.OpenFile(fname, os.O_WRONLY|os.O_TRUNC|os.O_CREATE, 0666)
	if err != nil {
		return err
	}
	// Always release the file handle; keep the first error encountered.
	defer func() {
		if errClose := f.Close(); err == nil {
			err = errClose
		}
	}()
	out := bufio.NewWriter(f)
	oommf.WriteOVF2(out, kernel, d.Meta{}, "binary 4")
	return out.Flush()
}

// Kernel for the vertical derivative of the force on an MFM tip due to mx, my, mz.
// This is the 2nd derivative of the energy w.r.t. z.
func CalcMFMKernel(mesh *d.Mesh, lift, tipsize float64) (kernel [3]*d.Slice) {
	// The tip is modelled as two opposite point charges, one at height lift and
	// one at lift + tipsize. kernel[s] is the response to a unit source
	// magnetization along component s. lift must be > 0.

	const TipCharge = 1 / Mu0 // tip charge
	const Δ = 1e-9            // tip oscillation, take 2nd derivative over this distance
	util.AssertMsg(lift > 0, "MFM tip crashed into sample, please lift the new one higher")

	{
		// Kernel mesh is the zero-padded input mesh (see padSize);
		// periodic directions are not padded.
		pbc := mesh.PBC()
		sz := padSize(mesh.Size(), pbc)
		cs := mesh.CellSize()
		mesh = d.NewMesh(sz[X], sz[Y], sz[Z], cs[X], cs[Y], cs[Z], pbc[:]...)
	}

	// Shorthand
	size := mesh.Size()
	pbc := mesh.PBC()
	cellsize := mesh.CellSize()
	volume := cellsize[X] * cellsize[Y] * cellsize[Z]
	fmt.Println("calculating MFM kernel")

	// Sanity check
	{
		util.Assert(size[Z] >= 1 && size[Y] >= 2 && size[X] >= 2)
		util.Assert(cellsize[X] > 0 && cellsize[Y] > 0 && cellsize[Z] > 0)
		util.AssertMsg(size[X]%2 == 0 && size[Y]%2 == 0, "Even kernel size needed")
		if size[Z] > 1 {
			util.AssertMsg(size[Z]%2 == 0, "Even kernel size needed")
		}
	}

	// Allocate one scalar kernel per source magnetization component.
	var K [3][][][]float32
	for i := 0; i < 3; i++ {
		kernel[i] = d.NewSlice(1, mesh.Size())
		K[i] = kernel[i].Scalars()
	}

	r1, r2 := kernelRanges(size, pbc)
	progress, progmax := 0, (1+r2[Y]-r1[Y])*(1+r2[Z]-r1[Z])

	for iz := r1[Z]; iz <= r2[Z]; iz++ {
		zw := wrap(iz, size[Z])
		z := float64(iz) * cellsize[Z]

		for iy := r1[Y]; iy <= r2[Y]; iy++ {
			yw := wrap(iy, size[Y])
			y := float64(iy) * cellsize[Y]
			progress++
			util.Progress(progress, progmax, "Calculating MFM kernel")

			for ix := r1[X]; ix <= r2[X]; ix++ {
				x := float64(ix) * cellsize[X]
				xw := wrap(ix, size[X])

				for s := 0; s < 3; s++ { // source index Ksxyz
					// unit magnetization along component s
					m := d.Vector{0, 0, 0}
					m[s] = 1

					var E [3]float64 // 3 energies for 2nd derivative

					// Evaluate at tip heights lift-Δ, lift, lift+Δ.
					for i := -1; i <= 1; i++ {
						I := float64(i)
						// field term of the first tip charge
						R := d.Vector{-x, -y, z - (lift + (I * Δ))}
						r := R.Len()
						B := R.Mul(TipCharge / (4 * math.Pi * r * r * r))

						// opposite charge, tipsize higher
						R = d.Vector{-x, -y, z - (lift + tipsize + (I * Δ))}
						r = R.Len()
						B = B.Add(R.Mul(-TipCharge / (4 * math.Pi * r * r * r)))

						E[i+1] = B.Dot(m) * volume // i=-1 stored in E[0]
					}

					// central finite difference: (E[0] - 2*E[1] + E[2]) / Δ²
					dFdz_tip := ((E[0] - E[1]) + (E[2] - E[1])) / (Δ * Δ) // dFz/dz = d2E/dz2

					K[s][zw][yw][xw] += float32(dFdz_tip) // += needed in case of PBC
				}
			}
		}
	}

	return kernel
}
mumax3-3.10/oommf/000077500000000000000000000000001371432437400137745ustar00rootroot00000000000000mumax3-3.10/oommf/Makefile000066400000000000000000000000241371432437400154300ustar00rootroot00000000000000all: go install -v
mumax3-3.10/oommf/oommf.go000066400000000000000000000150531371432437400154440ustar00rootroot00000000000000// package oommf provides the OVF data format as used by OOMMF.
package oommf

import (
	"bufio"
	"fmt"
	"github.com/mumax/3/data"
	"github.com/mumax/3/util"
	"io"
	"os"
	"strconv"
	"strings"
)

// Read reads any OOMMF file from in, autodetecting the OVF1/OVF2 format
// from the header.
//
// It returns the decoded data together with the metadata found in the
// header (title, total time, value unit and cell size). When the header
// carries no step sizes at all, the returned cell size defaults to
// {1, 1, 1}.
//
// The returned error is currently always nil: a malformed header or an
// unknown data format causes a panic instead.
func Read(in io.Reader) (s *data.Slice, meta data.Meta, err error) {
	//in := fullReader{bufio.NewReader(in_)}
	info := readHeader(in)

	cellSize := info.StepSize
	if cellSize == [3]float64{0, 0, 0} {
		cellSize = [3]float64{1, 1, 1} // default (presumably unitless) cell size
	}

	data_ := data.NewSlice(info.NComp, info.Size)

	// Dispatch on the (format, OVF version) pair announced by the header.
	format := strings.ToLower(info.Format)
	ovf := info.OVF
	switch {
	default:
		panic(fmt.Sprint("unknown format: OVF", ovf, " ", format))
	case format == "text":
		readOVFDataText(in, data_)
	case format == "binary 4" && ovf == 1:
		readOVF1DataBinary4(in, data_)
	case format == "binary 8" && ovf == 1:
		readOVF1DataBinary8(in, data_)
	case format == "binary 4" && ovf == 2:
		readOVF2DataBinary4(in, data_)
	case format == "binary 8" && ovf == 2:
		readOVF2DataBinary8(in, data_)
	}

	// Use the defaulted cell size here, not the raw header value, so that
	// files without step sizes do not yield a zero-sized cell.
	return data_, data.Meta{Name: info.Title, Time: info.TotalTime, Unit: info.ValueUnit, CellSize: cellSize}, nil
}

// ReadFile opens the file fname and decodes it with Read through a
// buffered reader. The file is closed before returning.
func ReadFile(fname string) (*data.Slice, data.Meta, error) {
	f, err := os.Open(fname)
	if err != nil {
		return nil, data.Meta{}, err
	}
	defer f.Close()
	return Read(bufio.NewReader(f))
}

// MustReadFile is like ReadFile but hands any error to util.FatalErr
// instead of returning it.
func MustReadFile(fname string) (*data.Slice, data.Meta) {
	s, t, err := ReadFile(fname)
	util.FatalErr(err)
	return s, t
}

// omf.Info represents the header part of an omf file.
// TODO: add Err to return error status // Perhaps CheckErr() func type Info struct { Desc map[string]interface{} Title string NComp int Size [3]int ValueMultiplier float32 ValueUnit string Format string // binary or text OVF int TotalTime float64 StageTime float64 SizeofFloat int // 4/8 StepSize [3]float64 MeshUnit string } // Parses the header part of the OVF1/OVF2 file func readHeader(in io.Reader) *Info { desc := make(map[string]interface{}) info := new(Info) info.Desc = desc line, eof := readLine(in) switch strings.ToLower(line) { default: panic("unknown header: " + line) case "# oommf ovf 2.0": info.OVF = 2 case "# oommf: rectangular mesh v1.0": info.OVF = 1 info.NComp = 3 // OVF1 only supports vector } line, eof = readLine(in) for !eof && !isHeaderEnd(line) { key, value := parseHeaderLine(line) switch strings.ToLower(key) { default: panic("Unknown key: " + key) // ignored case "oommf", "segment count", "begin", "meshtype", "xbase", "ybase", "zbase", "xmin", "ymin", "zmin", "xmax", "ymax", "zmax", "valuerangeminmag", "valuerangemaxmag", "end": // ignored (OVF1) case "", "valuelabels": // ignored (OVF2) case "title": info.Title = value case "valueunits": info.ValueUnit = strings.Split(value, " ")[0] // take unit of first component, we don't support per-component units case "valuedim": info.NComp = atoi(value) case "xnodes": info.Size[X] = atoi(value) case "ynodes": info.Size[Y] = atoi(value) case "znodes": info.Size[Z] = atoi(value) case "xstepsize": info.StepSize[X] = atof(value) case "ystepsize": info.StepSize[Y] = atof(value) case "zstepsize": info.StepSize[Z] = atof(value) case "valuemultiplier": case "valueunit": case "meshunit": // desc tags: parse further and add to metadata table case "desc": strs := strings.SplitN(value, ":", 2) desc_key := strings.Trim(strs[0], "# ") // Desc tag does not neccesarily have a key:value layout. // If not, we use an empty value string. 
desc_value := "" if len(strs) > 1 { desc_value = strings.Trim(strs[1], "# ") } desc[desc_key] = desc_value } line, eof = readLine(in) } // the remaining line should now be the begin:data clause key, value := parseHeaderLine(line) value = strings.TrimSpace(value) strs := strings.SplitN(value, " ", 3) if strings.ToLower(key) != "begin" || strings.ToLower(strs[0]) != "data" { panic("Expected: Begin: Data") } info.Format = strings.ToLower(strs[1]) if len(strs) >= 3 { // dataformat for text is empty info.Format = "binary " + strs[2] // binary + 4 or 8 } else { info.Format = "text" } // OVF1-style time info if t1, ok := info.Desc["Time (s)"]; ok { timestr := fmt.Sprint(t1) t, _ := strconv.ParseFloat(timestr, 64) info.TotalTime = t } // OVF2-style time info if t2, ok := info.Desc["Total simulation time"]; ok { timestr := fmt.Sprint(t2) words := strings.Split(timestr, " ") t, _ := strconv.ParseFloat(words[0], 64) info.TotalTime = t } return info } // INTERNAL: Splits "# key: value" into "key", "value". 
// Both may be empty func parseHeaderLine(str string) (key, value string) { strs := strings.SplitN(str, ":", 2) key = strings.Trim(strs[0], "# ") if len(strs) != 2 { return key, "" } value = strings.Trim(strs[1], "# ") return key, value } // INTERNAL: true if line starts with "# begin:data" func isHeaderEnd(str string) bool { str = strings.ToLower(strings.Trim(str, "# ")) str = strings.Replace(str, " ", "", -1) return strings.HasPrefix(str, "begin:data") } const OVF_CONTROL_NUMBER_4 = 1234567.0 // The omf format requires the first encoded number in the binary data section to be this control number const OVF_CONTROL_NUMBER_8 = 123456789012345.0 // read data block in text format, for OVF1 and OVF2 func readOVFDataText(in io.Reader, t *data.Slice) { size := t.Size() data := t.Tensors() for iz := 0; iz < size[Z]; iz++ { for iy := 0; iy < size[Y]; iy++ { for ix := 0; ix < size[X]; ix++ { for c := 0; c < t.NComp(); c++ { _, err := fmt.Fscan(in, &data[c][iz][iy][ix]) if err != nil { panic(err) } } } } } } // write data block in text format, for OVF1 and OVF2 func writeOVFText(out io.Writer, tens *data.Slice) (err error) { data := tens.Tensors() gridsize := tens.Size() ncomp := tens.NComp() // Here we loop over X,Y,Z, not Z,Y,X, because // internal in C-order == external in Fortran-order for iz := 0; iz < gridsize[Z]; iz++ { for iy := 0; iy < gridsize[Y]; iy++ { for ix := 0; ix < gridsize[X]; ix++ { for c := 0; c < ncomp; c++ { _, err = fmt.Fprint(out, data[c][iz][iy][ix], " ") } _, err = fmt.Fprint(out, "\n") } } } return } // Writes a header key/value pair to out: // # Key: Value func hdr(out io.Writer, key string, value ...interface{}) { _, err := fmt.Fprint(out, "# ", key, ": ") util.FatalErr(err) _, err = fmt.Fprintln(out, value...) 
util.FatalErr(err) } func dsc(out io.Writer, k, v interface{}) { hdr(out, "Desc", k, ": ", v) } mumax3-3.10/oommf/ovf1.go000066400000000000000000000106331371432437400152010ustar00rootroot00000000000000package oommf import ( "encoding/binary" "fmt" "github.com/mumax/3/data" "io" "log" "strings" "unsafe" ) func WriteOVF1(out io.Writer, q *data.Slice, meta data.Meta, dataformat string) { if q.NComp() != 3 { log.Fatal("Cannot save the quantity: the OVF1 format only supports 3D-vector fields.") } writeOVF1Header(out, q, meta) writeOVF1Data(out, q, dataformat) hdr(out, "End", "Segment") } func writeOVF1Data(out io.Writer, q *data.Slice, dataformat string) { canonicalFormat := "" switch strings.ToLower(dataformat) { case "text": canonicalFormat = "Text" hdr(out, "Begin", "Data "+canonicalFormat) writeOVFText(out, q) case "binary", "binary 4": canonicalFormat = "Binary 4" hdr(out, "Begin", "Data "+canonicalFormat) writeOVF1Binary4(out, q) default: log.Fatalf("Illegal OVF data format: %v. Options are: Text, Binary 4", dataformat) } hdr(out, "End", "Data "+canonicalFormat) } // Writes the OMF header func writeOVF1Header(out io.Writer, q *data.Slice, meta data.Meta) { gridsize := q.Size() cellsize := meta.CellSize hdr(out, "OOMMF", "rectangular mesh v1.0") hdr(out, "Segment count", "1") hdr(out, "Begin", "Segment") hdr(out, "Begin", "Header") dsc(out, "Time (s)", meta.Time) hdr(out, "Title", meta.Name) hdr(out, "meshtype", "rectangular") hdr(out, "meshunit", "m") hdr(out, "xbase", cellsize[X]/2) hdr(out, "ybase", cellsize[Y]/2) hdr(out, "zbase", cellsize[Z]/2) hdr(out, "xstepsize", cellsize[X]) hdr(out, "ystepsize", cellsize[Y]) hdr(out, "zstepsize", cellsize[Z]) hdr(out, "xmin", 0) hdr(out, "ymin", 0) hdr(out, "zmin", 0) hdr(out, "xmax", cellsize[X]*float64(gridsize[X])) hdr(out, "ymax", cellsize[Y]*float64(gridsize[Y])) hdr(out, "zmax", cellsize[Z]*float64(gridsize[Z])) hdr(out, "xnodes", gridsize[X]) hdr(out, "ynodes", gridsize[Y]) hdr(out, "znodes", gridsize[Z]) hdr(out, 
"ValueRangeMinMag", 1e-08) // not so "optional" as the OOMMF manual suggests... hdr(out, "ValueRangeMaxMag", 1) // TODO hdr(out, "valueunit", meta.Unit) hdr(out, "valuemultiplier", 1) hdr(out, "End", "Header") } // Writes data in OMF Binary 4 format func writeOVF1Binary4(out io.Writer, array *data.Slice) (err error) { data := array.Tensors() gridsize := array.Size() var bytes []byte // OOMMF requires this number to be first to check the format var controlnumber float32 = OVF_CONTROL_NUMBER_4 // Conversion form float32 [4]byte in big-endian // Inlined for performance, terabytes of data will pass here... bytes = (*[4]byte)(unsafe.Pointer(&controlnumber))[:] bytes[0], bytes[1], bytes[2], bytes[3] = bytes[3], bytes[2], bytes[1], bytes[0] // swap endianess _, err = out.Write(bytes) ncomp := array.NComp() for iz := 0; iz < gridsize[Z]; iz++ { for iy := 0; iy < gridsize[Y]; iy++ { for ix := 0; ix < gridsize[X]; ix++ { for c := 0; c < ncomp; c++ { // dirty conversion from float32 to [4]byte bytes = (*[4]byte)(unsafe.Pointer(&data[c][iz][iy][ix]))[:] bytes[0], bytes[1], bytes[2], bytes[3] = bytes[3], bytes[2], bytes[1], bytes[0] out.Write(bytes) } } } } return } func readOVF1DataBinary4(in io.Reader, t *data.Slice) { size := t.Size() data := t.Tensors() // OOMMF requires this number to be first to check the format var controlnumber float32 // OVF 1.0 is network byte order (MSB) binary.Read(in, binary.BigEndian, &controlnumber) if controlnumber != OVF_CONTROL_NUMBER_4 { panic("invalid OVF1 control number: " + fmt.Sprint(controlnumber)) } var tmp float32 for iz := 0; iz < size[Z]; iz++ { for iy := 0; iy < size[Y]; iy++ { for ix := 0; ix < size[X]; ix++ { for c := 0; c < 3; c++ { err := binary.Read(in, binary.BigEndian, &tmp) if err != nil { panic(err) } data[c][iz][iy][ix] = tmp } } } } } func readOVF1DataBinary8(in io.Reader, t *data.Slice) { size := t.Size() data := t.Tensors() // OOMMF requires this number to be first to check the format var controlnumber float64 // OVF 
1.0 is network byte order (MSB) binary.Read(in, binary.BigEndian, &controlnumber) if controlnumber != OVF_CONTROL_NUMBER_8 { panic("invalid OVF1 control number: " + fmt.Sprint(controlnumber)) } var tmp float64 for iz := 0; iz < size[Z]; iz++ { for iy := 0; iy < size[Y]; iy++ { for ix := 0; ix < size[X]; ix++ { for c := 0; c < 3; c++ { err := binary.Read(in, binary.BigEndian, &tmp) if err != nil { panic(err) } data[c][iz][iy][ix] = float32(tmp) } } } } } mumax3-3.10/oommf/ovf2.go000066400000000000000000000111111371432437400151720ustar00rootroot00000000000000package oommf import ( "fmt" "github.com/mumax/3/data" "io" "log" "strings" "unsafe" ) func WriteOVF2(out io.Writer, q *data.Slice, meta data.Meta, dataformat string) { writeOVF2Header(out, q, meta) writeOVF2Data(out, q, dataformat) hdr(out, "End", "Segment") } func writeOVF2Header(out io.Writer, q *data.Slice, meta data.Meta) { gridsize := q.Size() cellsize := meta.CellSize fmt.Fprintln(out, "# OOMMF OVF 2.0") hdr(out, "Segment count", "1") hdr(out, "Begin", "Segment") hdr(out, "Begin", "Header") hdr(out, "Title", meta.Name) hdr(out, "meshtype", "rectangular") hdr(out, "meshunit", "m") hdr(out, "xmin", 0) hdr(out, "ymin", 0) hdr(out, "zmin", 0) hdr(out, "xmax", cellsize[X]*float64(gridsize[X])) hdr(out, "ymax", cellsize[Y]*float64(gridsize[Y])) hdr(out, "zmax", cellsize[Z]*float64(gridsize[Z])) name := meta.Name var labels []interface{} if q.NComp() == 1 { labels = []interface{}{name} } else { for i := 0; i < q.NComp(); i++ { labels = append(labels, name+"_"+string('x'+i)) } } hdr(out, "valuedim", q.NComp()) hdr(out, "valuelabels", labels...) 
// TODO unit := meta.Unit if unit == "" { unit = "1" } if q.NComp() == 1 { hdr(out, "valueunits", unit) } else { hdr(out, "valueunits", unit, unit, unit) } // We don't really have stages //fmt.Fprintln(out, "# Desc: Stage simulation time: ", meta.TimeStep, " s") // TODO hdr(out, "Desc", "Total simulation time: ", meta.Time, " s") hdr(out, "xbase", cellsize[X]/2) hdr(out, "ybase", cellsize[Y]/2) hdr(out, "zbase", cellsize[Z]/2) hdr(out, "xnodes", gridsize[X]) hdr(out, "ynodes", gridsize[Y]) hdr(out, "znodes", gridsize[Z]) hdr(out, "xstepsize", cellsize[X]) hdr(out, "ystepsize", cellsize[Y]) hdr(out, "zstepsize", cellsize[Z]) hdr(out, "End", "Header") } func writeOVF2Data(out io.Writer, q *data.Slice, dataformat string) { canonicalFormat := "" switch strings.ToLower(dataformat) { case "text": canonicalFormat = "Text" hdr(out, "Begin", "Data "+canonicalFormat) writeOVFText(out, q) case "binary", "binary 4": canonicalFormat = "Binary 4" hdr(out, "Begin", "Data "+canonicalFormat) writeOVF2DataBinary4(out, q) default: log.Fatalf("Illegal OMF data format: %v. 
Options are: Text, Binary 4", dataformat) } hdr(out, "End", "Data "+canonicalFormat) } func writeOVF2DataBinary4(out io.Writer, array *data.Slice) { //w.count(w.out.Write((*(*[1<<31 - 1]byte)(unsafe.Pointer(&list[0])))[0 : 4*len(list)])) // (shortcut) data := array.Tensors() size := array.Size() var bytes []byte // OOMMF requires this number to be first to check the format var controlnumber float32 = OVF_CONTROL_NUMBER_4 bytes = (*[4]byte)(unsafe.Pointer(&controlnumber))[:] out.Write(bytes) ncomp := array.NComp() for iz := 0; iz < size[Z]; iz++ { for iy := 0; iy < size[Y]; iy++ { for ix := 0; ix < size[X]; ix++ { for c := 0; c < ncomp; c++ { bytes = (*[4]byte)(unsafe.Pointer(&data[c][iz][iy][ix]))[:] out.Write(bytes) } } } } } func readOVF2DataBinary4(in io.Reader, array *data.Slice) { size := array.Size() data := array.Tensors() // OOMMF requires this number to be first to check the format controlnumber := readFloat32(in) if controlnumber != OVF_CONTROL_NUMBER_4 { panic("invalid OVF2 control number: " + fmt.Sprint(controlnumber)) } ncomp := array.NComp() for iz := 0; iz < size[Z]; iz++ { for iy := 0; iy < size[Y]; iy++ { for ix := 0; ix < size[X]; ix++ { for c := 0; c < ncomp; c++ { data[c][iz][iy][ix] = readFloat32(in) } } } } } // fully read buf, panic on error func readFull(in io.Reader, buf []byte) { _, err := io.ReadFull(in, buf) if err != nil { panic(err) } return } // read float32 in machine endianess, panic on error func readFloat32(in io.Reader) float32 { var bytes4 [4]byte bytes := bytes4[:] readFull(in, bytes) return *((*float32)(unsafe.Pointer(&bytes4))) } // read float64 in machine endianess, panic on error func readFloat64(in io.Reader) float64 { var bytes8 [8]byte bytes := bytes8[:] readFull(in, bytes) return *((*float64)(unsafe.Pointer(&bytes8))) } func readOVF2DataBinary8(in io.Reader, array *data.Slice) { size := array.Size() data := array.Tensors() // OOMMF requires this number to be first to check the format controlnumber := readFloat64(in) if 
controlnumber != OVF_CONTROL_NUMBER_8 { panic("invalid OVF2 control number: " + fmt.Sprint(controlnumber)) } ncomp := array.NComp() for iz := 0; iz < size[Z]; iz++ { for iy := 0; iy < size[Y]; iy++ { for ix := 0; ix < size[X]; ix++ { for c := 0; c < ncomp; c++ { data[c][iz][iy][ix] = float32(readFloat64(in)) } } } } } mumax3-3.10/oommf/util.go000066400000000000000000000021551371432437400153030ustar00rootroot00000000000000package oommf import ( "io" "strconv" ) func readLine(in io.Reader) (line string, eof bool) { char := readChar(in) eof = isEOF(char) for !isEndline(char) { line += string(byte(char)) char = readChar(in) } return line, eof } func isEOF(char int) bool { return char == -1 } func isEndline(char int) bool { return isEOF(char) || char == int('\n') } //// Blocks until all requested bytes are read. //type fullReader struct{ io.Reader } // //func (r fullReader) Read(p []byte) (n int, err error) { // return io.ReadFull(r.Reader, p) //} // Reads one character from the Reader. // -1 means EOF. // Errors are cought and cause panic func readChar(in io.Reader) int { buffer := [1]byte{} switch nr, err := in.Read(buffer[0:]); true { case nr < 0: // error panic(err) case nr == 0: // eof return -1 case nr > 0: // ok return int(buffer[0]) } panic("unreachable") } func atoi(a string) int { i, err := strconv.Atoi(a) if err != nil { panic(err) } return i } func atof(a string) float64 { i, err := strconv.ParseFloat(a, 64) if err != nil { panic(err) } return i } const ( X = 0 Y = 1 Z = 2 ) mumax3-3.10/post-commit000077500000000000000000000010251371432437400150560ustar00rootroot00000000000000#!/bin/sh # # A hook script to verify what is about to be committed. # Called by git-commit with no arguments. The hook should # exit with non-zero status after issuing an appropriate message if # it wants to stop the commit. 
# # Add this file to .git/hooks # Run all unit tests echo Running unit tests in background rm test.log -rf (if (make test >> test.log 2>> test.log); then notify-send "Unit tests passed" 2> /dev/null exit 0; else notify-send "Unit tests failed" 2> /dev/null cat test.log; rm test.log; exit 2; fi;)& mumax3-3.10/pre-commit000077500000000000000000000007611371432437400146650ustar00rootroot00000000000000#!/bin/sh # # A hook script to verify what is about to be committed. # Called by git-commit with no arguments. The hook should # exit with non-zero status after issuing an appropriate message if # it wants to stop the commit. # # Add this file to .git/hooks # # Runs gofmt on the code and stops commit if files were affected. # fail=0; if (gofmt -w -l */*.go */*/*.go | grep \.go); then exit 1; fi; make || exit 1 #if astyle --indent=tab cuda/*.cu | grep Formatted; then exit 1; fi; exit 0 mumax3-3.10/script/000077500000000000000000000000001371432437400141635ustar00rootroot00000000000000mumax3-3.10/script/Makefile000066400000000000000000000000241371432437400156170ustar00rootroot00000000000000all: go install -v mumax3-3.10/script/assignstmt.go000066400000000000000000000037711371432437400167160ustar00rootroot00000000000000package script import ( "go/ast" "go/token" "reflect" ) // compiles a (single) assign statement lhs = rhs func (w *World) compileAssignStmt(a *ast.AssignStmt) Expr { if len(a.Lhs) != 1 || len(a.Rhs) != 1 { panic(err(a.Pos(), "multiple assignment not allowed")) } lhs, rhs := a.Lhs[0], a.Rhs[0] r := w.compileExpr(rhs) switch a.Tok { default: panic(err(a.Pos(), a.Tok, "not allowed")) case token.ASSIGN: // = return w.compileAssign(a, lhs, r) case token.DEFINE: // := return w.compileDefine(a, lhs, r) case token.ADD_ASSIGN: // += return w.compileAddAssign(a, lhs, r) case token.SUB_ASSIGN: // -= return w.compileSubAssign(a, lhs, r) } } // compile a = b func (w *World) compileAssign(a *ast.AssignStmt, lhs ast.Expr, r Expr) Expr { l := w.compileLvalue(lhs) return 
&assignStmt{lhs: l, rhs: typeConv(a.Pos(), r, inputType(l))} } // compile a := b func (w *World) compileDefine(a *ast.AssignStmt, lhs ast.Expr, r Expr) Expr { ident, ok := lhs.(*ast.Ident) if !ok { panic(err(a.Pos(), "non-name on left side of :=")) } addr := reflect.New(r.Type()) ok = w.safeDeclare(ident.Name, &reflectLvalue{addr.Elem()}) if !ok { panic(err(a.Pos(), "already defined: "+ident.Name)) } return w.compileAssign(a, lhs, r) } type assignStmt struct { lhs LValue rhs Expr void } func (a *assignStmt) Eval() interface{} { a.lhs.SetValue(a.rhs.Eval()) return nil } func (a *assignStmt) Child() []Expr { return []Expr{a.lhs, a.rhs} } func (w *World) compileAddAssign(a *ast.AssignStmt, lhs ast.Expr, r Expr) Expr { l := w.compileLvalue(lhs) x := typeConv(a.Pos(), l, float64_t) y := typeConv(a.Pos(), r, float64_t) sum := &add{binaryExpr{x, y}} return &assignStmt{lhs: l, rhs: typeConv(a.Pos(), sum, inputType(l))} } func (w *World) compileSubAssign(a *ast.AssignStmt, lhs ast.Expr, r Expr) Expr { l := w.compileLvalue(lhs) x := typeConv(a.Pos(), l, float64_t) y := typeConv(a.Pos(), r, float64_t) sub := &sub{binaryExpr{x, y}} return &assignStmt{lhs: l, rhs: typeConv(a.Pos(), sub, inputType(l))} } mumax3-3.10/script/binaryexpr.go000066400000000000000000000077431371432437400167100ustar00rootroot00000000000000package script import ( "go/ast" "go/token" "reflect" ) // compiles a binary expression x 'op' y func (w *World) compileBinaryExpr(n *ast.BinaryExpr) Expr { switch n.Op { default: panic(err(n.Pos(), "not allowed:", n.Op)) case token.ADD: return &add{w.newBinExpr(n)} case token.SUB: return &sub{w.newBinExpr(n)} case token.MUL: return &mul{w.newBinExpr(n)} case token.QUO: return &quo{w.newBinExpr(n)} case token.LSS: return &lss{w.newComp(n)} case token.GTR: return >r{w.newComp(n)} case token.LEQ: return &leq{w.newComp(n)} case token.GEQ: return &geq{w.newComp(n)} case token.EQL: return &eql{w.newComp(n)} case token.NEQ: return &neq{w.newComp(n)} case token.LAND: return 
&and{w.newBoolOp(n)} case token.LOR: return &or{w.newBoolOp(n)} } } // abstract superclass for all binary expressions type binaryExpr struct{ x, y Expr } func (w *World) newBinExpr(n *ast.BinaryExpr) binaryExpr { x := typeConv(n.Pos(), w.compileExpr(n.X), float64_t) y := typeConv(n.Pos(), w.compileExpr(n.Y), float64_t) return binaryExpr{x, y} } func (b *binaryExpr) Type() reflect.Type { return float64_t } func (b *binaryExpr) Child() []Expr { return []Expr{b.x, b.y} } type add struct{ binaryExpr } type sub struct{ binaryExpr } type mul struct{ binaryExpr } type quo struct{ binaryExpr } func (b *add) Eval() interface{} { return b.x.Eval().(float64) + b.y.Eval().(float64) } func (b *sub) Eval() interface{} { return b.x.Eval().(float64) - b.y.Eval().(float64) } func (b *mul) Eval() interface{} { return b.x.Eval().(float64) * b.y.Eval().(float64) } func (b *quo) Eval() interface{} { return b.x.Eval().(float64) / b.y.Eval().(float64) } func (b *add) Fix() Expr { return &add{binaryExpr{x: b.x.Fix(), y: b.y.Fix()}} } func (b *sub) Fix() Expr { return &sub{binaryExpr{x: b.x.Fix(), y: b.y.Fix()}} } func (b *mul) Fix() Expr { return &mul{binaryExpr{x: b.x.Fix(), y: b.y.Fix()}} } func (b *quo) Fix() Expr { return &quo{binaryExpr{x: b.x.Fix(), y: b.y.Fix()}} } type comp binaryExpr func (w *World) newComp(n *ast.BinaryExpr) comp { return comp(w.newBinExpr(n)) } func (b *comp) Type() reflect.Type { return bool_t } func (b *comp) Child() []Expr { return []Expr{b.x, b.y} } type lss struct{ comp } type gtr struct{ comp } type leq struct{ comp } type geq struct{ comp } type eql struct{ comp } type neq struct{ comp } func (b *lss) Eval() interface{} { return b.x.Eval().(float64) < b.y.Eval().(float64) } func (b *gtr) Eval() interface{} { return b.x.Eval().(float64) > b.y.Eval().(float64) } func (b *leq) Eval() interface{} { return b.x.Eval().(float64) <= b.y.Eval().(float64) } func (b *geq) Eval() interface{} { return b.x.Eval().(float64) >= b.y.Eval().(float64) } func (b *eql) 
Eval() interface{} { return b.x.Eval().(float64) == b.y.Eval().(float64) } func (b *neq) Eval() interface{} { return b.x.Eval().(float64) != b.y.Eval().(float64) } func (b *lss) Fix() Expr { return &lss{comp{x: b.x.Fix(), y: b.y.Fix()}} } func (b *gtr) Fix() Expr { return >r{comp{x: b.x.Fix(), y: b.y.Fix()}} } func (b *leq) Fix() Expr { return &leq{comp{x: b.x.Fix(), y: b.y.Fix()}} } func (b *geq) Fix() Expr { return &geq{comp{x: b.x.Fix(), y: b.y.Fix()}} } func (b *eql) Fix() Expr { return &eql{comp{x: b.x.Fix(), y: b.y.Fix()}} } func (b *neq) Fix() Expr { return &neq{comp{x: b.x.Fix(), y: b.y.Fix()}} } type boolOp struct{ x, y Expr } func (w *World) newBoolOp(n *ast.BinaryExpr) boolOp { x := typeConv(n.Pos(), w.compileExpr(n.X), bool_t) y := typeConv(n.Pos(), w.compileExpr(n.Y), bool_t) return boolOp{x, y} } func (b *boolOp) Child() []Expr { return []Expr{b.x, b.y} } func (b *boolOp) Type() reflect.Type { return bool_t } type and struct{ boolOp } type or struct{ boolOp } func (b *and) Eval() interface{} { return b.x.Eval().(bool) && b.y.Eval().(bool) } func (b *or) Eval() interface{} { return b.x.Eval().(bool) || b.y.Eval().(bool) } func (b *and) Fix() Expr { return &and{boolOp{x: b.x.Fix(), y: b.y.Fix()}} } func (b *or) Fix() Expr { return &or{boolOp{x: b.x.Fix(), y: b.y.Fix()}} } mumax3-3.10/script/blockstmt.go000066400000000000000000000024301371432437400165130ustar00rootroot00000000000000package script import ( "bytes" "fmt" "go/ast" "go/format" "go/token" "reflect" "strings" ) // block statement is a list of statements. type BlockStmt struct { Children []Expr Node []ast.Node } // does not enter scope because it does not necessarily needs to (e.g. for, if). 
func (w *World) compileBlockStmt_noScope(n *ast.BlockStmt) *BlockStmt { b := &BlockStmt{} for _, s := range n.List { b.append(w.compileStmt(s), s) } return b } func (b *BlockStmt) append(s Expr, n ast.Node) { b.Children = append(b.Children, s) b.Node = append(b.Node, n) } func (b *BlockStmt) Eval() interface{} { for _, s := range b.Children { s.Eval() } return nil } func (b *BlockStmt) Type() reflect.Type { return nil } func (b *BlockStmt) Child() []Expr { return b.Children } func Format(n ast.Node) string { var buf bytes.Buffer fset := token.NewFileSet() format.Node(&buf, fset, n) str := buf.String() if strings.HasSuffix(str, "\n") { str = str[:len(str)-1] } return str } func (b *BlockStmt) Format() string { var buf bytes.Buffer fset := token.NewFileSet() for i := range b.Children { format.Node(&buf, fset, b.Node[i]) fmt.Fprintln(&buf) } return buf.String() } func (b *BlockStmt) Fix() Expr { return &BlockStmt{Children: fixExprs(b.Children), Node: b.Node} } mumax3-3.10/script/call.go000066400000000000000000000040161371432437400154260ustar00rootroot00000000000000package script import ( "go/ast" "reflect" ) type call struct { f Expr args []Expr } func (w *World) compileCallExpr(n *ast.CallExpr) Expr { // compile function or method to be called var f Expr var fname string switch Fun := n.Fun.(type) { default: panic(err(n.Pos(), "not allowed:", typ(n.Fun))) case *ast.Ident: // function call fname = Fun.Name if fname == "source" { return w.compileSource(n) } f = w.compileExpr(Fun) case *ast.SelectorExpr: // method call f = w.compileSelectorStmt(Fun) fname = Fun.Sel.Name } if f.Type().Kind() != reflect.Func { panic(err(n.Pos(), "can not call", Format(n))) } // compile and check args args := make([]Expr, len(n.Args)) variadic := f.Type().IsVariadic() if !variadic && len(n.Args) != f.Type().NumIn() { panic(err(n.Pos(), fname, "needs", f.Type().NumIn(), "arguments, got", len(n.Args))) // TODO: varargs } for i := range args { if variadic { args[i] = w.compileExpr(n.Args[i]) 
// no type check or conversion } else { args[i] = typeConv(n.Args[i].Pos(), w.compileExpr(n.Args[i]), f.Type().In(i)) } } return &call{f, args} } func (c *call) Eval() interface{} { // evaluate and pack arguments argv := make([]reflect.Value, len(c.args)) for i := range c.args { argv[i] = reflect.ValueOf(c.args[i].Eval()) } // evaluate function f := reflect.ValueOf(c.f.Eval()) // call ret := f.Call(argv) // at most 1 return value allowed assert(len(ret) <= 1) if len(ret) == 0 { return nil } else { return ret[0].Interface() } } func (c *call) Child() []Expr { return append([]Expr{c.f}, c.args...) } // return type of call func (c *call) Type() reflect.Type { switch c.f.Type().NumOut() { case 0: return nil // "void" case 1: return c.f.Type().Out(0) default: panic("bug: multiple return values not allowed") } } func (c *call) Fix() Expr { return &call{f: c.f, args: fixExprs(c.args)} } // apply .Fix() to all elements func fixExprs(e []Expr) []Expr { f := make([]Expr, len(e)) for i := range f { f[i] = e[i].Fix() } return f } mumax3-3.10/script/child.go000066400000000000000000000003471371432437400156010ustar00rootroot00000000000000package script func Contains(tree, search Expr) bool { if tree == search { return true } else { children := tree.Child() for _, e := range children { if Contains(e, search) { return true } } } return false } mumax3-3.10/script/compile.go000066400000000000000000000036641371432437400161530ustar00rootroot00000000000000package script import ( "fmt" "go/ast" "go/parser" ) // Compiles an expression, which can then be evaluated. 
E.g.: // expr, err := world.CompileExpr("1+1") // expr.Eval() // returns 2 func (w *World) CompileExpr(src string) (code Expr, e error) { // parse tree, err := parser.ParseExpr(src) if err != nil { return nil, fmt.Errorf(`parse "%s": %v`, src, err) } if Debug { ast.Print(nil, tree) } // catch compile errors if !Debug { defer func() { err := recover() if err == nil { return } if er, ok := err.(*compileErr); ok { code = nil e = fmt.Errorf(`parse "%s": %v`, src, er) } else { panic(err) } }() } return w.compile(tree), nil } // CompileExpr with panic on error. func (w *World) MustCompileExpr(src string) Expr { code, err := w.CompileExpr(src) if err != nil { panic(err) } return code } // compiles source consisting of a number of statements. E.g.: // src = "a = 1; b = sin(x)" // code, err := world.Compile(src) // code.Eval() func (w *World) Compile(src string) (code *BlockStmt, e error) { // parse exprSrc := "func(){\n" + src + "\n}" // wrap in func to turn into expression tree, err := parser.ParseExpr(exprSrc) if err != nil { return nil, fmt.Errorf("script line %v: ", err) } // catch compile errors and decode line number if !Debug { defer func() { err := recover() if err == nil { return } if compErr, ok := err.(*compileErr); ok { code = nil e = fmt.Errorf("script %v: %v", pos2line(compErr.pos, exprSrc), compErr.msg) } else { panic(err) } }() } // compile stmts := tree.(*ast.FuncLit).Body.List // strip func again if Debug { ast.Print(nil, stmts) } block := new(BlockStmt) for _, s := range stmts { block.append(w.compile(s), s) } return block, nil } // Like Compile but panics on error func (w *World) MustCompile(src string) Expr { code, err := w.Compile(src) if err != nil { panic(err) } return code } mumax3-3.10/script/const.go000066400000000000000000000006011371432437400156350ustar00rootroot00000000000000package script import "reflect" type Const struct { value interface{} typ reflect.Type } func NewConst(e Expr) *Const { return &Const{value: e.Eval(), typ: e.Type()} } 
func (c *Const) Eval() interface{} { return c.value } func (c *Const) Type() reflect.Type { return c.typ } func (c *Const) Child() []Expr { return nil } func (c *Const) Fix() Expr { return c } mumax3-3.10/script/error.go000066400000000000000000000024601371432437400156450ustar00rootroot00000000000000package script import ( "fmt" "go/token" "reflect" "strings" ) var Debug = false // print debug info? // compileErr, and only compileErr will be caught by Compile and returned as an error. type compileErr struct { pos token.Pos msg string } // implements error func (c *compileErr) Error() string { return c.msg } // constructs a compileErr func err(pos token.Pos, msg ...interface{}) *compileErr { str := fmt.Sprintln(msg...) // use Sprinln to insert spaces str = str[:len(str)-1] // strip final \n return &compileErr{pos, str} } // type string for value i func typ(i interface{}) string { typ := reflect.TypeOf(reflect.ValueOf(i).Interface()).String() if strings.HasPrefix(typ, "*ast.") { typ = typ[len("*ast."):] } return typ } func assert(test bool) { if !test { panic("assertion failed") } } // decodes a token position in source to a line number // and returns the line number + line code. func pos2line(pos token.Pos, src string) string { if pos == 0 { return "" } lines := strings.Split(src, "\n") line := 0 for i, b := range src { if token.Pos(i) == pos { return fmt.Sprint("line ", line, ": ", strings.Trim(lines[line], " \t")) // func{ prefix makes lines count from 1 } if b == '\n' { line++ } } return fmt.Sprint("position", pos) // we should not reach this } mumax3-3.10/script/exec.go000066400000000000000000000013601371432437400154360ustar00rootroot00000000000000package script // Exec compiles and executes the source statements. func (w *World) Exec(src string) error { code, err := w.Compile(src) if err != nil { return err } code.Eval() return nil } // Exec with panic on error. 
func (w *World) MustExec(src string) {
	code := w.MustCompile(src)
	code.Eval()
}

// Eval with panic on error.
func (w *World) MustEval(src string) interface{} {
	Expr := w.MustCompileExpr(src)
	return Expr.Eval()
}

// Eval compiles and evaluates src, which must be an expression, and returns the result(s). E.g.:
// 	world.Eval("1+1") // returns 2, nil
func (w *World) Eval(src string) (ret interface{}, err error) {
	Expr, err := w.CompileExpr(src)
	if err != nil {
		return nil, err
	}
	return Expr.Eval(), nil
}

mumax3-3.10/script/expr.go000066400000000000000000000015231371432437400154710ustar00rootroot00000000000000
package script

import (
	"go/ast"
	"reflect"
)

// an expression can be evaluated
type Expr interface {
	Eval() interface{}  // evaluate and return result (nil for void)
	Type() reflect.Type // return type, nil for void
	Child() []Expr      // direct sub-expressions, nil if there are none
	Fix() Expr          // replace all variables by their current value, except for the time "t".
}

// compiles an expression.
// Dispatches on the concrete AST node type; any node type not listed
// panics with a compileErr ("not allowed").
func (w *World) compileExpr(e ast.Expr) Expr {
	switch e := e.(type) {
	default:
		panic(err(e.Pos(), "not allowed:", typ(e)))
	case *ast.Ident:
		return w.resolve(e.Pos(), e.Name)
	case *ast.BasicLit:
		return w.compileBasicLit(e)
	case *ast.BinaryExpr:
		return w.compileBinaryExpr(e)
	case *ast.UnaryExpr:
		return w.compileUnaryExpr(e)
	case *ast.CallExpr:
		return w.compileCallExpr(e)
	case *ast.ParenExpr:
		return w.compileExpr(e.X)
	case *ast.IndexExpr:
		return w.compileIndexExpr(e)
	}
}

mumax3-3.10/script/for.go000066400000000000000000000017631371432437400153070ustar00rootroot00000000000000
package script

import (
	"go/ast"
)

// for statement
type forStmt struct {
	init, cond, post, body Expr
	void
}

// Eval runs the loop: init once, then body and post for as long as cond
// evaluates to true. Always returns nil.
func (b *forStmt) Eval() interface{} {
	for b.init.Eval(); b.cond.Eval().(bool); b.post.Eval() {
		b.body.Eval()
	}
	return nil // void
}

// compileForStmt compiles a for statement inside a fresh scope.
// Missing init/post/body clauses stay no-ops; a missing condition
// becomes the literal true.
func (w *World) compileForStmt(n *ast.ForStmt) *forStmt {
	w.EnterScope()
	defer w.ExitScope()

	stmt := &forStmt{init: &nop{}, cond: &nop{}, post: &nop{}, body: &nop{}}
	if n.Init != nil {
		stmt.init = w.compileStmt(n.Init)
	}
	if n.Cond != nil {
		// the condition must be convertible to bool
		stmt.cond = typeConv(n.Cond.Pos(), w.compileExpr(n.Cond), bool_t)
	} else {
		stmt.cond = boolLit(true)
	}
	if n.Post != nil {
		stmt.post = w.compileStmt(n.Post)
	}
	if n.Body != nil {
		// the loop scope was already entered above
		stmt.body = w.compileBlockStmt_noScope(n.Body)
	}
	return stmt
}

// nop is a void expression that does nothing when evaluated.
type nop struct{ void }

func (e *nop) Child() []Expr     { return nil }
func (e *nop) Eval() interface{} { return nil }
func (e *nop) Fix() Expr         { return e }

func (e *forStmt) Child() []Expr {
	return []Expr{e.init, e.cond, e.post, e.body}
}

mumax3-3.10/script/funcif.go000066400000000000000000000017341371432437400157710ustar00rootroot00000000000000
package script

// Here be dragons

import (
	"github.com/mumax/3/data"
	"reflect"
)

// ScalarFunction is an expression that can also be read as a float64.
type ScalarFunction interface {
	Expr
	Float() float64
}

// converts float64 to ScalarFunction
type scalFn struct{ in Expr }

// Eval returns the wrapper itself; the wrapped expression is only
// evaluated when Float is called.
func (c *scalFn) Eval() interface{}  { return c }
func (c *scalFn) Type() reflect.Type { return ScalarFunction_t }
func (c *scalFn) Float() float64     { return c.in.Eval().(float64) }
func (c *scalFn) Child() []Expr      { return []Expr{c.in} }
func (c *scalFn) Fix() Expr          { return &scalFn{in: c.in.Fix()} }

// VectorFunction is an expression that can also be read as a data.Vector.
type VectorFunction interface {
	Expr
	Float3() data.Vector
}

// converts data.Vector to VectorFunction
type vecFn struct{ in Expr }

// Eval returns the wrapper itself; the wrapped expression is only
// evaluated when Float3 is called.
func (c *vecFn) Eval() interface{}   { return c }
func (c *vecFn) Type() reflect.Type  { return VectorFunction_t }
func (c *vecFn) Float3() data.Vector { return c.in.Eval().(data.Vector) }
func (c *vecFn) Child() []Expr       { return []Expr{c.in} }
func (c *vecFn) Fix() Expr           { return &vecFn{in: c.in.Fix()} }

mumax3-3.10/script/function.go000066400000000000000000000014761371432437400163470ustar00rootroot00000000000000
package script

import (
	"fmt"
	"reflect"
)

// function wraps a native Go function value.
type function struct {
	reflect.Value
}

// newFunction wraps fn. Panics if fn is not a function or if it has
// more than one return value.
func newFunction(fn interface{}) *function {
	val := reflect.ValueOf(fn)
	if val.Type().Kind() != reflect.Func {
		panic(fmt.Errorf("not a function: %v", val.Type()))
	}
	if val.Type().NumOut() > 1 {
		panic(fmt.Errorf("multiple return values not allowed: %v", val.Type()))
	}
	return &function{val}
}

// type of the function itself (when not called)
func (f *function) Type() reflect.Type    { return f.Value.Type() }
func (f *function) NumIn() int            { return f.Type().NumIn() }
func (f *function) In(i int) reflect.Type { return f.Type().In(i) }
func (f *function) Eval() interface{}     { return f.Value.Interface() }
func (f *function) Child() []Expr         { return nil }
func (f *function) Fix() Expr             { return f }

mumax3-3.10/script/if.go000066400000000000000000000012741371432437400151140ustar00rootroot00000000000000
package script

import (
	"go/ast"
)

// if statement
type ifStmt struct {
	cond, body, else_ Expr
	void
}

// Eval evaluates body when cond is true, otherwise the else branch if
// there is one. Always returns nil.
func (b *ifStmt) Eval() interface{} {
	if b.cond.Eval().(bool) {
		b.body.Eval()
	} else {
		if b.else_ != nil {
			b.else_.Eval()
		}
	}
	return nil // void
}

// compileIfStmt compiles an if statement inside a fresh scope; the
// condition must be convertible to bool.
func (w *World) compileIfStmt(n *ast.IfStmt) *ifStmt {
	w.EnterScope()
	defer w.ExitScope()

	stmt := &ifStmt{
		cond: typeConv(n.Cond.Pos(), w.compileExpr(n.Cond), bool_t),
		body: w.compileBlockStmt_noScope(n.Body)}
	if n.Else != nil {
		stmt.else_ = w.compileStmt(n.Else)
	}

	return stmt
}

// Child returns cond and body, plus the else branch when present.
func (e *ifStmt) Child() []Expr {
	child := []Expr{e.cond, e.body, e.else_}
	if e.else_ == nil {
		child = child[:2]
	}
	return child
}

mumax3-3.10/script/incdecstmt.go000066400000000000000000000016711371432437400166540ustar00rootroot00000000000000
package script

import (
	"go/ast"
	"go/token"
	"reflect"
)

// compileIncDecStmt compiles x++ / x-- into an assignment of x±1 to x.
// The arithmetic is done in float64 and converted back to x's own type.
func (w *World) compileIncDecStmt(n *ast.IncDecStmt) Expr {
	l := w.compileLvalue(n.X)
	switch n.Tok {
	case token.INC:
		rhs_plus1 := &addone{incdec{typeConv(n.Pos(), l, float64_t)}}
		return &assignStmt{lhs: l, rhs: typeConv(n.Pos(), rhs_plus1, l.Type())}
	case token.DEC:
		rhs_minus1 := &subone{incdec{typeConv(n.Pos(), l, float64_t)}}
		return &assignStmt{lhs: l, rhs: typeConv(n.Pos(), rhs_minus1, l.Type())}
	default:
		panic(err(n.Pos(), "not allowed:", n.Tok))
	}
}

// incdec is the shared base of addone and subone.
type incdec struct{ x Expr }

func (e *incdec) Type() reflect.Type { return float64_t }
func (e *incdec) Child() []Expr      { return []Expr{e.x} }

// Fix always panics with invalid_closure.
func (e *incdec) Fix() Expr { panic(invalid_closure) }

type addone struct{ incdec }
type subone struct{ incdec }

func (s *addone) Eval() interface{} {
	return s.x.Eval().(float64) + 1
}

func (s *subone) Eval() interface{} {
	return s.x.Eval().(float64) - 1
}

mumax3-3.10/script/index.go000066400000000000000000000013351371432437400156230ustar00rootroot00000000000000
package script

import (
	"go/ast"
	"reflect"
)

// compileIndexExpr compiles x[i]. x must be an array or slice
// (compile error otherwise) and i must be convertible to int.
func (w *World) compileIndexExpr(n *ast.IndexExpr) Expr {
	x := w.compileExpr(n.X)
	kind := x.Type().Kind()
	if !(kind == reflect.Array || kind == reflect.Slice) {
		panic(err(n.Pos(), "can not index", x.Type()))
	}
	i := typeConv(n.Index.Pos(), w.compileExpr(n.Index), int_t)
	return &index{x, i}
}

// index is a compiled index expression x[index].
type index struct {
	x, index Expr
}

func (e *index) Type() reflect.Type {
	return e.x.Type().Elem()
}

// Eval indexes the evaluated x through reflection.
func (e *index) Eval() interface{} {
	x := reflect.ValueOf(e.x.Eval())
	i := e.index.Eval().(int)
	return x.Index(i).Interface()
}

func (e *index) Child() []Expr {
	return []Expr{e.x, e.index}
}

func (e *index) Fix() Expr {
	return &index{x: e.x.Fix(), index: e.index.Fix()}
}

mumax3-3.10/script/lit.go000066400000000000000000000033321371432437400153030ustar00rootroot00000000000000
package script

import (
	"go/ast"
	"go/token"
	"reflect"
	"strconv"
)

// compiles a basic literal, like numbers and strings
func (w *World) compileBasicLit(n *ast.BasicLit) Expr {
	switch n.Kind {
	default:
		panic(err(n.Pos(), "not allowed:", n.Value, "(", typ(n), ")"))
	case token.FLOAT:
		return floatLit(parseFloat(n.Value))
	case token.INT:
		return intLit(parseInt(n.Value))
	case token.STRING:
		return stringLit(n.Value[1 : len(n.Value)-1]) // remove quotes
	}
}

// floatLit is a float64 literal.
type floatLit float64

func (l floatLit) Eval() interface{}  { return float64(l) }
func (l floatLit) Type() reflect.Type { return float64_t }
func (l floatLit) Child() []Expr      { return nil }
func (l floatLit) Fix() Expr          { return l }

// intLit is an int literal.
type intLit int

func (l intLit) Eval() interface{}  { return int(l) }
func (l intLit) Type() reflect.Type { return int_t }
func (l intLit) Child() []Expr      { return nil }
func (l intLit) Fix() Expr          { return l }

// stringLit is a string literal (stored without its quotes).
type stringLit string

func (l stringLit) Eval() interface{}  { return string(l) }
func (l stringLit) Type() reflect.Type { return string_t }
func (l stringLit) Child() []Expr      { return nil }
func (l stringLit) Fix() Expr          { return l }

// boolLit is a bool literal.
type boolLit bool

func (l boolLit) Eval() interface{}  { return bool(l) }
func (l boolLit) Type() reflect.Type { return bool_t }
func (l boolLit) Child() []Expr      { return nil }
func (l boolLit) Fix() Expr          { return l }

// parseFloat converts the text of a FLOAT token; panics if strconv rejects it.
func parseFloat(str string) float64 {
	v, err := strconv.ParseFloat(str, 64)
	if err != nil {
		panic("internal error") // we were sure it was a number...
	}
	return v
}

// parseInt converts the text of an INT token; panics if strconv.Atoi rejects it.
func parseInt(str string) int {
	v, err := strconv.Atoi(str)
	if err != nil {
		panic("internal error") // we were sure it was a number...
	}
	return v
}

mumax3-3.10/script/lvalue.go000066400000000000000000000025271371432437400160100ustar00rootroot00000000000000
package script

import (
	"go/ast"
	"reflect"
)

// left-hand value in (single) assign statement
type LValue interface {
	Expr
	SetValue(interface{}) // assigns a new value
}

// compileLvalue compiles the left-hand side of an assignment.
// Only identifiers that resolve to an LValue are accepted; anything
// else is a compile error.
func (w *World) compileLvalue(lhs ast.Node) LValue {
	switch lhs := lhs.(type) {
	default:
		panic(err(lhs.Pos(), "cannot assign to", typ(lhs)))
	case *ast.Ident:
		if l, ok := w.resolve(lhs.Pos(), lhs.Name).(LValue); ok {
			return l
		} else {
			panic(err(lhs.Pos(), "cannot assign to", lhs.Name))
		}
	}
}

type reflectLvalue struct {
	elem reflect.Value
}

// general lvalue implementation using reflect.
// lhs must be settable, e.g.
address of something: // var x float64 // newReflectLValue(&x) func newReflectLvalue(addr interface{}) LValue { elem := reflect.ValueOf(addr).Elem() if elem.Kind() == 0 { panic("variable/constant needs to be passed as pointer to addressable value") } return &reflectLvalue{elem} } func (l *reflectLvalue) Eval() interface{} { return l.elem.Interface() } func (l *reflectLvalue) Type() reflect.Type { return l.elem.Type() } func (l *reflectLvalue) SetValue(rvalue interface{}) { l.elem.Set(reflect.ValueOf(rvalue)) } func (l *reflectLvalue) Child() []Expr { return nil } func (l *reflectLvalue) Fix() Expr { return NewConst(l) } type TVar struct { LValue } func (t *TVar) Fix() Expr { return t // only variable that's not fixed } mumax3-3.10/script/ronly.go000066400000000000000000000011671371432437400156620ustar00rootroot00000000000000package script import "reflect" // read-only value (from script, but mutable from outside) type reflectROnly struct { elem reflect.Value } func newReflectROnly(addr interface{}) *reflectROnly { elem := reflect.ValueOf(addr) if elem.Kind() == 0 { panic("variable/constant needs to be passed as pointer to addressable value") } return &reflectROnly{elem} } func (l *reflectROnly) Eval() interface{} { return l.elem.Interface() } func (l *reflectROnly) Type() reflect.Type { return l.elem.Type() } func (l *reflectROnly) Child() []Expr { return nil } func (l *reflectROnly) Fix() Expr { return NewConst(l) } mumax3-3.10/script/script_test.go000066400000000000000000000067201371432437400170620ustar00rootroot00000000000000package script import ( "log" "math" "reflect" "testing" ) func init() { log.SetFlags(0) } func TestEval(t *testing.T) { w := NewWorld() // Test Variables x := 1.0 w.Var("x", &x) if w.MustEval("x") != 1.0 { t.Fail() } x = 2.0 if w.MustEval("x") != 2.0 { t.Fail() } w.MustExec("x=3") if w.MustEval("x") != 3.0 { t.Fail() } w.MustExec("y:=8") if w.MustEval("y") != 8 { t.Error("got", w.MustEval("y")) } // Test Ops if w.MustEval("1+2*3/4-5-6") != 
1.+2.*3./4.-5.-6 { t.Fail() } // Test func if w.MustEval("sqrt(3*3)").(float64) != 3 { t.Fail() } } func TestContains(t *testing.T) { w := NewWorld() var x float64 w.Var("x", &x) X := w.Resolve("x") if X == nil { t.Fail() } if !Contains(w.MustCompile("x+1"), X) { t.Fail() } if Contains(w.MustCompile("1+1"), X) { t.Fail() } } func TestTypes(t *testing.T) { w := NewWorld() x := 3.14 w.Var("x", &x) w.MustExec("x=7") w.Func("printInt", func(x int) { log.Println(x) }) w.MustExec("printInt(7)") } func TestLoop(t *testing.T) { w := NewWorld() sum := 0.0 w.Var("sum", &sum) src := ` for i:=0; i<100; i++{ sum = sum + i } ` w.MustExec(src) if sum != 4950 { t.Error("got", sum) } src = ` for i:=100; i>=0; i--{ sum = sum + i } ` w.MustExec(src) if sum != 10000 { t.Error("got", sum) } } type test struct { a, b, c int } func (t *test) A() int { return 41 } func (t *test) B() int { return 42 } func (t *test) C() int { return 43 } func TestMethod(t *testing.T) { w := NewWorld() var s *test w.Var("s", &s) if w.MustEval("s.B()") != 42 { t.Fail() } } func TestScope(t *testing.T) { w := NewWorld() w.MustEval("sin(0)") w.EnterScope() w.MustEval("sin(0)") w.ExitScope() w.MustEval("sin(0)") } func BenchmarkEval1(b *testing.B) { b.StopTimer() w := NewWorld() code := w.MustCompileExpr("1+(2-3)*(4+5)/6") b.StartTimer() for i := 0; i < b.N; i++ { code.Eval() } } func BenchmarkEval1_native(bench *testing.B) { var a, b, c, d, e, f float64 for i := 0; i < bench.N; i++ { a += (b - c) * (d + e) / f } if a == 1 { panic("make sure result is used") } } func BenchmarkEval2(b *testing.B) { b.StopTimer() w := NewWorld() code := w.MustCompileExpr("sin(cos(tan(log(sqrt(exp(1))))))") b.StartTimer() for i := 0; i < b.N; i++ { code.Eval() } } func BenchmarkEval2_native(bench *testing.B) { var a float64 b := 1. 
for i := 0; i < bench.N; i++ { a += math.Sin(math.Cos(math.Tan(math.Log(math.Sqrt(math.Exp(b)))))) } if a == 1.23456 { panic("make sure result is used") } } type T struct { in string out interface{} } func TestMany(test *testing.T) { tests := []T{ {"1+1", 2.}, {"7-5", 2.}, {"2*3", 6.}, {"10/5", 2.}, {"1+10/5", 3.}, {"10/5+1", 3.}, {"(1+14)/5", 3.}, {"1<1", false}, {"1<2", true}, {"2<1", false}, {"1>1", false}, {"2>1", true}, {"1>2", false}, {"1<=1", true}, {"1<=2", true}, {"2<=1", false}, {"1>=1", true}, {"2>=1", true}, {"1>=2", false}} w := NewWorld() for _, t := range tests { out := w.MustEval(t.in) if !reflect.DeepEqual(out, t.out) { test.Error(t.in, "returned", out, "expected:", t.out) } } } // Test a few cases that should not compile func TestFail(test *testing.T) { w := NewWorld() w.Const("c", 3e8) a := 1. w.Var("a", &a) tests := []string{"c=1", "undefined", "1++", "a=true", "x:=a++"} for _, t := range tests { _, err := w.Compile(t) if err == nil { test.Error(t, "should not compile") } else { log.Println(t, ":", err, ":OK") } } } mumax3-3.10/script/selector.go000066400000000000000000000023401371432437400163310ustar00rootroot00000000000000package script import ( "fmt" "go/ast" "reflect" "strings" "unicode" ) const GoExclusiveMethodSuffix = "Go" type selector struct { x Expr method string } // compiles a selector statement like x.sel func (w *World) compileSelectorStmt(n *ast.SelectorExpr) Expr { x := w.compileExpr(n.X) t := x.Type() if t == nil { panic(err(n.Pos(), "void does not have member", n.Sel.Name)) } sel := strings.ToLower(n.Sel.Name) N := "" for i := 0; i < t.NumMethod(); i++ { name := t.Method(i).Name if strings.ToLower(name) == sel && unicode.IsUpper(rune(name[0])) && !strings.HasSuffix(name, GoExclusiveMethodSuffix) { N = t.Method(i).Name break } } if N == "" { panic(err(n.Pos(), t, "has no method", n.Sel.Name)) } return &selector{x, N} } func (e *selector) Eval() interface{} { obj := reflect.ValueOf(e.x.Eval()) meth := obj.MethodByName(e.method) 
if meth.Kind() == 0 { panic(fmt.Sprint(e.x, " has no method ", e.method)) } return meth.Interface() } func (e *selector) Type() reflect.Type { return reflect.New(e.x.Type()).Elem().MethodByName(e.method).Type() } func (e *selector) Child() []Expr { return []Expr{e.x} } func (e *selector) Fix() Expr { return &selector{x: e.x.Fix(), method: e.method} } mumax3-3.10/script/source.go000066400000000000000000000011431371432437400160110ustar00rootroot00000000000000package script import ( "go/ast" "go/token" "io/ioutil" ) func (w *World) compileSource(n *ast.CallExpr) Expr { if len(n.Args) != 1 { panic(err(n.Pos(), "source() needs 1 string argument, got", len(n.Args))) } arg := n.Args[0] if lit, ok := arg.(*ast.BasicLit); ok && lit.Kind == token.STRING { code, err1 := ioutil.ReadFile(lit.Value[1 : len(lit.Value)-1]) if err1 != nil { panic(err(n.Pos(), err1)) } block, err2 := w.Compile(string(code)) if err1 != nil { panic(err(n.Pos(), err2)) } return block } else { panic(err(n.Pos(), "source() needs literal string argument")) } } mumax3-3.10/script/stdlib.go000066400000000000000000000056031371432437400157770ustar00rootroot00000000000000package script import ( "fmt" "math" "math/rand" "time" ) // Loads standard functions into the world. 
func (w *World) LoadStdlib() { // literals w.declare("true", boolLit(true)) w.declare("false", boolLit(false)) // math w.Func("abs", math.Abs) w.Func("acos", math.Acos) w.Func("acosh", math.Acosh) w.Func("asin", math.Asin) w.Func("asinh", math.Asinh) w.Func("atan", math.Atan) w.Func("atanh", math.Atanh) w.Func("cbrt", math.Cbrt) w.Func("ceil", math.Ceil) w.Func("cos", math.Cos) w.Func("cosh", math.Cosh) w.Func("erf", math.Erf) w.Func("erfc", math.Erfc) w.Func("exp", math.Exp) w.Func("exp2", math.Exp2) w.Func("expm1", math.Expm1) w.Func("floor", math.Floor) w.Func("gamma", math.Gamma) w.Func("j0", math.J0) w.Func("j1", math.J1) w.Func("log", math.Log) w.Func("log10", math.Log10) w.Func("log1p", math.Log1p) w.Func("log2", math.Log2) w.Func("logb", math.Logb) w.Func("sin", math.Sin) w.Func("sinh", math.Sinh) w.Func("sqrt", math.Sqrt) w.Func("tan", math.Tan) w.Func("tanh", math.Tanh) w.Func("trunc", math.Trunc) w.Func("y0", math.Y0) w.Func("y1", math.Y1) w.Func("ilogb", math.Ilogb) w.Func("pow10", math.Pow10) w.Func("atan2", math.Atan2) w.Func("hypot", math.Hypot) w.Func("remainder", math.Remainder) w.Func("max", math.Max) w.Func("min", math.Min) w.Func("mod", math.Mod) w.Func("pow", math.Pow) w.Func("yn", math.Yn) w.Func("jn", math.Jn) w.Func("ldexp", math.Ldexp) w.Func("isInf", math.IsInf) w.Func("isNaN", math.IsNaN) w.Func("norm", norm, "Standard normal distribution") w.Func("heaviside", heaviside, "Returns 1 if x>0, 0 if x<0, and 0.5 if x==0") w.Func("sinc", sinc, "Sinc returns sin(x)/x. 
If x=0, then Sinc(x) returns 0.") w.Func("randSeed", intseed, "Sets the random number seed") w.Func("rand", rng.Float64, "Random number between 0 and 1") w.Func("randExp", rng.ExpFloat64, "Exponentially distributed random number between 0 and +inf, mean=1") w.Func("randNorm", rng.NormFloat64, "Standard normal random number") w.Func("randInt", randInt, "Random non-negative integer") w.declare("pi", floatLit(math.Pi)) w.declare("inf", floatLit(math.Inf(1))) //string w.Func("sprint", fmt.Sprint, "Print all arguments to string with automatic formatting") w.Func("sprintf", fmt.Sprintf, "Print to string with C-style formatting.") //time w.Func("now", time.Now, "Returns the current time") w.Func("since", time.Since, "Returns the time elapsed since argument") } var rng = rand.New(rand.NewSource(0)) // script does not know int64 func intseed(seed int) { rng.Seed(int64(seed)) } func randInt(upper int) int { return rng.Int() % upper } func heaviside(x float64) float64 { switch { default: return 1 case x == 0: return 0.5 case x < 0: return 0 } } func norm(x float64) float64 { return (1 / math.Sqrt(2*math.Pi)) * math.Exp(-0.5*x*x) } func sinc(x float64) float64 { if x == 0 { return 1 } else { return math.Sin(x) / x } } mumax3-3.10/script/stmt.go000066400000000000000000000023041371432437400155000ustar00rootroot00000000000000package script import ( "go/ast" "reflect" ) // compiles expression or statement func (w *World) compile(n ast.Node) Expr { switch n := n.(type) { case ast.Stmt: return w.compileStmt(n) case ast.Expr: return w.compileExpr(n) default: panic(err(n.Pos(), "not allowed")) } } // compiles a statement func (w *World) compileStmt(st ast.Stmt) Expr { switch st := st.(type) { default: panic(err(st.Pos(), "not allowed:", typ(st))) case *ast.EmptyStmt: return &emptyStmt{} case *ast.AssignStmt: return w.compileAssignStmt(st) case *ast.ExprStmt: return w.compileExpr(st.X) case *ast.IfStmt: return w.compileIfStmt(st) case *ast.ForStmt: return w.compileForStmt(st) case 
*ast.IncDecStmt: return w.compileIncDecStmt(st) case *ast.BlockStmt: w.EnterScope() defer w.ExitScope() return w.compileBlockStmt_noScope(st) } } // embed to get Type() that returns nil type void struct{} func (v *void) Type() reflect.Type { return nil } func (v *void) Fix() Expr { panic(invalid_closure) } type emptyStmt struct{ void } func (*emptyStmt) Child() []Expr { return nil } func (*emptyStmt) Eval() interface{} { return nil } const invalid_closure = "illegal statement in closure" mumax3-3.10/script/test.txt000066400000000000000000000000051371432437400156760ustar00rootroot00000000000000a:=1 mumax3-3.10/script/typeconv.go000066400000000000000000000106771371432437400163740ustar00rootroot00000000000000package script import ( "fmt" "github.com/mumax/3/data" "go/token" "reflect" ) // converts in to an expression of type OutT. // also serves as type check (not convertible == type error) // pos is used for error message on impossible conversion. func typeConv(pos token.Pos, in Expr, outT reflect.Type) Expr { inT := in.Type() switch { default: panic(err(pos, "type mismatch: can not use type", inT, "as", outT)) // treat 'void' (type nil) separately: case inT == nil && outT != nil: panic(err(pos, "void used as value")) case inT != nil && outT == nil: panic("script internal bug: void input type") // strict go conversions: case inT == outT: return in case inT.AssignableTo(outT): return in // extra conversions for ease-of-use: // int -> float64 case outT == float64_t && inT == int_t: return &intToFloat64{in} // float64 -> int case outT == int_t && inT == float64_t: return &float64ToInt{in} case outT == float64_t && inT.AssignableTo(ScalarIf_t): return &getScalar{in.Eval().(ScalarIf)} case outT == float64_t && inT.AssignableTo(VectorIf_t): return &getVector{in.Eval().(VectorIf)} // magical expression -> function conversions case inT == float64_t && outT.AssignableTo(ScalarFunction_t): return &scalFn{in} case inT == int_t && outT.AssignableTo(ScalarFunction_t): return 
&scalFn{&intToFloat64{in}} case inT == vector_t && outT.AssignableTo(VectorFunction_t): return &vecFn{in} case inT == bool_t && outT == func_bool_t: return &boolToFunc{in} } } // returns input type for expression. Usually this is the same as the return type, // unless the expression has a method InputType()reflect.Type. func inputType(e Expr) reflect.Type { if in, ok := e.(interface { InputType() reflect.Type }); ok { return in.InputType() } return e.Type() } // common type definitions var ( float64_t = reflect.TypeOf(float64(0)) bool_t = reflect.TypeOf(false) func_float64_t = reflect.TypeOf(func() float64 { panic(0) }) func_bool_t = reflect.TypeOf(func() bool { panic(0) }) int_t = reflect.TypeOf(int(0)) string_t = reflect.TypeOf("") vector_t = reflect.TypeOf(data.Vector{}) func_vector_t = reflect.TypeOf(func() data.Vector { panic(0) }) ScalarFunction_t = reflect.TypeOf(dummy_f).In(0) VectorFunction_t = reflect.TypeOf(dummy_f3).In(0) ScalarIf_t = reflect.TypeOf(dummy_scalarif).In(0) VectorIf_t = reflect.TypeOf(dummy_vectorif).In(0) ) // maneuvers to get interface type of Func (simpler way?) 
func dummy_f(ScalarFunction) {} func dummy_f3(VectorFunction) {} func dummy_scalarif(ScalarIf) {} func dummy_vectorif(VectorIf) {} // converts int to float64 type intToFloat64 struct{ in Expr } func (c *intToFloat64) Eval() interface{} { return float64(c.in.Eval().(int)) } func (c *intToFloat64) Type() reflect.Type { return float64_t } func (c *intToFloat64) Child() []Expr { return []Expr{c.in} } func (c *intToFloat64) Fix() Expr { return &intToFloat64{in: c.in.Fix()} } // converts float64 to int type float64ToInt struct{ in Expr } func (c *float64ToInt) Eval() interface{} { return safe_int(c.in.Eval().(float64)) } func (c *float64ToInt) Type() reflect.Type { return int_t } func (c *float64ToInt) Child() []Expr { return []Expr{c.in} } func (c *float64ToInt) Fix() Expr { return &float64ToInt{in: c.in.Fix()} } type boolToFunc struct{ in Expr } func (c *boolToFunc) Eval() interface{} { return func() bool { return c.in.Eval().(bool) } } func (c *boolToFunc) Type() reflect.Type { return func_bool_t } func (c *boolToFunc) Child() []Expr { return []Expr{c.in} } func (c *boolToFunc) Fix() Expr { return &boolToFunc{in: c.in.Fix()} } type getScalar struct{ in ScalarIf } type getVector struct{ in VectorIf } func (c *getScalar) Eval() interface{} { return c.in.Get() } func (c *getScalar) Type() reflect.Type { return float64_t } func (c *getScalar) Child() []Expr { return nil } func (c *getScalar) Fix() Expr { return NewConst(c) } func (c *getVector) Eval() interface{} { return c.in.Get() } func (c *getVector) Type() reflect.Type { return vector_t } func (c *getVector) Child() []Expr { return nil } func (c *getVector) Fix() Expr { return NewConst(c) } func safe_int(x float64) int { i := int(x) if float64(i) != x { panic(fmt.Errorf("can not use %v as int", x)) } return i } type ScalarIf interface { Get() float64 } // TODO: Scalar type VectorIf interface { Get() data.Vector } // TODO: Vector 
mumax3-3.10/script/unaryexpr.go000066400000000000000000000015731371432437400165550ustar00rootroot00000000000000package script import ( "go/ast" "go/token" "reflect" ) func (w *World) compileUnaryExpr(n *ast.UnaryExpr) Expr { x := w.compileExpr(n.X) switch n.Op { default: panic(err(n.Pos(), "not allowed:", n.Op)) case token.SUB: return &minus{typeConv(n.X.Pos(), x, float64_t)} case token.NOT: return ¬{typeConv(n.X.Pos(), x, bool_t)} } } type minus struct{ x Expr } func (m *minus) Type() reflect.Type { return float64_t } func (m *minus) Eval() interface{} { return -m.x.Eval().(float64) } func (m *minus) Child() []Expr { return []Expr{m.x} } func (m *minus) Fix() Expr { return &minus{m.x.Fix()} } type not struct{ x Expr } func (m *not) Type() reflect.Type { return bool_t } func (m *not) Eval() interface{} { return !m.x.Eval().(bool) } func (m *not) Child() []Expr { return []Expr{m.x} } func (m *not) Fix() Expr { return ¬{m.x.Fix()} } mumax3-3.10/script/world.go000066400000000000000000000074311371432437400156460ustar00rootroot00000000000000// package script provides a script interpreter for input files and GUI commands. package script import ( "fmt" "go/token" "strings" ) // World stores an interpreted program's state // like declared variables and functions. type World struct { *scope toplevel *scope } // scope stores identifiers type scope struct { Identifiers map[string]Expr // set of defined identifiers parent *scope // parent scope, if any Doc map[string]string // documentation for identifiers } func NewWorld() *World { w := new(World) w.scope = new(scope) w.toplevel = w.scope w.toplevel.Doc = make(map[string]string) w.LoadStdlib() // loads into toplevel return w } func (w *scope) init() { if w.Identifiers == nil { w.Identifiers = make(map[string]Expr) } } // adds a native variable to the world. 
E.g.: // var x = 3.14 // world.Var("x", &x) // world.MustEval("x") // returns 3.14 func (w *scope) Var(name string, addr interface{}, doc ...string) { w.declare(name, newReflectLvalue(addr), doc...) } // Hack for fixing the closure caveat: // Decleare the time variable, the only variable closures close over. func (w *scope) TVar(name string, addr interface{}, doc ...string) { w.declare(name, &TVar{newReflectLvalue(addr)}, doc...) } // adds a native variable to the world. It cannot be changed from script. // var x = 3.14 // world.ROnly("x", &x) // world.MustEval("x") // returns 3.14 // world.MustExec("x=2") // fails: cannot assign to x func (w *scope) ROnly(name string, addr interface{}, doc ...string) { w.declare(name, newReflectROnly(addr), doc...) } // adds a constant. Cannot be changed in any way. func (w *scope) Const(name string, val interface{}, doc ...string) { switch v := val.(type) { default: panic(fmt.Errorf("const of type %v not handled", typ(v))) // todo: const using reflection case float64: w.declare(name, floatLit(v), doc...) case int: w.declare(name, intLit(v), doc...) } } // adds a special variable to the world. Upon assignment, // v's Set() will be called. func (w *scope) LValue(name string, v LValue, doc ...string) { w.declare(name, v, doc...) } // adds a native function to the world. E.g.: // world.Func("sin", math.Sin) // world.MustEval("sin(0)") // returns 0 func (w *scope) Func(name string, f interface{}, doc ...string) { w.declare(name, newFunction(f), doc...) } // add identifier but check that it's not declared yet. func (w *scope) declare(key string, value Expr, doc ...string) { if ok := w.safeDeclare(key, value); !ok { panic("identifier " + key + " already defined") } w.document(key, doc...) 
} func (w *scope) safeDeclare(key string, value Expr) (ok bool) { w.init() lname := strings.ToLower(key) if _, ok := w.Identifiers[lname]; ok { return false } w.Identifiers[lname] = value return true } // resolve identifier in this scope or its parents func (w *scope) resolve(pos token.Pos, name string) Expr { w.init() lname := strings.ToLower(name) if v, ok := w.Identifiers[lname]; ok { return v } else { if w.parent != nil { return w.parent.resolve(pos, name) } panic(err(pos, "undefined:", name)) } } func (w *World) Resolve(identifier string) (e Expr) { defer func() { err := recover() if err != nil { e = nil // not found } }() e = w.toplevel.resolve(0, identifier) return } // add documentation for identifier func (w *scope) document(ident string, doc ...string) { if w.Doc != nil { // means we want doc for this scope (toplevel only) switch len(doc) { default: panic("too many doc strings for " + ident) case 0: w.Doc[ident] = "" case 1: w.Doc[ident] = doc[0] } } } func (w *World) EnterScope() { par := w.scope w.scope = new(scope) w.scope.parent = par } func (w *World) ExitScope() { w.scope = w.scope.parent if w.scope == nil { // went above toplevel panic("bug") } } mumax3-3.10/svgo/000077500000000000000000000000001371432437400136355ustar00rootroot00000000000000mumax3-3.10/svgo/LICENSE000066400000000000000000000002401371432437400146360ustar00rootroot00000000000000The contents of this repository are Licensed under the Creative Commons Attribution 3.0 license as described in http://creativecommons.org/licenses/by/3.0/us/ mumax3-3.10/svgo/Makefile000066400000000000000000000000211371432437400152660ustar00rootroot00000000000000all: go install mumax3-3.10/svgo/doc.go000066400000000000000000000054341371432437400147370ustar00rootroot00000000000000/* Package svg generates SVG as defined by the Scalable Vector Graphics 1.1 Specification (). Output goes to the specified io.Writer. 
Supported SVG elements and functions Shapes, lines, text circle, ellipse, polygon, polyline, rect (including roundrects), line, text Paths general, arc, cubic and quadratic bezier paths, Image and Gradients image, linearGradient, radialGradient, Transforms translate, rotate, scale, skewX, skewY Filter Effects filter, feBlend, feColorMatrix, feColorMatrix, feComponentTransfer, feComposite, feConvolveMatrix, feDiffuseLighting, feDisplacementMap, feDistantLight, feFlood, feGaussianBlur, feImage, feMerge, feMorphology, feOffset, fePointLight, feSpecularLighting, feSpotLight,feTile, feTurbulence Metadata elements desc, defs, g (style, transform, id), mask, marker, pattern, title, (a)ddress, link, script, use Usage: (assuming GOPATH is set) go get github.com/ajstarks/svgo go install github.com/ajstarks/svgo/... You can use godoc to browse the documentation from the command line: $ godoc github.com/ajstarks/svgo a minimal program, to generate SVG to standard output. package main import ( "github.com/ajstarks/svgo" "os" ) func main() { width := 500 height := 500 canvas := svg.New(os.Stdout) canvas.Start(width, height) canvas.Circle(width/2, height/2, 100) canvas.Text(width/2, height/2, "Hello, SVG", "text-anchor:middle;font-size:30px;fill:white") canvas.End() } Drawing in a web server: (http://localhost:2003/circle) package main import ( "log" "github.com/ajstarks/svgo" "net/http" ) func main() { http.Handle("/circle", http.HandlerFunc(circle)) err := http.ListenAndServe(":2003", nil) if err != nil { log.Fatal("ListenAndServe:", err) } } func circle(w http.ResponseWriter, req *http.Request) { w.Header().Set("Content-Type", "image/svg+xml") s := svg.New(w) s.Start(500, 500) s.Circle(250, 250, 125, "fill:none;stroke:black") s.End() } Functions and types Many functions use x, y to specify an object's location, and w, h to specify the object's width and height. Where applicable, a final optional argument specifies the style to be applied to the object. 
The style strings follow the SVG standard; name:value pairs delimited by semicolons, or a series of name="value" pairs. For example: `"fill:none; opacity:0.3"` or `fill="none" opacity="0.3"` (see: ) The Offcolor type: type Offcolor struct { Offset uint8 Color string Opacity float } is used to specify the offset, color, and opacity of stop colors in linear and radial gradients The Filterspec type: type Filterspec struct { In string In2 string Result string } is used to specify inputs and results for filter effects */ package svg mumax3-3.10/svgo/svg.go000066400000000000000000001037371371432437400147760ustar00rootroot00000000000000// Package svg provides an API for generating Scalable Vector Graphics (SVG) // Edited by Arne Vansteenkiste, 2014: // allow non-integer coordinates package svg // package main // // import ( // "github.com/ajstarks/svgo" // "os" // ) // // var ( // width = 500 // height = 500 // canvas = svg.New(os.Stdout) // ) // // func main() { // canvas.Start(width, height) // canvas.Circle(width/2, height/2, 100) // canvas.Text(width/2, height/2, "Hello, SVG", // "text-anchor:middle;font-size:30px;fill:white") // canvas.End() // } // import ( "fmt" "io" "encoding/xml" "strings" ) // SVG defines the location of the generated SVG type SVG struct { Writer io.Writer } // Offcolor defines the offset and color for gradients type Offcolor struct { Offset uint8 Color string Opacity float64 } // Filterspec defines the specification of SVG filters type Filterspec struct { In, In2, Result string } const ( svginit = ` ` vbfmt = `viewBox="%d %d %d %d"` emptyclose = "/>\n" ) // New is the SVG constructor, specifying the io.Writer where the generated SVG is written. func New(w io.Writer) *SVG { return &SVG{w} } func (svg *SVG) print(a ...interface{}) (n int, errno error) { return fmt.Fprint(svg.Writer, a...) } func (svg *SVG) println(a ...interface{}) (n int, error error) { return fmt.Fprintln(svg.Writer, a...) 
} func (svg *SVG) printf(format string, a ...interface{}) (n int, errno error) { return fmt.Fprintf(svg.Writer, format, a...) } // Structure, Metadata, Scripting, Transformation, and Links // Start begins the SVG document with the width w and height h. // Other attributes may be optionally added, for example viewbox or additional namespaces // Standard Reference: http://www.w3.org/TR/SVG11/struct.html#SVGElement func (svg *SVG) Start(w int, h int, ns ...string) { svg.printf(svginit, w, h) for _, v := range ns { svg.printf("\n %s", v) } svg.println(svgns) } // Startview begins the SVG document, with the specified width, height, and viewbox func (svg *SVG) Startview(w, h, minx, miny, vw, vh int) { svg.Start(w, h, fmt.Sprintf(vbfmt, minx, miny, vw, vh)) } // End the SVG document func (svg *SVG) End() { svg.println("") } // Script defines a script with a specified type, (for example "application/javascript"). // if the first variadic argument is a link, use only the link reference. // Otherwise, treat those arguments as the text of the script (marked up as CDATA). // if no data is specified, just close the script element func (svg *SVG) Script(scriptype string, data ...string) { svg.printf(`\n") default: svg.println(`/>`) } } // Gstyle begins a group, with the specified style. 
// Standard Reference: http://www.w3.org/TR/SVG11/struct.html#GElement func (svg *SVG) Gstyle(s string) { svg.println(group("style", s)) } // Gtransform begins a group, with the specified transform // Standard Reference: http://www.w3.org/TR/SVG11/coords.html#TransformAttribute func (svg *SVG) Gtransform(s string) { svg.println(group("transform", s)) } // Translate begins coordinate translation, end with Gend() // Standard Reference: http://www.w3.org/TR/SVG11/coords.html#TransformAttribute func (svg *SVG) Translate(x, y int) { svg.Gtransform(translate(x, y)) } // Scale scales the coordinate system by n, end with Gend() // Standard Reference: http://www.w3.org/TR/SVG11/coords.html#TransformAttribute func (svg *SVG) Scale(n float64) { svg.Gtransform(scale(n)) } // ScaleXY scales the coordinate system by dx and dy, end with Gend() // Standard Reference: http://www.w3.org/TR/SVG11/coords.html#TransformAttribute func (svg *SVG) ScaleXY(dx, dy float64) { svg.Gtransform(scaleXY(dx, dy)) } // SkewX skews the x coordinate system by angle a, end with Gend() // Standard Reference: http://www.w3.org/TR/SVG11/coords.html#TransformAttribute func (svg *SVG) SkewX(a float64) { svg.Gtransform(skewX(a)) } // SkewY skews the y coordinate system by angle a, end with Gend() // Standard Reference: http://www.w3.org/TR/SVG11/coords.html#TransformAttribute func (svg *SVG) SkewY(a float64) { svg.Gtransform(skewY(a)) } // SkewXY skews x and y coordinates by ax, ay respectively, end with Gend() // Standard Reference: http://www.w3.org/TR/SVG11/coords.html#TransformAttribute func (svg *SVG) SkewXY(ax, ay float64) { svg.Gtransform(skewX(ax) + " " + skewY(ay)) } // Rotate rotates the coordinate system by r degrees, end with Gend() // Standard Reference: http://www.w3.org/TR/SVG11/coords.html#TransformAttribute func (svg *SVG) Rotate(r float64) { svg.Gtransform(rotate(r)) } // TranslateRotate translates the coordinate system to (x,y), then rotates to r degrees, end with Gend() func (svg *SVG) 
TranslateRotate(x, y int, r float64) { svg.Gtransform(translate(x, y) + " " + rotate(r)) } // RotateTranslate rotates the coordinate system r degrees, then translates to (x,y), end with Gend() func (svg *SVG) RotateTranslate(x, y int, r float64) { svg.Gtransform(rotate(r) + " " + translate(x, y)) } // Group begins a group with arbitrary attributes func (svg *SVG) Group(s ...string) { svg.printf("`)) } // Gid begins a group, with the specified id func (svg *SVG) Gid(s string) { svg.print(``) } // Gend ends a group (must be paired with Gsttyle, Gtransform, Gid). func (svg *SVG) Gend() { svg.println(``) } // ClipPath defines a clip path func (svg *SVG) ClipPath(s ...string) { svg.printf(``)) } // ClipEnd ends a ClipPath func (svg *SVG) ClipEnd() { svg.println(``) } // Def begins a defintion block. // Standard Reference: http://www.w3.org/TR/SVG11/struct.html#DefsElement func (svg *SVG) Def() { svg.println(``) } // DefEnd ends a defintion block. func (svg *SVG) DefEnd() { svg.println(``) } // Marker defines a marker // Standard reference: http://www.w3.org/TR/SVG11/painting.html#MarkerElement func (svg *SVG) Marker(id string, x, y, width, height int, s ...string) { svg.printf(`\n")) } // MarkEnd ends a marker func (svg *SVG) MarkerEnd() { svg.println(``) } // Pattern defines a pattern with the specified dimensions. // The putype can be either "user" or "obj", which sets the patternUnits // attribute to be either userSpaceOnUse or objectBoundingBox // Standard reference: http://www.w3.org/TR/SVG11/pservers.html#Patterns func (svg *SVG) Pattern(id string, x, y, width, height int, putype string, s ...string) { puattr := "userSpaceOnUse" if putype != "user" { puattr = "objectBoundingBox" } svg.printf(`\n")) } // PatternEnd ends a marker func (svg *SVG) PatternEnd() { svg.println(``) } // Desc specified the text of the description tag. 
// Standard Reference: http://www.w3.org/TR/SVG11/struct.html#DescElement func (svg *SVG) Desc(s string) { svg.tt("desc", s) } // Title specified the text of the title tag. // Standard Reference: http://www.w3.org/TR/SVG11/struct.html#TitleElement func (svg *SVG) Title(s string) { svg.tt("title", s) } // Link begins a link named "name", with the specified title. // Standard Reference: http://www.w3.org/TR/SVG11/linking.html#Links func (svg *SVG) Link(href string, title string) { svg.printf("") } // LinkEnd ends a link. func (svg *SVG) LinkEnd() { svg.println(``) } // Use places the object referenced at link at the location x, y, with optional style. // Standard Reference: http://www.w3.org/TR/SVG11/struct.html#UseElement func (svg *SVG) Use(x int, y int, link string, s ...string) { svg.printf(``)) } // MaskEnd ends a Mask. func (svg *SVG) MaskEnd() { svg.println(``) } // Shapes // Circle centered at x,y, with radius r, with optional style. // Standard Reference: http://www.w3.org/TR/SVG11/shapes.html#CircleElement func (svg *SVG) Circle(x int, y int, r int, s ...string) { svg.printf(`")) xml.Escape(svg.Writer, []byte(t)) svg.println(``) } // Textpath places text optionally styled text along a previously defined path // Standard Reference: http://www.w3.org/TR/SVG11/text.html#TextPathElement func (svg *SVG) Textpath(t string, pathid string, s ...string) { svg.printf("", endstyle(s, ">"), pathid) xml.Escape(svg.Writer, []byte(t)) svg.println(``) } // Textlines places a series of lines of text starting at x,y, at the specified size, fill, and alignment. // Each line is spaced according to the spacing argument func (svg *SVG) Textlines(x, y int, s []string, size, spacing int, fill, align string) { svg.Gstyle(fmt.Sprintf("font-size:%dpx;fill:%s;text-anchor:%s", size, fill, align)) for _, t := range s { svg.Text(x, y, t) y += spacing } svg.Gend() } // Colors // RGB specifies a fill color in terms of a (r)ed, (g)reen, (b)lue triple. 
// Standard reference: http://www.w3.org/TR/css3-color/ func (svg *SVG) RGB(r int, g int, b int) string { return fmt.Sprintf(`fill:rgb(%d,%d,%d)`, r, g, b) } // RGBA specifies a fill color in terms of a (r)ed, (g)reen, (b)lue triple and opacity. func (svg *SVG) RGBA(r int, g int, b int, a float64) string { return fmt.Sprintf(`fill-opacity:%.2f; %s`, a, svg.RGB(r, g, b)) } // Gradients // LinearGradient constructs a linear color gradient identified by id, // along the vector defined by (x1,y1), and (x2,y2). // The stop color sequence defined in sc. Coordinates are expressed as percentages. func (svg *SVG) LinearGradient(id string, x1, y1, x2, y2 uint8, sc []Offcolor) { svg.printf("\n", id, pct(x1), pct(y1), pct(x2), pct(y2)) svg.stopcolor(sc) svg.println("") } // RadialGradient constructs a radial color gradient identified by id, // centered at (cx,cy), with a radius of r. // (fx, fy) define the location of the focal point of the light source. // The stop color sequence defined in sc. // Coordinates are expressed as percentages. func (svg *SVG) RadialGradient(id string, cx, cy, r, fx, fy uint8, sc []Offcolor) { svg.printf("\n", id, pct(cx), pct(cy), pct(r), pct(fx), pct(fy)) svg.stopcolor(sc) svg.println("") } // stopcolor is a utility function used by the gradient functions // to define a sequence of offsets (expressed as percentages) and colors func (svg *SVG) stopcolor(oc []Offcolor) { for _, v := range oc { svg.printf("\n", pct(v.Offset), v.Color, v.Opacity) } } // Filter Effects: // Most functions have common attributes (in, in2, result) defined in type Filterspec // used as a common first argument. 
// Filter begins a filter set // Standard reference: http://www.w3.org/TR/SVG11/filters.html#FilterElement func (svg *SVG) Filter(id string, s ...string) { svg.printf(`\n")) } // Fend ends a filter set // Standard reference: http://www.w3.org/TR/SVG11/filters.html#FilterElement func (svg *SVG) Fend() { svg.println(``) } // FeBlend specifies a Blend filter primitive // Standard reference: http://www.w3.org/TR/SVG11/filters.html#feBlendElement func (svg *SVG) FeBlend(fs Filterspec, mode string, s ...string) { switch mode { case "normal", "multiply", "screen", "darken", "lighten": break default: mode = "normal" } svg.printf(` 360 { value = 0 } svg.printf(` 1 { value = 1 } svg.printf(``) } // FeCompEnd ends a feComponent filter element // Standard reference: http://www.w3.org/TR/SVG11/filters.html#feComponentTransferElement func (svg *SVG) FeCompEnd() { svg.println(``) } // FeComposite specifies a feComposite filter primitive // Standard reference: http://www.w3.org/TR/SVG11/filters.html#feCompositeElement func (svg *SVG) FeComposite(fs Filterspec, operator string, k1, k2, k3, k4 int, s ...string) { switch operator { case "over", "in", "out", "atop", "xor", "arithmetic": break default: operator = "over" } svg.printf(``)) } // FeDiffEnd ends a diffuse lighting filter primitive container // Standard reference: http://www.w3.org/TR/SVG11/filters.html#feDiffuseLightingElement func (svg *SVG) FeDiffEnd() { svg.println(``) } // FeDisplacementMap specifies a feDisplacementMap filter primitive // Standard reference: http://www.w3.org/TR/SVG11/filters.html#feDisplacementMapElement func (svg *SVG) FeDisplacementMap(fs Filterspec, scale float64, xchannel, ychannel string, s ...string) { svg.printf(``) for _, n := range nodes { svg.printf("\n", n) } svg.println(``) } // FeMorphology specifies a feMorphologyLight filter primitive // Standard reference: http://www.w3.org/TR/SVG11/filters.html#feMorphologyElement func (svg *SVG) FeMorphology(fs Filterspec, operator string, xradius, 
yradius float64, s ...string) { switch operator { case "erode", "dilate": break default: operator = "erode" } svg.printf(`\n")) } // FeSpecEnd ends a specular lighting filter primitive container // Standard reference: http://www.w3.org/TR/SVG11/filters.html#feSpecularLightingElement func (svg *SVG) FeSpecEnd() { svg.println(``) } // FeSpotLight specifies a feSpotLight filter primitive // Standard reference: http://www.w3.org/TR/SVG11/filters.html#feSpotLightElement func (svg *SVG) FeSpotLight(fs Filterspec, x, y, z, px, py, pz float64, s ...string) { svg.printf(` 1 { bfx = 0 } if bfy < 0 || bfy > 1 { bfy = 0 } switch ftype[0:1] { case "f", "F": ftype = "fractalNoise" case "t", "T": ftype = "turbulence" default: ftype = "turbulence" } var ss string if stitch { ss = "stitch" } else { ss = "noStitch" } svg.printf(` 0 { svg.Gstyle(s[0]) } for ix := x; ix <= x+w; ix += n { svg.Line(ix, y, ix, y+h) } for iy := y; iy <= y+h; iy += n { svg.Line(x, iy, x+w, iy) } if len(s) > 0 { svg.Gend() } } // Support functions // style returns a style name,attribute string func style(s string) string { if len(s) > 0 { return fmt.Sprintf(`style="%s"`, s) } return s } // pp returns a series of polygon points func (svg *SVG) pp(x []float64, y []float64, tag string) { svg.print(tag) if len(x) != len(y) { svg.print(" ") return } lx := len(x) - 1 for i := 0; i < lx; i++ { svg.print(coord(x[i], y[i]) + " ") } svg.print(coord(x[lx], y[lx])) } // endstyle modifies an SVG object, with either a series of name="value" pairs, // or a single string containing a style func endstyle(s []string, endtag string) string { if len(s) > 0 { nv := "" for i := 0; i < len(s); i++ { if strings.Index(s[i], "=") > 0 { nv += (s[i]) + " " } else { nv += style(s[i]) } } return nv + endtag } return endtag } // tt creates a xml element, tag containing s func (svg *SVG) tt(tag string, s string) { svg.print("<" + tag + ">") xml.Escape(svg.Writer, []byte(s)) svg.println("") } // poly compiles the polygon element func (svg 
*SVG) poly(x []float64, y []float64, tag string, s ...string) { svg.pp(x, y, "<"+tag+" points=\"") svg.print(`" ` + endstyle(s, "/>\n")) } // onezero returns "0" or "1" func onezero(flag bool) string { if flag { return "1" } return "0" } // pct returns a percetage, capped at 100 func pct(n uint8) uint8 { if n > 100 { return 100 } return n } // islink determines if a string is a script reference func islink(link string) bool { return strings.HasPrefix(link, "http://") || strings.HasPrefix(link, "#") || strings.HasPrefix(link, "../") || strings.HasPrefix(link, "./") } // group returns a group element func group(tag string, value string) string { return fmt.Sprintf(``, tag, value) } // scale return the scale string for the transform func scale(n float64) string { return fmt.Sprintf(`scale(%g)`, n) } // scaleXY return the scale string for the transform func scaleXY(dx, dy float64) string { return fmt.Sprintf(`scale(%g,%g)`, dx, dy) } // skewx returns the skewX string for the transform func skewX(angle float64) string { return fmt.Sprintf(`skewX(%g)`, angle) } // skewx returns the skewX string for the transform func skewY(angle float64) string { return fmt.Sprintf(`skewY(%g)`, angle) } // rotate returns the rotate string for the transform func rotate(r float64) string { return fmt.Sprintf(`rotate(%g)`, r) } // translate returns the translate string for the transform func translate(x, y int) string { return fmt.Sprintf(`translate(%d,%d)`, x, y) } // coord returns a coordinate string func coord(x interface{}, y interface{}) string { return fmt.Sprintf(`%v,%v`, x, y) } // ptag returns the beginning of the path element func ptag(x int, y int) string { return fmt.Sprintf(` 0 { attrs += fmt.Sprintf(`in="%s" `, s.In) } if len(s.In2) > 0 { attrs += fmt.Sprintf(`in2="%s" `, s.In2) } if len(s.Result) > 0 { attrs += fmt.Sprintf(`result="%s" `, s.Result) } return attrs } // tablevalues outputs a series of values as a XML attribute func (svg *SVG) tablevalues(s string, t []float64) 
{ svg.printf(` %s="`, s) for i := 0; i < len(t)-1; i++ { svg.printf("%g ", t[i]) } svg.printf(`%g"%s`, t[len(t)-1], emptyclose) } // imgchannel validates the image channel indicator func imgchannel(c string) string { switch c { case "R", "G", "B", "A": return c case "r", "g", "b", "a": return strings.ToUpper(c) case "red", "green", "blue", "alpha": return strings.ToUpper(c[0:1]) case "Red", "Green", "Blue", "Alpha": return c[0:1] } return "R" } mumax3-3.10/test/000077500000000000000000000000001371432437400136365ustar00rootroot00000000000000mumax3-3.10/test/.gitignore000066400000000000000000000000231371432437400156210ustar00rootroot00000000000000*.out *.ovf *.todo mumax3-3.10/test/anisenergy.mx3000066400000000000000000000012721371432437400164350ustar00rootroot00000000000000/* Test conservation of energy with anisotropy. */ SetGridSize(32, 10, 2) c := 1e-9 SetCellSize(c, 2*c, 3*c) EnableDemag = false Aex = 10e-12 Msat = 1000e3 AnisU = vector(0, 0, 1) Ku1 = 1e6 m = uniform(1, 0, 0.1) tableadd(E_total) tableautosave(1e-12) // Get idea of energy scale E0 := E_total.get() alpha = 1 run(1e-9) E1 := E_total.get() Delta1 := E1-E0 print("DeltaE, damped:", Delta1) m = uniform(1, 0, 0.1) E0 = E_total.get() alpha = 0 run(1e-9) E1 = E_total.get() Delta2 := E1-E0 print("DeltaE, undamped:", Delta2) ratio := abs(Delta2/Delta1) print("ratio:", ratio) // test relative energy non-conservation up to 1ppm. expect("Relative energy non-conservation:", ratio, 0, 1e-6) mumax3-3.10/test/anisenergy2.mx3000066400000000000000000000012721371432437400165170ustar00rootroot00000000000000/* Test conservation of energy with anisotropy. 
*/ SetGridSize(32, 10, 2) c := 1e-9 SetCellSize(c, 2*c, 3*c) EnableDemag = false Aex = 10e-12 AnisU = vector(0, 0, 1) Ku2 = 1e6 Msat = 1000e3 m = uniform(1, 0, 0.1) tableadd(E_total) tableautosave(1e-12) // Get idea of energy scale E0 := E_total.get() alpha = 1 run(1e-9) E1 := E_total.get() Delta1 := E1-E0 print("DeltaE, damped:", Delta1) m = uniform(1, 0, 0.1) E0 = E_total.get() alpha = 0 run(1e-9) E1 = E_total.get() Delta2 := E1-E0 print("DeltaE, undamped:", Delta2) ratio := abs(Delta2/Delta1) print("ratio:", ratio) // test relative energy non-conservation up to 1ppm. expect("Relative energy non-conservation:", ratio, 0, 1e-6) mumax3-3.10/test/anisenergyconservation.mx3000066400000000000000000000010321371432437400210620ustar00rootroot00000000000000/* Test anisotropy energy conservation. Add cubic and uniaxial anisotropy, so that an off-by-a-factor error in one of them would give an total energy oscillation. */ setgridsize(1, 1, 1) setcellsize(1e-9, 1e-9, 1e-9) Msat = 1000e3 alpha = 1e-6 Kc1 = 1e3 maxDt = 1e-13 Ku1 = 1e5 enabledemag = false AnisC1 = vector(1, 0, 0) AnisC2 = vector(0, 1, 0) AnisU = vector(1, 1, 0) M = uniform(0.3, 0.7, 0.1) E0 := E_total.Get() TOL := 1e-5 for i:=0; i<10; i++{ run(10e-12) E := E_total.Get() expect("deltaE", (E0-E)/E0, 0, TOL) } mumax3-3.10/test/anisenergyconservation2.mx3000066400000000000000000000010611371432437400211460ustar00rootroot00000000000000/* Test higher-order anisotropy energy conservation. Add cubic and uniaxial anisotropy, so that an off-by-a-factor error in one of them would give an total energy oscillation. 
*/ setgridsize(1, 1, 1) setcellsize(1e-9, 1e-9, 1e-9) Msat = 1000e3 alpha = 1e-6 Kc1 = 1e3 maxDt = 1e-13 Ku1 = 1e5 Ku2 = 2e5 enabledemag = false AnisC1 = vector(1, 0, 0) AnisC2 = vector(0, 1, 0) AnisU = vector(1, 1, 0) M = uniform(0.3, 0.7, 0.1) E0 := E_total.Get() TOL := 1e-5 for i:=0; i<10; i++{ run(10e-12) E := E_total.Get() expect("deltaE", (E0-E)/E0, 0, TOL) } mumax3-3.10/test/anisenergyconservation3.mx3000066400000000000000000000010611371432437400211470ustar00rootroot00000000000000/* Test higher-order anisotropy energy conservation. Add cubic and uniaxial anisotropy, so that an off-by-a-factor error in one of them would give an total energy oscillation. */ setgridsize(1, 1, 1) setcellsize(1e-9, 1e-9, 1e-9) Msat = 1000e3 alpha = 1e-6 Kc1 = 1e3 Kc2 = 1e3 maxDt = 1e-13 Ku1 = 1e5 enabledemag = false AnisC1 = vector(1, 0, 0) AnisC2 = vector(0, 1, 0) AnisU = vector(1, 1, 0) M = uniform(0.3, 0.7, 0.1) E0 := E_total.Get() TOL := 1e-5 for i:=0; i<10; i++{ run(10e-12) E := E_total.Get() expect("deltaE", (E0-E)/E0, 0, TOL) } mumax3-3.10/test/anisenergyconservation4.mx3000066400000000000000000000010611371432437400211500ustar00rootroot00000000000000/* Test higher-order anisotropy energy conservation. Add cubic and uniaxial anisotropy, so that an off-by-a-factor error in one of them would give an total energy oscillation. */ setgridsize(1, 1, 1) setcellsize(1e-9, 1e-9, 1e-9) Msat = 1000e3 alpha = 1e-6 Kc1 = 1e3 Kc3 = 1e3 maxDt = 1e-13 Ku1 = 1e5 enabledemag = false AnisC1 = vector(1, 0, 0) AnisC2 = vector(0, 1, 0) AnisU = vector(1, 1, 0) M = uniform(0.3, 0.7, 0.1) E0 := E_total.Get() TOL := 1e-5 for i:=0; i<10; i++{ run(10e-12) E := E_total.Get() expect("deltaE", (E0-E)/E0, 0, TOL) } mumax3-3.10/test/antenna.go000066400000000000000000000020451371432437400156120ustar00rootroot00000000000000//+build ignore package main import ( "github.com/mumax/3/data" . 
"github.com/mumax/3/engine" "github.com/mumax/3/oommf" "math" "os" ) const Mu0 = 4 * math.Pi * 1e-7 func main() { defer InitAndClose()() Nx := 512 Ny := 128 Nz := 1 cellsize := 5.0e-9 SetGridSize(Nx, Ny, Nz) thickness := 40e-9 length := float64(Nx) * cellsize SetCellSize(cellsize, cellsize, thickness/float64(Nz)) mask := data.NewSlice(3, Mesh().Size()) wireX := -length * 0.45 wireZ := thickness * 5.0 for h := 0; h < 10; h++ { for i := 0; i < Nx; i++ { for j := 0; j < Ny; j++ { r := Index2Coord(i, j, 0) r = r.Sub(Vector(wireX+float64(h)*cellsize, r.Y(), wireZ)) B := Vector(0, 0, 0) current := Vector(0, 1, 0) B = r.Cross(current).Mul(Mu0 / (2 * math.Pi * math.Pow(r.Len(), 2))) mask.Set(0, i, j, 0, B.X()) mask.Set(1, i, j, 0, B.Y()) mask.Set(2, i, j, 0, B.Z()) } } } f, _ := os.OpenFile("antenna.ovf", os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0666) defer f.Close() oommf.WriteOVF2(f, mask, data.Meta{}, "binary 4") } mumax3-3.10/test/average.mx3000066400000000000000000000044541371432437400157100ustar00rootroot00000000000000/* Test for averages with a non-universe geometry. Magnetization should be averaged over the geometry, while others should average over the box (parameters, external excitations). Finally, test averages over a region (ignores geometry). 
*/ N := 512 c := 1e-9 setgridsize(N, N, 1) setcellsize(c, c, c) setgeom(circle(N*c)) tol := 1e-4 // tolerance limited by FD circle approximation defregion(1, xrange(-inf, 0)) defregion(2, xrange( 0, inf)) m = uniform(0, 1, 0) // Magnetization should average over the geometry expectv("m", m.average(), vector(0, 1, 0), tol) expect("mx", m.comp(0).average(), 0, tol) expect("my", m.comp(1).average(), 1, tol) expect("mz", m.comp(2).average(), 0, tol) // Average over region ignores geometry, // so here average m feels the surface of the disk expectv("m1", m.region(1).average(), vector(0, pi/4, 0), tol) expect("m1x", m.region(1).average()[0], 0, tol) expect("m1y", m.region(1).average()[1], pi/4, tol) expect("m1z", m.region(1).average()[2], 0, tol) expect("m1x", m.comp(0).region(1).average(), 0, tol) expect("m1y", m.comp(1).region(1).average(), pi/4, tol) expect("m1z", m.comp(2).region(1).average(), 0, tol) // Material parameter is set everywhere and averaged over the box alpha = 2 expect("alpha", alpha.average(), 2, tol) expect("alpha", alpha.region(1).average(), 2, tol) expect("alpha", alpha.region(2).average(), 2, tol) alpha.setRegion(1, 3) expect("alpha", alpha.average(), (2.+3.)/2., tol) // average of 2 and 3 expect("alpha", alpha.region(1).average(), 3, tol) expect("alpha", alpha.region(2).average(), 2, tol) // Excitation is set everywhere and averaged everywhere B_ext = vector(1, 2, 3) expectv("B_ext", B_ext.average(), vector(1, 2, 3), tol) expect("B_ext_x", B_ext.comp(0).average(), 1, tol) expect("B_ext_y", B_ext.comp(1).average(), 2, tol) expect("B_ext_z", B_ext.comp(2).average(), 3, tol) expectv("B_ext_1", B_ext.region(1).average(), vector(1, 2, 3), tol) expect("B_ext_1x", B_ext.region(1).average()[0], 1, tol) expect("B_ext_1y", B_ext.region(1).average()[1], 2, tol) expect("B_ext_1z", B_ext.region(1).average()[2], 3, tol) expect("B_ext_1x", B_ext.comp(0).region(1).average(), 1, tol) expect("B_ext_1y", B_ext.comp(1).region(1).average(), 2, tol) expect("B_ext_1z", 
B_ext.comp(2).region(1).average(), 3, tol) mumax3-3.10/test/axes.mx3000066400000000000000000000014021371432437400152240ustar00rootroot00000000000000/* Not really a test. Makes a snapshot showing the orientation of our axes. */ setgridsize(400, 300, 1) c := 1e-9 setcellsize(c, c, c) S := 8 * c I := rect(S, 5*S) X := I.rotz(35*pi/180).add(I.rotz(-35*pi/180)) X = X.transl(12*S, -2*S, 0) I = rect(S, 2.5*S) Y := I.transl(0, 1.2*S, 0).rotz(35*pi/180) Y = Y.add(I.transl(0, 1.2*S, 0).rotz(-35*pi/180)) Y = Y.add(I.transl(0, -1.2*S, 0)) Y = Y.transl(-2*S, 13*S, 0) head := rect(2*S, 2*S).rotz(pi/4).intersect(yrange(0, inf)) I = rect(S, 8*S).transl(0, 4*S, 0).add(head.transl(0, 8*S, 0)) O := circle(2*S) axes := X.add(Y).add(I).add(I.rotz(-pi/2)).add(O) disk := circle(12*S).transl(10*S, 10*S, 0) m = uniform(0, 0, 1) m.SetInShape(disk,vortex(1, 1).transl(10*S, 10*S, 0)) setgeom(axes.add(disk)) snapshot(m) mumax3-3.10/test/b_ext_add.mx3000066400000000000000000000025071371432437400162040ustar00rootroot00000000000000/* Construct an external field mask on-the fly and add to B_ext. Test that it actually gets added. 
*/ Nx := 128 Ny := 64 setgridsize(Nx, Ny, 1) c := 5e-9 setcellsize(c, c, c) Msat = 800e3 Aex = 13e-12 m = uniform(1, 0, 0) // External field mask corresponding to Oersted field of a long perpendicular wire mask := newSlice(3, Nx, Ny, 1) // wire position, diameter and current direction wireX := 0e-9 wireY := 0e-9 wireDiam := 50e-9 current := vector(0, 0, 1) // construct mask for i:=0; i= wireDiam{ // outside wire b = r.cross(current).mul(mu0 / (2*pi*pow(r.len(), 2))) }else{ // inside wire relDist := r.len() / wireDiam innerCurrent := current.mul(relDist * relDist) b = r.cross(innerCurrent).mul(mu0 / (2*pi*pow(r.len(), 2))) } mask.set(0, i, j, 0, b[0]) mask.set(1, i, j, 0, b[1]) mask.set(2, i, j, 0, b[2]) } } // Add mask with multiplier B_ext=vector(0,0,-1e-4) B_ext.add(mask, 0.1) save(B_ext) relax() //alpha = 3 //RunWhile(MaxTorque > 1e-4) //steps(1000) // Check whether m has become a vortex. // Not adding mask results in uniform state. tol := 1e-3 expectv("m", m.average(), vector(0, 0, -0.001), tol) mumax3-3.10/test/bubbleshiftpos.mx3000066400000000000000000000012021371432437400172750ustar00rootroot00000000000000SetMesh(128, 128, 1, 1e-9,1e-9,0.4e-9, 1, 1, 0) Msat =580e3 Aex = 15e-12 enabledemag=false alpha = 0.1 Ku1=0.59e6 anisU=vector(0,0,1) Dind=0.0034089785 shiftregions=true maxregion:=255 seed:=17 ext_makegrains(10e-9, maxregion, seed) for i:=0; i errx { errx = ex } if ey > erry { erry = ey } } } } ii := Nx / 2 jj := Ny / 2 kk := Nz / 2 r := index2coord(ii, jj, kk) x := r.X() ref := sin(kx*x) * cos(kx*x) ref2 := (cos(kx*x)*cos(kx*x) - sin(kx*x)*sin(kx*x)) val := Fmel.get(0, ii, jj, kk) val2 := Fmel.get(1, ii, jj, kk) ex := abs(val*pre - ref) ey := abs(val2*pre2 - ref2) ERRMAX := 0.004 ERRMIN := 3e-6 expect("max((ΔFmel).x)", errx, 0., ERRMAX) expect("max((ΔFmel).y)", erry, 0., ERRMAX) expect("((ΔFmel).x)@center", ex, 0., ERRMIN) expect("((ΔFmel).y)@center", ey, 0., ERRMIN) 
mumax3-3.10/test/mel-force-dmxdx-dmzdx.mx3000066400000000000000000000030711371432437400204070ustar00rootroot00000000000000Nx := 1024 Ny := 16 Nz := 16 csX := 0.5e-9 csY := 1e-9 csZ := 2e-9 setgridsize(Nx, Ny, Nz) setcellsize(csX, csY, csZ) Msat = 1000e3 B1 = 1000e3 B2 = 1000e3 enabledemag = false m = uniform(1.0, 1.0, 1.0) mask := newVectorMask(Nx, Ny, Nz) Period := 32e-9 kx := 2 * pi / Period pre := 1.0 / (2.0 * B1.Average() * kx) pre2 := 1.0 / (B2.Average() * kx) for ii := 0; ii < Nx; ii++ { for jj := 0; jj < Ny; jj++ { for kk := 0; kk < Nz; kk++ { r := index2coord(ii, jj, kk) x := r.X() mx := sin(kx * x) mz := cos(kx * x) mask.setVector(ii, jj, kk, vector(mx, 0.0, mz)) } } } m.setarray(mask) save(m) save(F_mel) Fmel := F_mel.HostCopy() errx := -inf erry := -inf for ii := 0; ii < Nx; ii++ { for jj := 0; jj < Ny; jj++ { for kk := 0; kk < Nz; kk++ { r := index2coord(ii, jj, kk) x := r.X() ref := sin(kx*x) * cos(kx*x) ref2 := (cos(kx*x)*cos(kx*x) - sin(kx*x)*sin(kx*x)) val := Fmel.get(0, ii, jj, kk) val2 := Fmel.get(2, ii, jj, kk) ex := abs(val*pre - ref) ey := abs(val2*pre2 - ref2) if ex > errx { errx = ex } if ey > erry { erry = ey } } } } ii := Nx / 2 jj := Ny / 2 kk := Nz / 2 r := index2coord(ii, jj, kk) x := r.X() ref := sin(kx*x) * cos(kx*x) ref2 := (cos(kx*x)*cos(kx*x) - sin(kx*x)*sin(kx*x)) val := Fmel.get(0, ii, jj, kk) val2 := Fmel.get(2, ii, jj, kk) ex := abs(val*pre - ref) ey := abs(val2*pre2 - ref2) ERRMAX := 0.004 ERRMIN := 3e-6 expect("max(Fmel,x)", errx, 0., ERRMAX) expect("max(Fmel,y)", erry, 0., ERRMAX) expect("min(Fmel,x)", ex, 0., ERRMIN) expect("min(Fmel,y)", ey, 0., ERRMIN) mumax3-3.10/test/mel-force-dmydy-dmxdy.mx3000066400000000000000000000030711371432437400204100ustar00rootroot00000000000000Nx := 16 Ny := 1024 Nz := 16 csX := 1e-9 csY := 0.5e-9 csZ := 2e-9 setgridsize(Nx, Ny, Nz) setcellsize(csX, csY, csZ) Msat = 1000e3 B1 = 1000e3 B2 = 1000e3 enabledemag = false m = uniform(1.0, 1.0, 1.0) mask := newVectorMask(Nx, Ny, Nz) Period := 32e-9 kx 
:= 2 * pi / Period pre := 1.0 / (2.0 * B1.Average() * kx) pre2 := 1.0 / (B2.Average() * kx) for ii := 0; ii < Nx; ii++ { for jj := 0; jj < Ny; jj++ { for kk := 0; kk < Nz; kk++ { r := index2coord(ii, jj, kk) y := r.Y() mx := cos(kx * y) my := sin(kx * y) mask.setVector(ii, jj, kk, vector(mx, my, 0.0)) } } } m.setarray(mask) save(m) save(F_mel) Fmel := F_mel.HostCopy() errx := -inf erry := -inf for ii := 0; ii < Nx; ii++ { for jj := 0; jj < Ny; jj++ { for kk := 0; kk < Nz; kk++ { r := index2coord(ii, jj, kk) y := r.Y() ref := sin(kx*y) * cos(kx*y) ref2 := (cos(kx*y)*cos(kx*y) - sin(kx*y)*sin(kx*y)) val := Fmel.get(1, ii, jj, kk) val2 := Fmel.get(0, ii, jj, kk) ex := abs(val*pre - ref) ey := abs(val2*pre2 - ref2) if ex > errx { errx = ex } if ey > erry { erry = ey } } } } ii := Nx / 2 jj := Ny / 2 kk := Nz / 2 r := index2coord(ii, jj, kk) y := r.Y() ref := sin(kx*y) * cos(kx*y) ref2 := (cos(kx*y)*cos(kx*y) - sin(kx*y)*sin(kx*y)) val := Fmel.get(1, ii, jj, kk) val2 := Fmel.get(0, ii, jj, kk) ex := abs(val*pre - ref) ey := abs(val2*pre2 - ref2) ERRMAX := 0.004 ERRMIN := 3e-6 expect("max(Fmel,x)", errx, 0., ERRMAX) expect("max(Fmel,y)", erry, 0., ERRMAX) expect("min(Fmel,x)", ex, 0., ERRMIN) expect("min(Fmel,y)", ey, 0., ERRMIN) mumax3-3.10/test/mel-force-dmydy-dmzdy.mx3000066400000000000000000000030711371432437400204120ustar00rootroot00000000000000Nx := 16 Ny := 1024 Nz := 16 csX := 1e-9 csY := 0.5e-9 csZ := 2e-9 setgridsize(Nx, Ny, Nz) setcellsize(csX, csY, csZ) Msat = 1000e3 B1 = 1000e3 B2 = 1000e3 enabledemag = false m = uniform(1.0, 1.0, 1.0) mask := newVectorMask(Nx, Ny, Nz) Period := 32e-9 kx := 2 * pi / Period pre := 1.0 / (2.0 * B1.Average() * kx) pre2 := 1.0 / (B2.Average() * kx) for ii := 0; ii < Nx; ii++ { for jj := 0; jj < Ny; jj++ { for kk := 0; kk < Nz; kk++ { r := index2coord(ii, jj, kk) y := r.Y() mz := cos(kx * y) my := sin(kx * y) mask.setVector(ii, jj, kk, vector(0.0, my, mz)) } } } m.setarray(mask) save(m) save(F_mel) Fmel := F_mel.HostCopy() errx 
:= -inf erry := -inf for ii := 0; ii < Nx; ii++ { for jj := 0; jj < Ny; jj++ { for kk := 0; kk < Nz; kk++ { r := index2coord(ii, jj, kk) y := r.Y() ref := sin(kx*y) * cos(kx*y) ref2 := (cos(kx*y)*cos(kx*y) - sin(kx*y)*sin(kx*y)) val := Fmel.get(1, ii, jj, kk) val2 := Fmel.get(2, ii, jj, kk) ex := abs(val*pre - ref) ey := abs(val2*pre2 - ref2) if ex > errx { errx = ex } if ey > erry { erry = ey } } } } ii := Nx / 2 jj := Ny / 2 kk := Nz / 2 r := index2coord(ii, jj, kk) y := r.Y() ref := sin(kx*y) * cos(kx*y) ref2 := (cos(kx*y)*cos(kx*y) - sin(kx*y)*sin(kx*y)) val := Fmel.get(1, ii, jj, kk) val2 := Fmel.get(2, ii, jj, kk) ex := abs(val*pre - ref) ey := abs(val2*pre2 - ref2) ERRMAX := 0.004 ERRMIN := 3e-6 expect("max(Fmel,x)", errx, 0., ERRMAX) expect("max(Fmel,y)", erry, 0., ERRMAX) expect("min(Fmel,x)", ex, 0., ERRMIN) expect("min(Fmel,y)", ey, 0., ERRMIN) mumax3-3.10/test/mel-force-dmzdz-dmxdz.mx3000066400000000000000000000030711371432437400204130ustar00rootroot00000000000000Nx := 16 Ny := 16 Nz := 1024 csX := 1e-9 csY := 2e-9 csZ := 0.5e-9 setgridsize(Nx, Ny, Nz) setcellsize(csX, csY, csZ) Msat = 1000e3 B1 = 1000e3 B2 = 1000e3 enabledemag = false m = uniform(1.0, 1.0, 1.0) mask := newVectorMask(Nx, Ny, Nz) Period := 32e-9 kx := 2 * pi / Period pre := 1.0 / (2.0 * B1.Average() * kx) pre2 := 1.0 / (B2.Average() * kx) for ii := 0; ii < Nx; ii++ { for jj := 0; jj < Ny; jj++ { for kk := 0; kk < Nz; kk++ { r := index2coord(ii, jj, kk) z := r.Z() mx := cos(kx * z) mz := sin(kx * z) mask.setVector(ii, jj, kk, vector(mx, 0.0, mz)) } } } m.setarray(mask) save(m) save(F_mel) Fmel := F_mel.HostCopy() errx := -inf errz := -inf for ii := 0; ii < Nx; ii++ { for jj := 0; jj < Ny; jj++ { for kk := 0; kk < Nz; kk++ { r := index2coord(ii, jj, kk) z := r.Z() ref := sin(kx*z) * cos(kx*z) ref2 := (cos(kx*z)*cos(kx*z) - sin(kx*z)*sin(kx*z)) val := Fmel.get(2, ii, jj, kk) val2 := Fmel.get(0, ii, jj, kk) ez := abs(val*pre - ref) ex := abs(val2*pre2 - ref2) if ex > errx { errx = ex } if ez 
> errz { errz = ez } } } } ii := Nx / 2 jj := Ny / 2 kk := Nz / 2 r := index2coord(ii, jj, kk) z := r.Z() ref := sin(kx*z) * cos(kx*z) ref2 := (cos(kx*z)*cos(kx*z) - sin(kx*z)*sin(kx*z)) val := Fmel.get(2, ii, jj, kk) val2 := Fmel.get(0, ii, jj, kk) ez := abs(val*pre - ref) ex := abs(val2*pre2 - ref2) ERRMAX := 0.004 ERRMIN := 3e-6 expect("max(Fmel,x)", errx, 0., ERRMAX) expect("max(Fmel,z)", errz, 0., ERRMAX) expect("min(Fmel,x)", ex, 0., ERRMIN) expect("min(Fmel,z)", ez, 0., ERRMIN) mumax3-3.10/test/mel-force-dmzdz-dmydz.mx3000066400000000000000000000030711371432437400204140ustar00rootroot00000000000000Nx := 16 Ny := 16 Nz := 1024 csX := 1e-9 csY := 2e-9 csZ := 0.5e-9 setgridsize(Nx, Ny, Nz) setcellsize(csX, csY, csZ) Msat = 1000e3 B1 = 1000e3 B2 = 1000e3 enabledemag = false m = uniform(1.0, 1.0, 1.0) mask := newVectorMask(Nx, Ny, Nz) Period := 32e-9 kx := 2 * pi / Period pre := 1.0 / (2.0 * B1.Average() * kx) pre2 := 1.0 / (B2.Average() * kx) for ii := 0; ii < Nx; ii++ { for jj := 0; jj < Ny; jj++ { for kk := 0; kk < Nz; kk++ { r := index2coord(ii, jj, kk) z := r.Z() my := cos(kx * z) mz := sin(kx * z) mask.setVector(ii, jj, kk, vector(0.0, my, mz)) } } } m.setarray(mask) save(m) save(F_mel) Fmel := F_mel.HostCopy() errx := -inf errz := -inf for ii := 0; ii < Nx; ii++ { for jj := 0; jj < Ny; jj++ { for kk := 0; kk < Nz; kk++ { r := index2coord(ii, jj, kk) z := r.Z() ref := sin(kx*z) * cos(kx*z) ref2 := (cos(kx*z)*cos(kx*z) - sin(kx*z)*sin(kx*z)) val := Fmel.get(2, ii, jj, kk) val2 := Fmel.get(1, ii, jj, kk) ez := abs(val*pre - ref) ex := abs(val2*pre2 - ref2) if ex > errx { errx = ex } if ez > errz { errz = ez } } } } ii := Nx / 2 jj := Ny / 2 kk := Nz / 2 r := index2coord(ii, jj, kk) z := r.Z() ref := sin(kx*z) * cos(kx*z) ref2 := (cos(kx*z)*cos(kx*z) - sin(kx*z)*sin(kx*z)) val := Fmel.get(2, ii, jj, kk) val2 := Fmel.get(1, ii, jj, kk) ez := abs(val*pre - ref) ex := abs(val2*pre2 - ref2) ERRMAX := 0.004 ERRMIN := 3e-6 expect("max(Fmel,x)", errx, 0., ERRMAX) 
expect("max(Fmel,z)", errz, 0., ERRMAX) expect("min(Fmel,x)", ex, 0., ERRMIN) expect("min(Fmel,z)", ez, 0., ERRMIN) mumax3-3.10/test/memleak.mx3000066400000000000000000000003601371432437400157010ustar00rootroot00000000000000/* Test for memory leaks when resizing. */ c := 5e-9 SetGridSize(1024, 1024, 2) SetCellSize(c, c, c) m = uniform(1,1,1) Aex = 13e-12 Msat = 800e3 for i:=0; i<30; i++{ SetGridSize(128, 128+2*i, 1) SetCellSize(c, c, c) Steps(100) } mumax3-3.10/test/mfm.mx3000066400000000000000000000005021371432437400150430ustar00rootroot00000000000000/* Save an mfm image */ Nx := 400 Ny := 400 c := 2e-9 setpbc(2, 0, 0) setgridsize(Nx, Ny, 1) setcellsize(c, c, c) Msat = 1/mu0 setgeom(rect(400e-9, 400e-9).transl(-400e-9, 0, 0)) m = uniform(1,0,0.1) MFMLift = 50e-9 save(MFM) expect("mfm", MFM.Average(), -3.28009e7, 1e5) // golden value with mumax3.9.1 2015-12-05 mumax3-3.10/test/minimizer-stress.mx3000066400000000000000000000004471371432437400176200ustar00rootroot00000000000000// stress-test the minimizer for memleaks etc. 
setgridsize(128, 128, 1) setcellsize(3e-9, 3e-9 , 3e-9) Aex = 13e-12 Msat = 800e3 alpha = 0.02 m = uniform(-1, .1, 0) MinimizerStop = 1e-3 // make it go fast for B:=0.0; B<10e-3; B+=0.1e-3{ B_ext = vector(B, 0, 0) minimize() } mumax3-3.10/test/minimizer.mx3000066400000000000000000000005061371432437400162730ustar00rootroot00000000000000/* */ setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) Msat = 1600e3 Aex = 13e-12 E_total.get() // try to trigger bad lex2 Msat = 800e3 alpha = 0.02 m = uniform(1, .1, 0) minimize() save(m) TOL := 1e-5 expectv("m", m.average(), vector(0.9669684171676636, 0.1252732127904892, 0), TOL) mumax3-3.10/test/nodemagspins.mx3000066400000000000000000000030141371432437400167540ustar00rootroot00000000000000 setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) Msat = 800e3 Aex = 13e-12 m = uniform(1, 1, 1) LEFT := 0 RIGHT := 1 AIR := 2 defregion(LEFT, xrange(-inf, 0)) defregion(RIGHT, xrange(0, inf)) TOL := 1e-4 expectv("L", B_demag.Region(LEFT).Average(), vector(-0.005299, -0.022127, -0.552248), TOL) expectv("R", B_demag.Region(RIGHT).Average(), vector(-0.005299, -0.022127, -0.552248), TOL) NoDemagSpins.SetRegion(LEFT, 0) NoDemagSpins.SetRegion(RIGHT, 1) expectv("L", B_demag.Region(LEFT).Average(), vector(-0.0103589, -0.02122757, -0.5480939), TOL) expectv("R", B_demag.Region(RIGHT).Average(), vector(0, 0, 0), 0) NoDemagSpins.SetRegion(LEFT, 1) expectv("L", B_demag.Region(LEFT).Average(), vector(0, 0, 0), 0) expectv("R", B_demag.Region(RIGHT).Average(), vector(0, 0, 0), 0) FixDt = 1e-14 steps(1000) // test for memleaks airgap := yrange(0, inf) SetGeom(airgap.inverse()) defregion(AIR, airgap) expectv("L", B_demag.Region(LEFT).Average(), vector(0, 0, 0), 0) expectv("R", B_demag.Region(RIGHT).Average(), vector(0, 0, 0), 0) expectv("A", B_demag.Region(AIR).Average(), vector(0, 0, 0), 0) NoDemagSpins.SetRegion(LEFT, 0) NoDemagSpins.SetRegion(RIGHT, 0) expectv("L", B_demag.Region(LEFT).Average(), vector(-0.0046074, -0.0391400, 
-0.53594726), TOL) expectv("R", B_demag.Region(RIGHT).Average(), vector(-0.0046074, -0.0391400, -0.53594726), TOL) expectv("A", B_demag.Region(AIR).Average(), vector(-0.000692, 0.0170129, -0.01630170), TOL) NoDemagSpins.SetRegion(LEFT, 1) steps(1000) // test for memleaks mumax3-3.10/test/openbc.mx3000066400000000000000000000052211371432437400155350ustar00rootroot00000000000000/* Test if the canting at the end of a nanowire corresponds to the 1D analytical result if open boundary conditions are used. This test is similar to the standard test proposed in arXiv:1803.11174 If the nanowire consists only out of one row of cells, the analytical canting matches with the simulated canting if open (or periodic) boundary conditions are used. The Neumann BC yields a different canting. This does not mean that Neumann BC are wrong. To be more precise, the analytical result, as well as the numerical results obtained with open and Neumann BC are slightly wrong because the width of the nanowire is not taken into account properly. */ ncell := 1024 cs := 0.05 verbose := false DMI := 0.9 *4/pi // 90% of critical DMI strength Dind = DMI enabledemag = false AnisU = vector(0,0,1) Aex = 1. Ku1 = 1. Msat = 1. 
minimizerstop = 1e-7 // --- Along the x direction -------------------------------------------------------- setgridsize(ncell,1,1) setcellsize(cs,cs,cs) m = uniform(0,0,1) // ANALYTIC theta0 := asin(DMI/2) cant_analytic := 2*atan(exp(-cs/2)*tan(theta0/2)) // shift towards center of the cell // NEUMANN BC openbc = false minimize() cant_neumann := atan2( m.getcell(0,0,0)[0], m.getcell(0,0,0)[2] ) // OPEN BC openbc = true minimize() cant_open := atan2( m.getcell(0,0,0)[0], m.getcell(0,0,0)[2] ) // PERIODIC BC openbc = false setpbc(0,1,0) minimize() cant_periodic := atan2( m.getcell(0,0,0)[0], m.getcell(0,0,0)[2] ) if verbose { print("Neumann: ", cant_neumann) print("Open: ", cant_open) print("Periodic: ", cant_periodic) print("Analytic: ", cant_analytic) } expect("edge canting", cant_open, cant_analytic, 1e-3) expect("edge canting", cant_open, cant_periodic, 1e-5) // --- Along the y direction -------------------------------------------------------- setgridsize(1,ncell,1) setcellsize(cs,cs,cs) setpbc(0,0,0) m = uniform(0,0,1) // ANALYTIC theta0 = asin(DMI/2) cant_analytic = 2*atan(exp(-cs/2)*tan(theta0/2)) // shift towards center of the cell // NEUMANN BC openbc = false minimize() cant_neumann = atan2( m.getcell(0,0,0)[1], m.getcell(0,0,0)[2] ) // OPEN BC openbc = true minimize() cant_open = atan2( m.getcell(0,0,0)[1], m.getcell(0,0,0)[2] ) // PERIODIC BC openbc = false setpbc(1,0,0) minimize() cant_periodic = atan2( m.getcell(0,0,0)[1], m.getcell(0,0,0)[2] ) if verbose { print("Neumann: ", cant_neumann) print("Open: ", cant_open) print("Periodic: ", cant_periodic) print("Analytic: ", cant_analytic) } expect("edge canting", cant_open, cant_analytic, 1e-3) expect("edge canting", cant_open, cant_periodic, 1e-5) mumax3-3.10/test/outputformat.mx3000066400000000000000000000007551371432437400170470ustar00rootroot00000000000000/* Save data with different output formats. 
*/ setgridsize(32, 8, 2) setcellsize(1e-9, 1e-9, 1e-9) Msat = 800e3 Aex = 13e-12 m = uniform(1, .1, 0) steps(1) outputformat = OVF1_TEXT saveas(m, sprintf("mumax_ovf1_text")) outputformat = OVF1_BINARY saveas(m, sprintf("mumax_ovf1_binary")) outputformat = OVF2_TEXT saveas(m, sprintf("mumax_ovf2_text")) step=4 outputformat = OVF2_BINARY saveas(m, sprintf("mumax_ovf2_binary")) step=5 outputformat = DUMP saveas(m, sprintf("mumax_dump"))mumax3-3.10/test/pbc1.mx3000066400000000000000000000011161371432437400151130ustar00rootroot00000000000000/* Test correct wrapping for exchange with PBC. */ setpbc(2, 2, 0) Nx := 128 Ny := Nx/2 setgridsize(Nx, Ny, 1) c := 5e-9 setcellsize(c, c, c) Msat = 1000e3 Aex = 10e-12 alpha = 1 r := rect(Nx/2*c, Ny/2*c) deltax := Nx/2*c deltay := Ny/2*c setgeom( r.repeat(Nx*c, Ny*c, 0).transl(deltax, deltay, 0) ) m = uniform(1, 0.1, 0.01) save(m) run(1e-9) save(m) expectv("m", m.average(), vector(0.89947968, 0.23352228, -0.00010287), 1e-3) setgeom( r.repeat(Nx*c, Ny*c, 0)) m = uniform(1, 0.1, 0.01) run(1e-9) expectv("m", m.average(), vector(0.89947968, 0.23352228, -0.00010287), 1e-3) mumax3-3.10/test/pbc2.mustfail000066400000000000000000000012541371432437400162340ustar00rootroot00000000000000// test correct wrapping for DMI setpbc(1, 0, 0) Nx := 128 Ny := Nx/2 setgridsize(Nx, Ny, 1) c := 5e-9 setcellsize(c, c, c) Msat = 1000e3 Aex = 10e-12 alpha = 1 defregion(1, yrange(-inf, 0)) r := rect(Nx/2*c, Ny/2*c) dx := Nx/2*c dy := Ny/2*c m = vortex(1, 1) save(m) run(1e-9) save(m) m1 := average(m.region(1)) expect("mx", m1[0], 0.8146139383, 1e-5) expect("my", m1[1], -0.0001059844, 1e-5) expect("mz", m1[2], -0.0003330991, 1e-5) m = vortex(1, 1) Dex = 1e-20 // should not make a difference save(m) run(1e-9) save(m) m1 = average(m.region(1)) expect("mx", m1[0], 0.8146139383, 1e-5) expect("my", m1[1], -0.0001059844, 1e-5) expect("mz", m1[2], -0.0003330991, 1e-5) 
mumax3-3.10/test/quantities.mx3000066400000000000000000000006731371432437400164630ustar00rootroot00000000000000/* Test quantity averages. */ Nx := 200 Ny := 100 setgridsize(Nx, Ny, 1) c := 5e-9 setcellsize(c, c, c) setGeom(circle(Nx*c)) defRegion(0, rect(Nx*c, Nx*c)) defRegion(1, rect(Nx*c, Nx*c/4)) Msat = 1e6 expect("Msat", Msat.Average(), 1e6, 1) Aex = 20e-12 expect("Aex", Aex.Average(), 20e-12, 1e-16) alpha = 1 expect("alpha", alpha.Average(), 1, 1e-5) anisC1 = vector(1, 2, 3) expectV("anisC1", anisC1.Average(), vector(1, 2, 3), 1e-5) mumax3-3.10/test/racetest.bash000077500000000000000000000004251371432437400163130ustar00rootroot00000000000000#! /bin/bash # builds with -race and runs tests with browser open. set -e go install -race github.com/mumax/3/cmd/mumax3 google-chrome http://localhost:35367 & for f in *.mx3; do mumax3 $f done go install github.com/mumax/3/cmd/mumax3 # re-build without race detector mumax3-3.10/test/randregions.todo000066400000000000000000000003231371432437400170360ustar00rootroot00000000000000 setgridsize(30, 20, 2) setcellsize(1e-9, 1e-9, 1e-9) for x:=0; x<30; x++{ for y:=0; y<20; y++{ randRegion := randInt(256) for z:=0; z<2; z++{ defRegionCell(randRegion, x, y, z) } } } save(regions) mumax3-3.10/test/reduced.todo000066400000000000000000000007031371432437400161400ustar00rootroot00000000000000// test derived output quantities Nx := 64 Ny := 64 Nz := 2 c := 4e-9 SetGridSize(Nx, Ny, Nz) SetCellSize(c, c, c ) DefRegion(0, xrange(-inf, 0)) DefRegion(1, xrange(0, inf)) m = Vortex(1, 1) tableadd(m.avgregion(0)) tableadd(m.avgregion(1)) tablesave() expect("m0x", m0.getVec()[0], 0 , 1e-6) expect("m0y", m0.getVec()[1], -0.323819, 1e-6) expect("m1x", m1.getVec()[0], 0 , 1e-6) expect("m1y", m1.getVec()[1], 0.323819, 1e-6) mumax3-3.10/test/regions.mx3000066400000000000000000000007661371432437400157460ustar00rootroot00000000000000/* Regions test */ N := 128 setgridsize(N, N, 1) setcellsize(1e-9, 1e-9, 1e-9) d := 1e-9 * N defregion(0, 
circle(d).inverse()) defregion(1, circle(d)) defregion(2, circle(d/2)) defregion(3, circle(d/2).transl(d/3, 0, 0)) save(regions) Ku1.setregion(1, 500) Ku1.setregion(2, 0) Ku1.setregion(3, -500) save(Ku1) Msat = 800e-3 // sets it everywhere Msat.setregion(2, 500e3) save(Msat) m = uniform(1, 0, 0) save(m) regions.setcell(5, 6, 0, 123) expect("getcell", regions.getcell(5, 6, 0), 123, 0) mumax3-3.10/test/regions2.todo000066400000000000000000000011031371432437400162500ustar00rootroot00000000000000// Regression test for Jonathan's pinning simulations d := 3.125e-9 h := 5e-9 SetGridsize(64, 64, 2) SetCellsize(d, d, h) setgeom( cylinder(190e-9, 190e-9).transl(0.5*d,0.5*d,0) ) Msat = 860e3 Aex = 13e-12 defregion(2, cuboid(3*d,3*d,h).transl(0.5*d,0.5*d,0.5*h)) Msat.setregion(2, 0.0) les := sqrt(2*13e-12/860e3) overrideLex(1,2, les) alpha = 0.8 m = vortex(1, -1).transl(5e-9, 0, 0) run(1e-9) m_ := average(m) expect("mx", m_[0], 1.043081283569336e-07, 1e-6) expect("my", m_[1], -3.903551260009408e-09, 1e-6) expect("mz", m_[2], -0.0037489673122763634, 1e-6) mumax3-3.10/test/regionsload.mx3000066400000000000000000000010251371432437400165730ustar00rootroot00000000000000/* Test regions.loadfile */ setgridsize(256, 128, 2) setcellsize(5e-9, 5e-9, 5e-9) defregion(1, circle(500e-9)) defregion(2, circle(500e-9).inverse()) m.setRegion(1, uniform(1, 0, 0)) m.setRegion(2, uniform(0, 1, 0)) // save regions to disk saveAs(regions, "regions.ovf") // overwrite regions defregion(256, universe()) // re-load previous state from disk regions.loadFile("testdata/regions.ovf") // check expectv("m1", m.region(1).average(), vector(1, 0, 0), 1e-5) expectv("m2", m.region(2).average(), vector(0, 1, 0), 1e-5) mumax3-3.10/test/regression001.mx3000066400000000000000000000042041371432437400166700ustar00rootroot00000000000000/* Regression test for bug reported by Ezio Iacocca okt 2013 failed on GTX TITAN, presumably because of race conditions. Fixed since xyz branch. 
*/ Nx := 256 Ny := 256 Nz := 1 SetGridsize(Nx, Ny, Nz) CellX := 1500e-9/Nx CellY := 1500e-9/Ny CellZ := 5e-9 SetCellsize(CellX, CellY, CellZ) // DEFINE CONTACT posX := 0.e-9 posY := 0.e-9 Rc := 35.e-9 I := -30e-3 // DEFINE BOUNDARY ABC := 200e-9 factor := 100 // DEFINE APPLIED FIELD Happ := 0 // 0.965 Angle := 70 * pi / 180 // SET MATERIAL PARAMETERS alpha_v := 0.01 Msat_v := 700e3 MsatP := 1200e3 Aex_v := 10e-12 // ADJUST SOLVER MaxDt = 1e-12 MinDt = 1e-18 //MaxErr = 1e-5 // SET GEOMETRY AND REGIONS defregion(1, Ellipse(Rc, Rc).transl(posX,posY,0)) defregion(2, Ellipse(Rc, Rc).transl(posX,posY,0).inverse()) alpha_reg := ceil(ABC/CellX) for i:=0; i= Rc{ b = r.cross(current).mul(mu0 / (2*pi*pow(r.len(),2)) ) }else{ b = r.cross(current).mul(mu0 / (2*pi*pow(Rc,2)) ) } for k:=0; k 5e-4) alpha = 0.01 A := 10*1.0e-3 f := 6.88495e9 mSavetime := 1.0/(8.0*f) t0 := t B_ext = vector( 0.0, 0.0, A*sin(2*pi*f*(t-t0))*(1.0 - exp(-2*pi*f*(t-t0))) ) spot := circle(diameter/50.0).transl(diameter/5.0, diameter/7.0, 0.0) defregion(1, spot) run(0.5e-9) tol := 0.01 expectv("m", m.region(1).average(), vector(-0.450, 0.8869750613257998, 0.0804117293584914), tol) run(0.5e-9) expectv("m", m.region(1).average(), vector( -0.7082362402053106, 0.6975658053443545, -0.1068760781061081), tol) mumax3-3.10/test/regression003.mx3000066400000000000000000000007661371432437400167030ustar00rootroot00000000000000/* Regression test for bug with region > 127 which was turned into negative number. 
*/ setgridsize(512, 512, 1) c := 4e-9 setcellsize(c, c, c) m = uniform(1,0,0) ext_makegrains(40e-9, 255, 0) defregion(1, circle(200*c)) defregion(2, circle(100*c)) defregion(128, circle(50*c)) expect("0", Ku1.average(), 0, 0) expect("0", Kc1.average(), 0, 0) expectv("0", AnisC1.average(), vector(0, 0, 0), 0) expectv("0", AnisC2.average(), vector(0, 0, 0), 0) expectv("0", AnisU.average(), vector(0, 0, 0), 0) mumax3-3.10/test/regression006.mx3000066400000000000000000000017361371432437400167040ustar00rootroot00000000000000/* Regression test for buggy intergrain exchange reported by Jonathan Leliaert */ setgridsize(256, 64, 1) setcellsize(3.125e-9, 3.125e-9, 15e-9) Msat = 860e3 Aex = 13e-12 Xi = 0.0 m = vortexwall(1,-1,1,1).scale(1.5, 1, 1) // Remove surface charges from left (mx=1) and right (mx=-1) sides to mimic infinitely long wire. We have to specify the region (0) at the boundaries. ext_rmSurfaceCharge(0, 1, -1) // Relax Alpha = 3 // high damping for fast relax RunWhile(MaxTorque > 1e-3) // relax Alpha = 0.02 // restore normal damping // Set post-step function that centers simulation window on domain wall. ext_centerwall(0) ext_makegrains(4e-8, 10, 0) for i :=0; i<10;i+=1{ for j :=i+1; j<10;j+=1{ ext_ScaleExchange(i, j, 0.5) } } // Run the simulation with current through the sample Pol =0.56 J = vector(-10e12 , 0, 0) Run(0.5e-9) expectv("m", m.average(), vector(0.013319221, 0.018588585, 0.00010564699186943471), 1e-4) mumax3-3.10/test/regression007.mx3000066400000000000000000000015201371432437400166740ustar00rootroot00000000000000/* Reported by Jonathan Lelieart Zhang-Li used to give NaN's because of division by Bsat */ setgridsize(128, 32, 1) setcellsize(3.125e-9, 3.125e-9, 10e-9) Msat = 860e3 Aex = 13e-12 Xi = 0.0 m = twodomain(1,0,0,0,1,0,-1,0,0) // Remove surface charges from left (mx=1) and right (mx=-1) sides to mimic infinitely long wire. We have to specify the region (0) at the boundaries. 
ext_rmSurfaceCharge(0, 1, -1) ext_makegrains(1e-9,25,0) msat.setregion(12, 0) // Set post-step function that centers simulation window on domain wall. ext_centerwall(0) tableadd(ext_dwpos) // domain wall position tableautosave(10e-12) autosave(m, 2e-10) autosave(regions, 2e-10) // Run the simulation with current through the sample Pol =0.56 J = vector(-8e12 , 0, 0) Steps(10) expectv("m", m.average(), vector(0.00035, 0.032942, 0), 1e-3) mumax3-3.10/test/regression008.todo000066400000000000000000000021051371432437400171330ustar00rootroot00000000000000// NaN's when slonczewski parameters not OK Nx := 64 Ny := 32 Nz := 1 sX := 160e-9 sY := 80e-9 sZ := 5e-9 setgridsize(Nx, Ny, Nz) setcellsize(sX/Nx, sY/Ny, sZ/Nz) Msat = 800e3 Aex = 13e-12 alpha = 1 maxdt = 1e-12 // Set a initial magnetisation to C-state m = uniform(1, 1, 0.001) run(3e-9) alpha = 0.01 lambda = 1 Pol = 0.5669 epsilonprime = 0 pdeg := 1 prad := pdeg * pi / 180.0 px := cos(prad) py := sin(prad) fixedlayer = vector(px, py, 0) Jtot := -0.008 // total current in amps carea := sX * sY jc := Jtot / carea print("Current density is: " , jc) J = vector(0, 0, jc) //autosave(m, 1e-12) tableautosave(10e-12) save(sttorque) m0 := m.average() expect("mx", m0[0], 0.9586285, 1e-3) expect("my", m0[1], 0.2039081, 1e-3) expect("mz", m0[2], 0.0000000, 1e-3) run(1e-9) m1 := m.average() expect("mx", m1[0], 0.6440672, 1e-3) expect("my", m1[1], 0.5133638, 1e-3) expect("mz", m1[2],-0.1571195, 1e-3) run(1e-9) m2 := m.average() expect("mx", m2[0], -0.9574024, 1e3) expect("my", m2[1], 0.2069624, 1e3) expect("mz", m2[2], 0.0096634, 1e3) mumax3-3.10/test/regression009.mx3000066400000000000000000000004011371432437400166730ustar00rootroot00000000000000/* Regression test for crash on shift after pbc change */ setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) Msat = 800e3 Aex = 13e-12 m = uniform(1, .1, 0) alpha = 3 steps(10) shift(1) setpbc(1, 1, 0) //steps(10) shift(1) 
mumax3-3.10/test/regression010.mx3000066400000000000000000000016561371432437400167000ustar00rootroot00000000000000/* Regression test for crash after resize when adding to exciations. */ Nx := 32 Ny := 16 setgridsize(Nx, Ny, 1) c := 5e-9 setcellsize(c, c, c) Msat = 800e3 Aex = 13e-12 m = uniform(1, 0, 0) mask := newSlice(3, Nx, Ny, 1) wireX := 0e-9 wireY := 0e-9 wireDiam := 50e-9 current := vector(0, 0, 1) for i:=0; i= wireDiam{ b = r.cross(current).mul(mu0 / (2*pi*pow(r.len(), 2))) }else{ relDist := r.len() / wireDiam innerCurrent := current.mul(relDist * relDist) b = r.cross(innerCurrent).mul(mu0 / (2*pi*pow(r.len(), 2))) } mask.set(0, i, j, 0, b[0]) mask.set(1, i, j, 0, b[1]) mask.set(2, i, j, 0, b[2]) } } B_ext.add(mask, 0.1) alpha = 3 steps(1) setgridsize(Nx*2, Ny, 1) setcellsize(2*c, 2*c, 2*c) steps(1) mumax3-3.10/test/regression011.mx3000066400000000000000000000005041371432437400166700ustar00rootroot00000000000000/* Binary boolean operations were missing (reported by Gabriel Chaves). Unary ! was missing (reported by Raffaele Pellicelli) */ setgridsize(1, 1, 1) setcellsize(1, 1, 1) t0 := 0 timestep := 1 Tq := 1e-3 b := (((t-t0) < timestep ) && ( (Tq > 1e-4) || ( t < 1.0e-13))) if !true{ expect("should_not_happen", 0, 1, 0) } mumax3-3.10/test/regression013.mx3000066400000000000000000000011361371432437400166740ustar00rootroot00000000000000/* Shift should not act on regions if ShiftRegios == false Reported by Ben Van de Wiele. 
*/ Ny := 64 Nx := 16*Ny c := 3.125e-9 setgridsize(Nx, Ny, 1) setcellsize(c, c, 15e-9) Msat = 1.7e6 Aex = 21e-12 Alpha = 0.015 Ku1 = 25e4 DefRegion( 1, XRange(-inf, 0) ) DefRegion( 2, XRange(0, inf) ) anisU.SetRegion(1, vector(1, 1, 0)) anisU.SetRegion(2, vector(1, -1, 0)) m = uniform(1, 0, 0) // shift the magnetization every 1e-10 sec shiftM = True shiftRegions = False for i:=0; i<320; i++{ shift(-1) } expect("regions", regions.Average(), 1.5, 0) mumax3-3.10/test/regression015.mx3000066400000000000000000000007071371432437400167010ustar00rootroot00000000000000// Regression test for vortex config yielding NaN in case of an odd number of cells. // Reported by Mathias Helsen. Nx := 257 Ny := 257 Nz := 1 setgridsize(Nx, Ny, Nz) thickness := 10.0e-9 diameter := 1.0e-6 setcellsize(diameter/Nx, diameter/Ny, thickness/Nz) setgeom(cylinder(diameter, thickness)) mask := newslice(3, Nx, Ny, Nz) mask.set(2, 127, 127, 0, 1.0) m = vortex(1, 1) expectv("m", m.average(), vector(0, 0, 0.0001), 1e-2) mumax3-3.10/test/regression016.mx3000066400000000000000000000005571371432437400167050ustar00rootroot00000000000000// test httpfs saveas: should not create "http:/..." directory // reported by Jonathan Leliaert. Gsize := 20 Ared := 0.75 Delta_x := 450.0/128.0 SetGridsize(128, 128, 1) SetCellsize(Delta_x*1e-9, Delta_x*1e-9, 30e-9) alpha = 0.02 Msat = 860e3 Aex = 13e-12 m = Vortex(1, 1) Mstring := sprint("m_Relaxed_Ared=", Ared, "_Gsize=", Gsize, ".ovf") saveAs(m, Mstring) mumax3-3.10/test/regression017.mx3000066400000000000000000000005561371432437400167050ustar00rootroot00000000000000/* Test for tripping the solver with an instable equilibrium start magnetization. Reported by Mykola Dvornik, Feb. 2015. */ setgridsize(5, 1, 1) setcellsize(1e-9, 1e-9, 1e-9) Msat = 1000e3 Aex = 10e-12 anisU = vector(0, 0, 1) Ku1 = 1.3e6 alpha = 0.02 m = uniform(0.0, 0.0, 1.0) relax() m = uniform(0.0, 0.0, -1.0) relax() // used to fail here. 
mumax3-3.10/test/regression018.mx3000066400000000000000000000031321371432437400166770ustar00rootroot00000000000000/* Problem reported by Gabriel Chaves: CURAND error + httpfs touch did not close file */ SetSolver(2) FixDt = 1e-14 Bval := 0.0 mos := -1.0 kval := 278607 fwidth := 6.000000e-08 fellipseaxis := 2.400000e-07 flength := 2.400000e-07 ThermSeed(128) Temp =300.000000 fthickness := 1.7e-9 sr := fellipseaxis/fwidth xcellsize := 2.00e-9 ycellsize := 2.00e-9 zcellsize := 1.70e-9 nx := floor(flength/xcellsize) ny := floor(fwidth/ycellsize) print("grid:", nx, ny) setgridsize(128, 32, 1) setcellsize(xcellsize, ycellsize, zcellsize) /* Input parameters */ Msat = 817647 Aex = 13e-12 alpha = 0.01 // Gilbert damping constant B_ext = vector(0.0,0,0) // Applied field in T a := cylinder(fellipseaxis,zcellsize).scale(1.0,1.0/sr,1.0).transl((fellipseaxis-flength)/2.0,0e-9, 0 ) b := cylinder(fellipseaxis,zcellsize).scale(1.0,1.0/sr,1.0).transl((-fellipseaxis+flength)/2.0,0e-9, 0 ) c := cuboid(flength-fellipseaxis,fwidth,zcellsize).scale(1.0,1.0,1.0).transl(0e-9,0e-9, 0 ) setgeom( a.add(b).add(c)) defregion(1,a.add(b).add(c)) m.setRegion(1, uniform(-1, 1e-3, 1e-3)) FixedLayer.setRegion(1,vector (1,0,0)) anisU = vector (0,0,1) tableAdd(Ku1) tableAdd(m) tableAdd(E_total) tableAdd(E_anis) tableAdd(E_exch) tableAdd(E_Zeeman) tableAdd(E_demag) tableAdd(B_ext) tableAdd(FixedLayer.Region(1)) B_ext = vector(Bval,0, 0) timestep := 10.0e-9 //autosave(m, 1e-12) //tableadd(dt) //tableautosave(1e-15) for i:=0; i<1000; i++ { steps(1) fprintln("dt.txt", Neval.get(), dt) } Ku1 = kval //m.loadfile("initialmstate.out/m000000.ovf") m.set(uniform(1,1,1)) B_ext = vector (Bval,0.0,0.0) count := 0 //run(timestep) mumax3-3.10/test/relax-stress.mx3000066400000000000000000000003171371432437400167240ustar00rootroot00000000000000// stress-test relax for memleaks etc. 
setgridsize(128, 128, 1) setcellsize(3e-9, 3e-9 , 3e-9) Aex = 13e-12 Msat = 800e3 alpha = 0.02 m = uniform(-1, .1, 0) for i:=0; i<100; i++{ relax() } mumax3-3.10/test/repeat.mx3000066400000000000000000000004151371432437400155470ustar00rootroot00000000000000/* Test Shape.repeat() */ N := 128 c := 5e-9 SetGridSize(N, N, 1) SetCellSize(c, c, c) d := 20*c SetGeom(circle(d).repeat(2*d, d, 0)) m = uniform(1, 0, 0) Msat = 800e3 Aex = 13e-12 alpha = 1 steps(1000) expectv("m", m.average(), vector(0.525, 0, 0), 1e-2) mumax3-3.10/test/resize.mx3000066400000000000000000000007711371432437400155750ustar00rootroot00000000000000/* Test magnetization stretch upon resize. */ setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) Msat = 800e3 Aex = 13e-12 m = uniform(1, .1, 0) alpha = 3 //MaxErr = 1e-4 RunWhile(MaxTorque > 1e-4) run(1e-9) expectv("m", m.average(), vector(0.96696, 0.12528, 0), 1e-3) setgridsize(128*2, 32*2, 2) setcellsize(500e-9/(128*2), 125e-9/(32*2), 3e-9/2) expectv("m", m.average(), vector(0.96696, 0.12528, 0), 1e-3) run(1e-9) expectv("m", m.average(), vector(0.96696, 0.12528, 0), 1e-3) mumax3-3.10/test/rk4.mx3000066400000000000000000000007461371432437400147760ustar00rootroot00000000000000/* Test rk4 solver with fixed and adaptive timestep */ setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) Msat = 800e3 Aex = 13e-12 m = uniform(1, .1, 0) // relax (rk4) setsolver(4) alpha = 3 run(1e-9) // reversal FixDt = 1e-14 alpha = 0.02 B_ext = vector(-24.6E-3, 4.3E-3, 0) run(0.1e-9) expectv("m", m.average(), vector(0.59293, 0.63278, -0.08152), 1e-3) FixDt=0 run(0.4e-9) expectv("m", m.average(), vector(-0.8740, -0.2713, 0.01795), 1e-3) mumax3-3.10/test/rk4temperature.mx3000066400000000000000000000005601371432437400172460ustar00rootroot00000000000000/* Test rk4 solver with temperature */ c := 10e-9 setcellsize(c, c, c) setgridsize(256, 256, 1) Msat = 1e6 Aex = 0 alpha = 0.1 AnisU = vector(0, 0, 1) m = uniform(0, 0, 1) fixdt = 2e-12 Temp = 100 Ku1 = 1e4/4 
enabledemag = false setsolver(4) run(5e-9) print(m.average()) expectv("m", m.average(), vector(-0.000, -0.000, 0.8366), 1e-3) mumax3-3.10/test/rk56.mx3000066400000000000000000000007501371432437400150600ustar00rootroot00000000000000/* Test rk56 solver with fixed and adaptive timestep */ setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) Msat = 800e3 Aex = 13e-12 m = uniform(1, .1, 0) // relax (rk56) setsolver(6) alpha = 3 run(1e-9) // reversal FixDt = 1e-14 alpha = 0.02 B_ext = vector(-24.6E-3, 4.3E-3, 0) run(0.1e-9) expectv("m", m.average(), vector(0.59293, 0.63278, -0.08152), 1e-3) FixDt=0 run(0.4e-9) expectv("m", m.average(), vector(-0.8740, -0.2713, 0.01795), 1e-3) mumax3-3.10/test/rkky.mx3000066400000000000000000000007061371432437400152520ustar00rootroot00000000000000/* Test setting an absolute RKKY coupling. */ N := 10 setgridsize(N, N, 2) c := 1e-9 setcellsize(c, c, c) defRegion(0, layer(0)) defRegion(1, layer(1)) Msat = 1e6 Aex = 10e-12 RKKY := -1e-3 // 1mJ/m2 scale := (RKKY * c) / (2 * Aex.Average()) print("scale:", scale) ext_scaleExchange(0, 1, scale) m = uniform(1, 0, 0) E0 := E_total.Get() m.setRegion(0, uniform(0, 1, 0)) E1 := E_total.Get() expect("delta E", E1 - E0, RKKY * N*N*c*c, 1e-20) mumax3-3.10/test/rmsurfacecharge.mx3000066400000000000000000000004511371432437400174300ustar00rootroot00000000000000 setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) Aex = 2*13e-12 Msat = 2*800e3 // about 2T m = uniform(1, .1, 0) ext_rmsurfacecharge(0, 1, 1) minimize() tol := 1e-5 // without surface charges, ground state should be uniform. expectv("m", m.Average(), vector(1, 0, 0), tol) mumax3-3.10/test/roughness.mx3000066400000000000000000000004711371432437400163060ustar00rootroot00000000000000/* Regression test for Roughness shape. 
*/ Nx := 500 Ny := 200 Nz := 10 c := 1e-9 setgridsize(Nx, Ny, Nz) setcellsize(c, c, c) setgeom(ellipse(Nx*c, Ny*c).intersect(grainroughness(40e-9, 0, 5e-9, 123))) expect("volume", geom.average(), 0.577983, 1e-4) // this volume was OK so should not change unexpectedly mumax3-3.10/test/run.bash000077500000000000000000000001561371432437400153060ustar00rootroot00000000000000#! /bin/bash set -e mumax3 -vet *.mx3 mumax3 -paranoid=false -failfast -cache /tmp -f -http "" *.go *.mx3 mumax3-3.10/test/runwhile.mx3000066400000000000000000000005531371432437400161270ustar00rootroot00000000000000/* Test for runwhile(). */ N := 20 c := 4e-9 SetGridSize(N, N, 1) SetCellSize(c, c, c) SetGeom(circle(N*c)) m = uniform(1, 0, 0) Msat = 800e3 Aex = 13e-12 alpha= 1 RunWhile(MaxTorque > 1e-3) B_ext = vector(0, 0.01, 1) RunWhile(m.comp(1).average() < 0.5) expect("my", m.comp(1).average(), 0.53, 0.02) // unavoidable overshoot because of large time steps mumax3-3.10/test/savefile.mx3000066400000000000000000000027131371432437400160700ustar00rootroot00000000000000/* Test loading external data files. 
*/ randommagseed(666) Nx := 128 Ny := 64 Nz := 32 setgridsize(Nx, Ny, Nz) c := 5e-9 setcellsize(c, c, c) Msat = 800e3 Aex = 13e-12 m = randommag() mref := m.GetCell(99, 50, 24) outputformat = OVF1_TEXT saveas(m, sprintf("ovf1t")) outputformat = OVF1_BINARY saveas(m, sprintf("ovf1b")) outputformat = OVF2_TEXT saveas(m, sprintf("ovf2t")) outputformat = OVF2_BINARY saveas(m, sprintf("ovf2b")) outputformat = DUMP saveas(m, sprintf("dump")) flush() // make sure output is saved before loading s := loadfile("savefile.out/ovf1t.ovf") expect("elem", s.get(0, 99, 50, 24), mref[0], 0) expect("elem", s.get(1, 99, 50, 24), mref[1], 0) expect("elem", s.get(2, 99, 50, 24), mref[2], 0) s = loadfile("savefile.out/ovf1b.ovf") expect("elem", s.get(0, 99, 50, 24), mref[0], 0) expect("elem", s.get(1, 99, 50, 24), mref[1], 0) expect("elem", s.get(2, 99, 50, 24), mref[2], 0) s = loadfile("savefile.out/ovf2t.ovf") expect("elem", s.get(0, 99, 50, 24), mref[0], 0) expect("elem", s.get(1, 99, 50, 24), mref[1], 0) expect("elem", s.get(2, 99, 50, 24), mref[2], 0) s = loadfile("savefile.out/ovf2b.ovf") expect("elem", s.get(0, 99, 50, 24), mref[0], 0) expect("elem", s.get(1, 99, 50, 24), mref[1], 0) expect("elem", s.get(2, 99, 50, 24), mref[2], 0) s = loadfile("savefile.out/dump.dump") expect("elem", s.get(0, 99, 50, 24), mref[0], 0) expect("elem", s.get(1, 99, 50, 24), mref[1], 0) expect("elem", s.get(2, 99, 50, 24), mref[2], 0) mumax3-3.10/test/shiftgeom.mx3000066400000000000000000000007311371432437400162550ustar00rootroot00000000000000/* Test that shifting introduces the correct geometry from the sides. 
*/ setgridsize(512, 64, 1) c := 2e-9 setcellsize(c, c, c) m = twodomain(0,0,1, 0,1,0, 0,0,-1) ext_centerwall(2) Msat = 1100e3 Aex = 16e-12 AnisU = vector(0, 0, 1) Ku1 = 1.27E6 alpha = 1 setgeom(circle(80*c).repeat(64*c,0,0)) ShiftMagL = vector(0,0,1) ShiftMagR = vector(0,0,-1) for i:=0; i<100; i++{ shift(1) } tol := 1e-5 expectv("m", m.average(), vector(0, 0.008466859, 0.38830321), tol) mumax3-3.10/test/shiftgeom.todo000066400000000000000000000021231371432437400165100ustar00rootroot00000000000000 Nx := 128 Ny := 64 Nz := 1 c := 3e-9 setgridsize(Nx, Ny, Nz) setcellsize(c, c, 10e-9) wx := Nx * c wy := Ny * c Msat = 860e3 Aex = 13e-12 Xi = 0.1 m = twodomain(1,0,0, 0,1,0, -1,0,0) notch := rect(25e-9, 25e-9).RotZ(45*pi/180).transl(0, wy/2, 0).inverse() setGeom(notch.Repeat(wx/2, 0, 0)) save(geom) // Remove surface charges from left (mx=1) and right (mx=-1) sides to mimic infinitely long wire. We have to specify the region (0) at the boundaries. ext_rmSurfaceCharge(0, 1, -1) // Relax Alpha = 3 // high damping for fast relax RunWhile(MaxTorque > 1e-3) // relax Alpha = 0.02 // restore normal damping // Set post-step function that centers simulation window on domain wall. ext_centerWall(0) // keep m[0] (= m_x) close to zero // Schedule output autosave(m, 100e-12) tableadd(ext_dwpos) // domain wall position tableadd(ext_dwspeed) // domain wall speed tableautosave(10e-12) // Run the simulation with current through the sample pol = 0.56 J = vector(-10e12, 0, 0) Run(0.4e-9) expectv("m", m.average(), vector(0.0256580703, 0.3362270342, 0.0306527231), 1e-5) mumax3-3.10/test/shiftgrains.mx3000066400000000000000000000010241371432437400166050ustar00rootroot00000000000000/* Test shift of voronoi cells: new ones should enter from the side. 
*/ setgridsize(512, 64, 1) c := 4e-9 setcellsize(c, c, c) m = twodomain(1,0,0, 0,1,0, -1,0,0) Aex = 13e-12 Msat = 800e3 ext_makegrains(40e-9, 255, 0) ext_rmsurfacecharge(0, 1, -1) ext_centerwall(0) alpha = 1 for i:=0; i<255; i++{ Aex.SetRegion(i, 13e-12 + randNorm()*1.3e-12) Msat.SetRegion(i, 800e3 + randNorm()*80e3) } for i:=0; i< 56; i++{ shift(-1) } expect("aex", Aex.Average(), 1.298444e-11, 1e-16) expect("msat", MSat.Average(), 797898, 1) mumax3-3.10/test/slice.mx3000066400000000000000000000006171371432437400153720ustar00rootroot00000000000000/* Test basic slice operations. */ setgridsize(1, 1, 1) setcellsize(1, 1, 1) a := NewSlice(1, 10, 4, 2) a.set(0, 0, 0, 0, 0) a.set(0, 1, 0, 0, 100) a.set(0, 0, 1, 0, 10) a.set(0, 0, 0, 1, 1) a.set(0, 2, 3, 1, 231) print(a) expect("1,0,0", a.get(0, 1, 0, 0), 100, 0) expect("0,1,0", a.get(0, 0, 1, 0), 10, 0) expect("0,0,1", a.get(0, 0, 0, 1), 1, 0) expect("2,3,1", a.get(0, 2, 3, 1), 231, 0) mumax3-3.10/test/smoothdemag.mx3000066400000000000000000000016641371432437400166050ustar00rootroot00000000000000/* Test if smoothed geometry is handled correctly by demag calculation and Mfull. */ Nx := 100 Ny := 50 setgridsize(Nx, Ny, 2) c := 1e-9 setcellsize(c, c , c) // reference demag field for full slab msat = 1/mu0 m = uniform(1, 0, 0) expectv("B_demag", b_demag.average(), vector(-0.02648, 0, 0), 1e-3) expectv("M_full", m_full.average(), vector(1/mu0, 0, 0), 1) edgesmooth = 8 slab := cuboid(Nx*c, Ny*c, c) // Only one layer: should give about half the demag field setgeom(slab.transl(0, 0, -c/2)) m = uniform(1, 0, 0) expectv("B_demag", b_demag.average(), vector(-0.02648 / 2, 0, 0), 1e-3) expectv("M_full", m_full.average(), vector(0.5/mu0, 0, 0), 1) // Same magnet, but halfway between the two layers. // Without smoothed geometry, this fails miserably. 
setgeom(slab.transl(0, 0, 0)) m = uniform(1, 0, 0) expectv("B_demag", b_demag.average(), vector(-0.02648 / 2, 0, 0), 1e-3) expectv("M_full", m_full.average(), vector(0.5/mu0, 0, 0), 1) mumax3-3.10/test/smoothgeom.mx3000066400000000000000000000005701371432437400164520ustar00rootroot00000000000000/* Test smooth geometry by evaluating the surface of a circle. */ N := 10 setgridsize(2*N, N, 1) c := 1e-9 setcellsize(c, 2*c, 3*c) disk := circle(2*N*c) // No smoothing: rough approximation edgesmooth=0 setgeom(disk) expect("staircase", geom.Average(), 0.8, 1e-3) // Smoothing: good approximation edgesmooth=10 setgeom(disk) expect("smooth", geom.Average(), pi/4, 1e-3) mumax3-3.10/test/snapshot.mx3000066400000000000000000000003611371432437400161260ustar00rootroot00000000000000/* Test saving snapshots on-the-fly. */ setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) Msat = 800e3 Aex = 13e-12 m = uniform(1, .1, 0) snapshot(m) snapshotformat = "png" snapshot(m) snapshotformat = "gif" snapshot(m) mumax3-3.10/test/source.todo000066400000000000000000000001601371432437400160220ustar00rootroot00000000000000/* Test source("inputfile"). */ source("sourcetest") // defines a variable expect("variable", variable, 2, 0) mumax3-3.10/test/sourcetest000066400000000000000000000000441371432437400157570ustar00rootroot00000000000000// read by source.txt variable := 2 mumax3-3.10/test/sp4_angles.mx3000066400000000000000000000011671371432437400163330ustar00rootroot00000000000000/* Test angle output. 
*/ // geometry setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) // material Msat = 800e3 TOL := 1e-5 m = uniform(1, 0, 0) expect("ext_phi", ext_phi.average(), 0, TOL) expect("ext_theta", ext_theta.average(), pi/2.0, TOL) m = uniform(0, 1, 0) expect("ext_phi", ext_phi.average(), pi/2.0, TOL) expect("ext_theta", ext_theta.average(), pi/2.0, TOL) m = uniform(1, 1, 0) expect("ext_phi", ext_phi.average(), pi/4.0, TOL) expect("ext_theta", ext_theta.average(), pi/2.0, TOL) m = uniform(1, 0, 1) expect("ext_phi", ext_phi.average(), 0, TOL) expect("ext_theta", ext_theta.average(), pi/4.0, TOL)mumax3-3.10/test/standardproblem4-3d-minimize.mx3000066400000000000000000000005331371432437400216600ustar00rootroot00000000000000/* Micromagnetic standard problem 4, with 3D discretization and minimize instead of relax */ // geometry setgridsize(128, 32, 2) setcellsize(500e-9/128, 125e-9/32, 3e-9/2) // material Msat = 800e3 Aex = 13e-12 m = uniform(1, .1, 0) // minimize minimize() TOL := 1e-3 expectv("m", m.average(), vector(0.9669952, 0.12521563, 0), TOL) mumax3-3.10/test/standardproblem4-3d.mx3000066400000000000000000000007711371432437400200450ustar00rootroot00000000000000/* Micromagnetic standard problem 4, with 3D discretization */ // geometry setgridsize(128, 32, 2) setcellsize(500e-9/128, 125e-9/32, 3e-9/2) // material Msat = 800e3 Aex = 13e-12 m = uniform(1, .1, 0) // relax relax() TOL := 1e-4 expectv("m", m.average(), vector(0.9669952392578125, 0.12521563470363617, 0), TOL) // reversal alpha = 0.02 B_ext = vector(-24.6E-3, 4.3E-3, 0) run(1e-9) expectv("m", m.average(), vector(-0.9846296310424805, 0.1256464719772339, 0.04335508495569229), TOL) mumax3-3.10/test/standardproblem4.go000066400000000000000000000010071371432437400174300ustar00rootroot00000000000000//+build ignore package main import ( . 
"github.com/mumax/3/engine" ) func main() { defer InitAndClose()() SetGridSize(128, 32, 1) SetCellSize(500e-9/128, 125e-9/32, 3e-9) Msat.Set(800e3) Aex.Set(13e-12) Alpha.Set(0.02) M.Set(Uniform(1, .1, 0)) AutoSave(&M, 100e-12) TableAdd(MaxTorque) TableAutoSave(5e-12) Relax() // reversal B_ext.Set(Vector(-24.6e-3, 4.3e-3, 0)) Run(1e-9) TOL := 1e-3 ExpectV("m", M.Average(), Vector(-0.9846124053001404, 0.12604089081287384, 0.04327124357223511), TOL) } mumax3-3.10/test/standardproblem4.mx3000066400000000000000000000012551371432437400175370ustar00rootroot00000000000000/* Micromagnetic standard problem 4 (a) according to http://www.ctcms.nist.gov/~rdm/mumag.org.html */ // geometry setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) // material Msat = 1600e3 Aex = 13e-12 E_total.get() // try to trigger bad lex2 Msat = 800e3 alpha = 0.02 m = uniform(1, .1, 0) // relax relax() save(m) TOL := 1e-5 expectv("m", m.average(), vector(0.9669684171676636, 0.1252732127904892, 0), TOL) // reversal tableautosave(10e-12) autosave(m, 100e-12) autosnapshot(m, 50e-12) B_ext = vector(-24.6E-3, 4.3E-3, 0) run(1e-9) expectv("m", m.average(), vector(-0.9846124053001404, 0.12604089081287384, 0.04327124357223511), TOL) mumax3-3.10/test/standardproblem4_rk56.mx3000066400000000000000000000013041371432437400204010ustar00rootroot00000000000000/* Micromagnetic standard problem 4 (a) according to http://www.ctcms.nist.gov/~rdm/mumag.org.html */ // geometry setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) // material Msat = 1600e3 Aex = 13e-12 E_total.get() // try to trigger bad lex2 Msat = 800e3 alpha = 0.02 m = uniform(1, .1, 0) //solver setsolver(6) // relax relax() save(m) TOL := 1e-5 expectv("m", m.average(), vector(0.9669684171676636, 0.1252732127904892, 0), TOL) // reversal tableautosave(10e-12) autosave(m, 100e-12) autosnapshot(m, 50e-12) B_ext = vector(-24.6E-3, 4.3E-3, 0) run(1e-9) expectv("m", m.average(), vector(-0.9846124053001404, 0.12604089081287384, 
0.04327124357223511), TOL) mumax3-3.10/test/standardproblem4b.mx3000066400000000000000000000010531371432437400176750ustar00rootroot00000000000000/* Micromagnetic standard problem 4 (b) according to http://www.ctcms.nist.gov/~rdm/mumag.org.html */ // geometry setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) // material Msat = 800e3 Aex = 13e-12 alpha = 0.02 m = uniform(1, .1, 0) // relax relax() TOL := 1e-3 expectv("m", m.average(), vector(0.9669684171676636, 0.1252732276916504, 0), TOL) // reversal B_ext = vector(-35.5E-3, -6.3E-3, 0) run(1e-9) expectv("m", m.average(), vector(-0.9692331552505493, -0.12039542198181152, -0.0053076110780239105), TOL) mumax3-3.10/test/standardproblem5.mx3000066400000000000000000000007221371432437400175360ustar00rootroot00000000000000/* Micromagnetic standard problem 5 as proposed by M. Najafi et al., JAP 105, 113914 (2009). Reference solution by mumax2. */ setgridsize(32, 32, 4) setcellsize(100e-9/32, 100e-9/32, 10e-9/4) Msat = 800e3 Aex = 13e-12 m = vortex(1, 1) alpha = 0.1 relax() xi = 0.05 J = vector(1e12, 0, 0) Pol = 1 run(1e-9) m_ := m.average() expect("mx", m_[0], -0.23479773, 1e-4) expect("my", m_[1], -0.09453578, 1e-4) expect("mz", m_[2], 0.02296375, 1e-4) mumax3-3.10/test/standardproblem5a.mx3000066400000000000000000000017771371432437400177120ustar00rootroot00000000000000/* Test for reversal driven by Slonczewski STT. 
*/ Nx := 64 Ny := 32 Nz := 1 sX := 160e-9 sY := 80e-9 sZ := 5e-9 setgridsize(Nx, Ny, Nz) setcellsize(sX/Nx, sY/Ny, sZ/Nz) Msat = 800e3 Aex = 13e-12 alpha = 3 // Set a initial magnetisation to C-state m = uniform(1, 1, 0.001) minimize() alpha = 0.01 lambda = 1 Pol = 0.5669 epsilonprime = 0 pdeg := 1 prad := pdeg * pi / 180.0 px := cos(prad) py := sin(prad) fixedlayer = vector(px, py, 0) Jtot := -0.008 // total current in amps carea := sX * sY jc := Jtot / carea print("Current density is: " , jc) J = vector(0, 0, jc) //autosave(m, 1e-12) tableautosave(10e-12) save(sttorque) TOL := 1e-3 m0 := m.average() expectv("m", m0, vector(0.9586266279220581, 0.20391345024108887, 0), TOL) run(1e-9) m1 := m.average() expectv("m", m1, vector(0.6440994739532471, 0.5131782293319702, -0.1569230705499649), TOL) run(1e-9) m2 := m.average() expectv("m", m2, vector(-0.957406222820282, 0.20698121190071106, 0.009677470661699772), TOL) mumax3-3.10/test/std5b.mif000066400000000000000000000027251371432437400153620ustar00rootroot00000000000000# MIF 2.1 set pi [expr 4*atan(1.0)] set mu0 [expr 4*$pi*1e-7] set basename std5b Parameter total_current -0.006 ;# Current in amps Parameter Ms 800e3 Parameter A 13e-12 Parameter Polarization 0.5669 Parameter Lambda 2.0 Parameter eps_prime 1.0 Parameter mp_theta 20.0 ;# Direction of mp, in degrees set mp_theta [expr {$mp_theta*$pi/180.}] set length 160e-9 set width 80e-9 set thick 5e-9 set Nx 64 set Ny 32 set Nz 1 set current_area [expr {$length*$width}] set current_density [expr {$total_current/$current_area}] set xcellsize [expr {$length/$Nx}] set ycellsize [expr {$width/$Ny}] set zcellsize [expr {$thick/$Nz}] Specify Oxs_BoxAtlas:atlas [subst { xrange {0 $length} yrange {0 $width} zrange {0 $thick} }] Specify Oxs_RectangularMesh:mesh [subst { cellsize {$xcellsize $ycellsize $zcellsize} atlas :atlas }] # Exchange Specify Oxs_UniformExchange [subst { A $A }] # Demag Specify Oxs_Demag {} # Evolver Specify Oxs_SpinXferEvolve:evolve [subst { alpha 0.01 
start_dm 0.00001 min_timestep 1e-18 max_timestep 1e-12 mp {[expr {cos($mp_theta)}] [expr {sin($mp_theta)}] 0} J $current_density P $Polarization Lambda $Lambda eps_prime $eps_prime }] # Driver Specify Oxs_TimeDriver [subst { basename [list ${basename}-eprime1] evolver :evolve stopping_time 1e-9 stage_count 1 mesh :mesh Ms $Ms m0 {1.0 0.1 0.0} }] Destination archive mmArchive # Schedule Oxs_TimeDriver::Magnetization archive Stage 1 Schedule DataTable archive Stage 1 mumax3-3.10/test/std5b.mx3000066400000000000000000000026221371432437400153120ustar00rootroot00000000000000/* Test for Slonczewski STT with nonzero epsilonprime. Standard solution was verified against oommf 2.0a0 */ setsolver(5) DemagAccuracy = 29 total_current := -0.006 // Current in amps Msat = 800e3 Aex = 13e-12 Pol = 0.5669 Lambda = 2 EpsilonPrime = 1.0 gammaLL = 2.211e5 / mu0 mp_theta := pi * 20.0 / 180 // Direction of mp length := 160e-9 width := 80e-9 thick := 5e-9 Nx := 64 Ny := 32 Nz := 1 current_area := length * width current_density := total_current / current_area J = vector(0, 0, current_density) SetGridSize(Nx, Ny, Nz) SetCellSize(length/Nx, width/Ny, thick/Nz) alpha = 0.01 FixedLayer = vector(cos(mp_theta), sin(mp_theta), 0) m = uniform(1, .1, 0) minDt = 1e-18 maxDt = 1e-12 maxErr = 1e-6 run(1e-9) save(m) TOL := 1e-5 print(m.average()) // # ODT 1.0 // ## Desc: Data from vector field file std5b-eprime1-Oxs_TimeDriver-Spin-00-0002233.omf // ## Active volume: (0,0,0) x (1.6e-07,8e-08,5e-09) // ## Cell size: 2.5e-09 x 2.5e-09 x 5e-09 // ## Cells in active volume: 2048 // # // # Table Start // # Title: Average across active volume // # Columns:\ // # m_x m_y m_z // # Units:\ // # {} {} {} // -0.953323544827031 -0.279948071263437 0.00528093273512820 // # Table End m1 := -0.953323544827031 m2 := -0.279948071263437 m3 := 0.00528093273512820 expectv("m", m.average(), vector(m1, m2, m3), TOL) mumax3-3.10/test/std5c.mx3000066400000000000000000000013761371432437400153200ustar00rootroot00000000000000/* 
Test for Slonczewski STT with zero epsilonprime. Standard solution was verified against oommf 1.2a5bis */ total_current := -0.006 // Current in amps Msat = 800e3 Aex = 13e-12 Pol = 0.5669 Lambda = 2 EpsilonPrime = 0 mp_theta := pi*20/180 // Direction of mp length := 160e-9 width := 80e-9 thick := 5e-9 Nx := 64 Ny := 32 Nz := 1 current_area := length*width current_density := total_current/current_area J = vector(0, 0, current_density) SetGridSize(Nx, Ny, Nz) SetCellSize(length/Nx, width/Ny, thick/Nz) alpha = 0.01 FixedLayer = vector(cos(mp_theta), sin(mp_theta), 0) m = uniform(1,0,0) tableautosave(10e-12) run(0.5e-9) TOL := 1e-5 expectv("m", m.average(), vector(0.905612587928772, -0.2860856354236603, -0.011005667969584465), TOL) mumax3-3.10/test/std5c3d.mx3000066400000000000000000000014761371432437400155500ustar00rootroot00000000000000/* Test for Slonczewski STT with 3D discretization Standard solution was verified against oommf 1.2a5bis with 2D discretization */ total_current := -0.006 // Current in amps Msat = 800e3 Aex = 13e-12 Pol = 0.5669 Lambda = 2 EpsilonPrime = 0 mp_theta := pi*20/180 // Direction of mp length := 160e-9 width := 80e-9 thick := 5e-9 Nx := 64 Ny := 32 Nz := 2 current_area := length*width current_density := total_current/current_area J = vector(0, 0, current_density) SetGridSize(Nx, Ny, Nz) SetCellSize(length/Nx, width/Ny, thick/Nz) alpha = 0.01 FixedLayer = vector(cos(mp_theta), sin(mp_theta), 0) m = uniform(1,0,0) tableautosave(10e-12) run(0.5e-9) TOL := 1e-2 // a small 2D/3D difference is acceptable expectv("m", m.average(), vector(0.905612587928772, -0.2860856354236603, -0.011005667969584465), TOL) mumax3-3.10/test/steppercache.mx3000066400000000000000000000012361371432437400167370ustar00rootroot00000000000000/* Test if the stepper cache buffers are flushed when starting a new run */ setgridsize(1,1,1) setcellsize(1,1,1) msat = 1000e3 alpha = 0.1 B_ext = vector(0, 0, 0.05) setsolver(5) // This steppers uses a cache buffer fixdt = 1e-12 m = 
uniform(1,0,0) steps(1) rotation_wanted := acos( vector(1,0,0).dot(m.average()) ) m = uniform(-1,0,0) steps(1) rotation := acos( vector(-1,0,0).dot(m.average()) ) // Note that the rotation angle should be the same for the two cases for symmetry reasons. // However, if the cache buffers are not removed, this will lead to an erroneous result in // the second case. expect("m rotation angle", rotation, rotation_wanted, 1e-5)mumax3-3.10/test/table.mx3000066400000000000000000000002521371432437400153550ustar00rootroot00000000000000/* Test adding user-defined variable to table. */ setmesh(2, 1, 1, 1, 1, 1, 0, 0, 0) f := 0 tableAddVar(f, "f", "Hz") tableSave() f = 1 tableSave() f = 2 tableSave() mumax3-3.10/test/tempminimize.mx3000066400000000000000000000003651371432437400170020ustar00rootroot00000000000000// test minimize with finite temperature setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) Msat = 800e3 Aex = 13e-12 alpha = 0.02 m = uniform(1, .1, 0) temp = 300 minimize() expect("T", temp.Average(), 300, 0) mumax3-3.10/test/temprelax.mx3000066400000000000000000000003571371432437400162750ustar00rootroot00000000000000// test relax with finite temperature setgridsize(128, 32, 1) setcellsize(500e-9/128, 125e-9/32, 3e-9) Msat = 800e3 Aex = 13e-12 alpha = 0.02 m = uniform(1, .1, 0) temp = 300 relax() expect("T", temp.Average(), 300, 0) mumax3-3.10/test/testdata/000077500000000000000000000000001371432437400154475ustar00rootroot00000000000000mumax3-3.10/test/testdata/binary4.dump000066400000000000000000000726501371432437400177200ustar00rootroot00000000000000#dump002ds4$'PʾM^ :q_mSpLIH(J]MxQTXUamzHCϾNc_A@Rܻ3 ;Mʾ8)>u쑑w⍾ ی9Sre ľ]I 53e:z=~` 6H[09zؾ:Ǿ驻 |ɱi7(OžEо)ᾲ`#6+m)W~ 澜۾־jRվ.׾(۾O`=zjB,RH}dfvy}}t>lɼZHG4i"T<Pe9Yr !2VHkG`Zr~{~n_C@MĔ;`+9C%2 A"-@xCfLNWVc%(oN x/6})huYYXmIntjQaZR1VUkKY_͢gApnw]|z.yh#~o~~~~&!j2ysomn)r0v~i}|eE|{b{L!{z(zV{E{({{'I|8|2}}G.~~i"b4YHz~~9~q}}|$'|]{{xw{)q{5{{{4||~|zc}z}K~Ŀ~/; ~(~X~}})}|y|y|4f|f|z|||}j}}!~R~~HПeݶwI[~^v~1~} }S}{}4n}p}~}}<}}=~~t~jiS 
<ˠzP$C~9~~~ki~W~ P~*R~g^~nt~~ٺ~~S2 ԦosXt?*(>~_~~&0B{bL|z+KttkfFdflwu[$MQ;ɮ5ɭ.L#~],c-[  ?m68ZRu8F/191?|.#-nk%vDK!Jo4((@iLgR)u+iz|W/7{U \.F'/f}m2O_q}KItdnb],PD0mCeE1*5VUyV0 xtsgG'@u[A( /blWC1" 4S~{j[MB;9<FXr{oe]YX\fvytrsxsoot|ycWSU]iwZ@3/4?Obwa2+D_z8;^blht1]3O@t"XEc1"0VmIb%j(o*t X 4VxTHf![s[L9/q}{#rb6Y%Cb9>nA.@5+Q*g5{*@/X/Q[:"1RK!:Aگ~,v7/n$\ߙƔ8G #XmW$Wzw;wzŧe')70%ϫ {pjhkr~76*>" qyr qtL~Hlv,[xv=[u? Pmw`:AL};u+~e}|m|$|u1|?|s<}~4~Yud.o>~1}{ywmvȐuZqu5v2(w2xzr|,-~l+94}{ xzso9lCj2jHkTmRbo0rb4uzjՔo s wzt}y9}wpay;vA +zp L]Q ='29=0FG}PX L`fPl}qux(a{}}%vU?{ܾھ羜U8 =*3&3?RJS#\6@c6JibnrVFv0Oy3{}vJ˝kE|D;ȅSn{>(h<7CN"Xf_υf|kpKtntwzW|j%~?M/Nh=dh4ޔ༾s|-'$=;Ia%zQ !) "4Cs:PTZa=hKmaq|ttw[yG{I}i~[?>h>߁=W<;Ns"N:+=-XL›W0U`UgllRp svxze|*}~y?\[??>w>w!>^0<_&h6|HͬUp_fDlpMs{vx`z{=}i*~  ;~?w?\??>>ՆŽOӰ :-WDT(_+xg!m:Cq@ktFvxcz{=|*}~I@?}?8w?N/_?%?:e>b8þ(y@ARUFaiVo#s9uw)yDz{H|R}P~I~m?.?ND~?Fx?Td?z(?p9L>YP=Ytg"osY~vh~xy{l{D|U}}^~Q~8?v?v?d?~?q?V ?e`>DIcYryz}.g9XFK}?!z?Ɵt? k?W?I,?>Ba6I6PdAqIy1}~EXnpK|?KSw?do?)a?]lD?O?)%>P־@(ٚ?bTeJ:r%}yz'}z~3.dz?t?i? 
U?*,?>=)\F#.CXUi6t z}%D3y?/q?#c?`E?pK ?x> >AoIc<}񵾰Ͼa, C)6{Oftb{(:~~Z-v?g?eE?z?~v>mn= ~@4q|e2yN KѾ1 p "!>Zoy,}L]s?t [?m ?>N >:V<6׽S"Q'zO}K Ը>Ⱦ۾J.(?)*LixĿ}^Gk??;?Gz>9>d"=V;eAmh+1Kfd~Љ⓾S`f׹ΧϾOF|2:Ѹb^we}T?#>ցY>G==ل"{94L6Խ/fyu2qDŘT5cӆqrs*Ŧm¾3e"ZwQ?Zg>U=RB9=9T-R>d 6 ؾ!+z?yN YA`d;g'qhȸhKh;g}zeb^AX|kN=db"쉾 yؼz8žٰ 6;E6 QaXXe\^_E_]ZդVQPF7Yp줾K=uC |U;B).5.='uHORSSѽPڋL!FK!yd< <.б<>ᆭ846&W4j=~[BCB?FT8w .r ۾ɜΉBʽ@;C;=H=0=i =#4x Z&O,B-0+$!| ͩ꾻\}axT@W=Ӡ<P= ==e==^j=q1=1LW˾H I4 B Y|پxư򄁾s$`/j>>i>۵=v4===G>8>15&>UC)>x(>R%>H >jv>V>>=S=2K=#?m=Y\;j=.G=W= 3=E>J!>/>8> >>5tA>AA>4?>:>,4>+>{!>8>M>=tW=E=+s=<-&=)=kQ=C=_ >>#.>g9>+A>F>lI>I>G>?C><>>6>->5#>[>V >==E=$2s=%<ʏR=$R=_!==e >'>&>0>@8>3=>?>^@>>>;>eo6>/>J'>]>>>=`6=+=k=<4==ۥ===P={ >g>>z>t%>V)>$+>+>+7)>%>c >>o>>6=!=:=8=]=<=etA=Y[z==== !==>*B >rN>L>>9>Q >ka>==[==(==;I=m=ѥQ=Rc=3s=J=iÅ=?Љ=M===_= =Wy=qe=rK=)=J;z ];$!{;;;M;uA;O;;;P;+b; ;;;ʜ;&s;r'8xv9C:<\:&b::;)+;H:;PV;p;8;hO;u;xs; ;[;;q;,;s;i;t;S;];M2929&-:?::pU:J;,;7;]P;~g;};p;;;¢;zq;.Q;9 ;;;9P;;;9;G; H9 :U:>:":l:ֶ ;;o5;J;B_;Yr;F;*g;샑;n;F;DԞ;ǟ;Rq;U;dǒ;;i;.4;_!9Y'3:p|:1::b:} ;!;n-4;HF;.X;&h;1w;;BW;;l;Y;;;g; ;ws;KQ;!;::Z:쭒:l: S:>;,;.#;3;ӅC;}PR;g_;9Al;w;;{;υ;IІ;F;0;:~;fwp;#[;><;;X3:~:{:<:4:};>;<&;`4;A;GM;X;Hb;mj;yp;u;x;Nx;u;p;af;uX;ND;|';_z;V:䬖:B::ڐ;;p;L*;6;@;_J;FR;Y;U_;bc;Ke;Q7f;Yd;Ҙ`;Y;oO;tA;.; ;iR:U{:F::d:x ;;1$;q/;}8;.IA;VH;4N;R;U;NW;cW;4U;}[R;yL; E;z:;V,;e;WT;':ΐ:ȿ:!a:';v9;{";z,;x5;A=;C;G;J;&L;aM;L;4jJ;(F;sA;|o:; y1;C&;X;;M:6::D:|;;4";X-;ad6;`H=;tB;F;|0H;EH;RH;NiF;5C;K>;8;M1;%);; ;;$:F:FA:m:)q:;~ #;:0;89;%A;o"F;U)I;nJ;,J;?QH;,(E;@; ;;34;8,;(#;5;l ;_;:::!h:5\:H ;p.!;v2;>;G; PM; iP;IQ;6P;hM;m I;IC;?<;(4;*; ;p;Yu ;O: ::͡:s:4:-:|;1;yQC;HO;V; [;@\;Z;W; )R;+K;B;9;8.;v";f;;:o:ݻ::9:A:X:ep; );8C;U;fa;g;j;i;Yf;Lr`;3X; N;}C;7;);0;5( ;oS::X:1:w:;:NB:p9;(<;;W;Hdj;ڗu;({;(5|;y;Ys;3Nk;`;T;E;›6;5&;; k;:W:A5:9t:K4:9kۋ9x8V!;meO;n;;';'r; 1;م;Ł;%{x;j;kw[;GMJ;7;0;=;.;%j;&L;O.;0;>:&V:+[:;9p8xz|(P'"8{;9-;<;;;G;;$;c];_;;(;!;^;L<; L;::N:69h8À]U=;;g;;y;;;S;2f;p;];x;v;Ѹz;=S;g-+;e>;ua:Q[:9ƃ0mbt_3~ϡ;1;p;=D=$H=J=nI=ZF= 
A=M9=2\/=O#=G=Qo=m<<=v=}=s=Ne=xT= @=(===-=s==ub=߀=Qr=\^=F=*=: =<֫gT,<&R =ڕ(=fe;=L4F=H=y4B=/m1== <*E<»u[ ZAܽ:|Wm*I1/@& 귽o)f;.ɾms]㥾dKLz_G" H%A E޾9zƾX \.zhZr,mp)!~rvz1|%{RytmqAdCXJ;y,*JgO辖о'=(i4tvK 彇|Y |?xpDdUE+5$Yh׾YۉZ,a};=]Խt jd8JyE~a3z/qmb<QO>+4 L3ܾv[ľxᣈnnTL-+½_bNcTr|4`|EqO^sIgm3n0- ݾžaY꛾NtpUj7I^1IIn}x~?pGW}B$p³l I>,(m оش=X~%Kch(K4 ЀPW,iH WLpև@jz-`l+AپCŶh;8&v1](G4b#<]p*Ľ狟?[iYT<\<;X+Ek_GY-BϾٕ tnT?&0|#yzN]%ν |-A===%>P>o>=?,?h>8=͓ _>֖>Ӂ>ơ?_?a?;#?>u>S>>ANj=<BU:򎔼4 3:p;>Q9f-˨D~>[>Y>>a ?+Y=?t?Pu?S?g3?=8?@{>R?>gim>>=4[= <ީ; ͼPż옢QL2>e'>6g>`>)-$?0aX?|?z?rh?T?1@?k)??_}>٫>ze>B>=nh=Z$>>m ?*u=?Pk?tI?#v|?bq?e?VY?K?F:?%%?6 ?s>ڕ>fJ>D>?=hI=gr,2`>>*>#?3V?Tw??I}?.v?o o?1g?^?FU?I?8??p>>Y{>#>X===6>O?q>?k?P}??}?y?t?nJo?^i?mUd?]?1T?G?3?g?s>ԣ>A>=vu= '='>#?@\?xx?N??4~?o{?Qx?Dt?Rq?Nm?Яi?e?^?OU?E?#m+?Ý?>\>>m=`$=۞>p\?G?r?}???~?Q|?lz?]>x?u?s?Jq?n?ٛk?Gg?m`?T?{???Q> t>V~>=c>-? j?{?]???~?}?g|?&z?"my?uw?7v?u?ss?iq?vn?i?Qa?1Q?^/?>q><>e~?`?(z?c~????vG?~?}?f|?|?j<{?lz?ky?x?w?v?t? r?\l?a?;E?O?/>~Z?say?2~????o?`?I?~?}~?~?}?D}?L|?5|?a|?~{?z?z?x?7u?xn?6Y? ??"ޮV.$K/յ/@0W-0Y=0>D0}E0D0 C0VA0Q?0>0;d>0>0H@0.I0-c0j201}1r2/ 2և./P0W909Z0k04r0(s0 q0op0Xgo0Wo0p0t0y~0~0#0v0,1h1}[0idEb*h P//>0z0P020b00ś00h0BѠ0;ȧ0ʗ00O0 )1P0H1 1f0Kl/䂯 f^;p!-|/W)00}[0˱00|0r0000-02810i1ʆ@1k1o`1i 1t00跴/vK,/ 00t000X0x0.090t0 1-1,715`11罍1+\1e1Ð0'10ϐ/f7/\ %I./Nf0000 1˞11l+(13>1^1q11O1h1|=1Th0 Y0 80%/ /yk/a_.b/k0Ƿ0܊0y1H%1~81ܒN1 qk131@1Ӹ1M1s[1j1!1˴0p0J*0 //ҳ/j`&/]/+00|0516:1o/\1L71Go1प11> 1 1 o1gy1F4170'0`60\0/ĝ/Աk//ٰ.)0x,0;\1O1ot1)11N1t1111o,W1cZ1i0P0 0i//0Y/**/.:.5.Ue/ 1A1t 141L11@1ϛ1y1&M1:0v0 0pa/]Y2.CLxͮǮîtՂg姮ҮX111OY1bc1~L11=80g$0Y-ǯ!oDOLV@̟2> ]d{ޯ5J抯kr7dx/_y:و-0߰.Lh򰻾簷ذɰ`=VmP,3rίy@ְH9B.8<=(:X5s-dG%^JްhȰ uᇰh? 
GutΰY|j1BNRU^YXzVQ+PI?]5M) 6p-<찏Ұ.xBޏM{/o^L #96]FQJ-[^ag_\XU.NDC8*) TE}°ެ(~wVܭȰԼ5l =M2z2>x)KYQxY}Z\X/fUrL|E8\-!f$++mZݰ0Ctph|}װ <&ɯ3:{DgG LejKK&FUA7.|h!lV\sc)°2r1Y°ᰶFOq!+.4a4T7!53Ϻ1)#?IEZ욾?)N+sΰ]{ݰOe  7qi1YX't6iϰ"㯰lu7蔰5ҩ5$̰MUX,bwpb S!{ްoɰ[Ѕ{\x)_0}ő*6x°@#Ӱ\հgi(yI$hQ< 氅yڰְ5+Ȱ߾ MpaG0S^arA̰>Ͱװq԰4UܰT]ְ۰Ӱiְk̰˰%Q׻P-',fl*G3w273C1ɰ˰0Ұ_аJ԰d ϰ|а ɰǰx鰰܇vU>4Ս#*h窰'İ˰mаAfҰWdӰqѰϰқ˰'zǰk۲y誰_Bu:\HfvHe(`*𲰓Ͱzmа]ٰհYFܰӰowװ#D̰-ͰEҏZg!X$j}*_~D#W)Vۜ\ǰ ڰdװq!ٰܰX#cа`ذdSðǰ>-ˈH^F#5ݫɰ!ϰLa߰U*Bzhְ]ȰհLѢѤ腋HSbMÝ%{S^o;ذٰ)X`d2=/Q ް$Wqϰ氄=|Ͱt맰VTUhoX%D*!Ի8쾰H'zk&gV 氦hְðsް+pOe30M(0]ȰOɰrm2U7.!U>)?ް'q)ʰRܳ%&Ͱ㙰Cg3y@sT600հ?հƑrgK+$Cn/+ t"|q?Ѱb#ݰ^Qű@, fA>9&䰝ᰓ+6],M ~:J >zR;L`0rX!nذa s%b.FCEﰒ() Xi:}i{1M#nݢ 0Eް`:&JְeC-MH$>1Xs\ҪWwXKիL >K0o.0uP} 1QF"Kn}c׍-soW6LѯhoЧnig*,<[m*FK :fS'h ^C+xȰŕ@ؔr/g4\ܷ&z{W ±yaDZ%l±_ [3lRGtv2z3Y(.jS4mӰI8A4vyǛnK쇱SBh抱α↱YӱM~ӪͱckxVT+>gm2%u+ <Ɛݰ뫰 KGpӅ˱+CD۱՚Pޱ/btرe|ʱ*dJJ;o/F\ ~DxǏ˲5!رGIOK]J;ݮұ_sfVG顱S8e!iJ"+} ꯰7J$ñ; )bM|R'eFԝvA책Kڱ6y±(bզ/1A) OPLjUGFPѱLU^B Yxo;dSi.uErݘ3 %;ϲ%Iͱޱ6=賄9DA>HBl G)fB%j3sloق(6cqwlQXkc*^ۺzh(B\*,Gɲ#]> `<WP\3`  T߲&pӲG)+u@^(f9,j>)GؽS5Vl^[_zmXZXzMyI;42$|P4~"в킯-,kO )dK'`&]J߳ón̏vn<S*Ef$;޲94564+OOh"e>7J9`h @ ,# ֳCѢF gN8Ϫ'߳Nsw 1>Pۯn0,ix>;! U-s_CسIڻ=g"7Sϫ]貳ʳٳϳꨘ[ԝ5Ǵ \@гH 0JZJg޳Pog7h6}\|ZN~ QɔR.lf:ƧkĴlkڴ~ִ3&녴kDBų ǑNh7T2=:aWLH=}^ݳ<dP2^O1s񅒴.$}@r ,zسIԏj#c8C^L;XͻKٯճ6 jF5rY LC>³䖳9co/Q97%ܘGcىo c:`̽>ѳ ᳤w5!&QGy:39Ksmzqd;[AQ;׽@г ϹZ es{}쭳(Mѳضﳿ!:1Higp/1(ٳ#AF]]{^ݳˆHl5J>UWoVP Òa/!ҳP90̞O=V$VBX2)@NɳSٳX.~;"6^?,x>6NEY<`wdh{mJvrdz<%& Ҍ&ﴳM:γne\ܧ! 
08(L:;V{%*:-./1m>5<4_LAoO񳸞y(0<\Qb @߲1ӲYϲ8ԲqCZ P " )$ u0 ՜ Cu})KZ&ʳj\}OHe[mumax3-3.10/test/testdata/binary4.ovf000066400000000000000000000740251371432437400175430ustar00rootroot00000000000000# OOMMF: rectangular mesh v1.0 # Segment count: 1 # Begin: Segment # Begin: Header # Title: /home/arne/wd/les/ingenieursproject/oommf/test1 # Desc: Field Index: 0 # Desc: Applied field (T): 0 0 0 # Desc: Iteration: 1623 # Desc: Time (s): 9.184200135885099e-9 # Desc: |m x h|: 9.6355219620542264e-06 # Desc: User Comment: # meshtype: rectangular # meshunit: m # xbase: 1e-08 # ybase: 1e-08 # zbase: 1e-08 # xstepsize: 2e-08 # ystepsize: 2e-08 # zstepsize: 2e-08 # xnodes: 100 # ynodes: 25 # znodes: 1 # xmin: 0 # ymin: 0 # zmin: 0 # xmax: 2e-06 # ymax: 5e-07 # zmax: 2e-08 # valueunit: A/m # valuemultiplier: 800000 # ValueRangeMinMag: 1e-08 # ValueRangeMaxMag: 1 # End: Header # Begin: Data Binary 4 I84ǒ5B??R@A 2r`~=ydigǾ؈ l>8zvſnB\ɿparQ#/]t侕L10)y2j1 /}駾71X/x%<@4<%t <԰^oe<踰ܠT|8" ?[ t7&?yb_4.?~; ^IW?Wٓ?HiZS?~?|>2LQ}6?zd>JA=2?y>`2,^?x{>vN%7?v>B?s]>ۚ?kG>c@)?T?~e?Q?Z~<0($S9"3ܿ6KH6 E׿S!^f ZɿK_) b=y>._e40+jʾW0,xs\1A޿|Q11Y_=&-րY=Rʰ=4Lۙ=Ø$:L6:F\4:ÿȱnǿ:Dҿ:q); H$J;|ұ¿;) ڪ;;<(4ؿ;Oemo;fñ;S Aa;$8;-9G l;vp;Λ1E;&ɯ< ˳˷<#le) fdj7?[\N6̷?wՀI15 qu?}e^??.W -I?v<\$fE?~=Ŵ?|>$+sN?z!>[]?wSK>'e|\?t>:޴?q/>;L?nD>6?g>³;Q?[ t?\p]?;??-ʍN>#?`:M>gZ?yasQ\'ҿb[4 Ls /ÿ0[9*b>j~+!p;GH ٮпM@C..I%R!8/bW x0aQʾ H1\;oᄈ1 t|F!1YO;\yοmZ=)9HH=R$|Y~= =AteԿ = ׷}<0YzhοyJ85'?Tc6E?\6{ĉ?w8 5+_M?~DN2h?v;?~&=ݝ-?yY>_ w?t>Y?od>g6Z?i>Wa?c#>*X?X$?OG?Ee?#? m?G֯{]>zG?j S>Y?z(\en=U?~2 bPk Zӿ29Krh)m?z-Ͽ46/,;Ŀ.5/ҵB;&640kKRѡ0|ZI1OͿm14} 1cb=j:ӿ~i=QkB~z=!_1jֿ(=ێ#w=z[Y5<=38?Ϳ <ژ۱/:1;:lH:˧5ڿ;j%B;ܐd;f<ƒݿc<]b>b<,,,9w>r52Y?I6˫?_/Nˏ65u?xFpL5?d+X;?}M>%OO>?t>1 ?k >?a)>`~N?U ? 
mL?E`?#޳?+@?>qc?z?\@>?r^>9?{X=G?~c!=9BR?Mp".V;ʿdg.zX/P WNy/|ȿ"iſE;0 +`=0fN54W0Bѿ&Z15V1R 41ton1L~ )1L~=G.ٿ~> =C.ݿ~9~=Bڿ~Ĝ=Ȯ=69߿I= l=\ ќ=F쉱) ;OHݱ;afڿ;uڲ s;'QۿW;̲"b3;=7^2;lUL;3l;yw*<_1< Ń<3`ҳ>jX=W^ _ =e-=i6 |,=`ۆ3 mÿ:=F4L3[=)44B=<$4 N4NETw5F{t:s5< k+6;y.R6{_6$s 5&=hN{%5(9 4Hy!w|Ȁ5Dq>οn58n?%B>6gz?dT6?~޽E49?z >P鴇?ig>ЁӴP>?Wϱ? aϻ?Dl]?$-) ?,*?=u*H? Kp?V3>k?k>v~?xx@> N?}˓="d?];~.=?0 <9?Dzߒ@ ^tsR/K$ʿMk/񠱿a/Y 0)WTQ 60t09Hu'0,6=j0ܿ=!,O1:6U B1)r)Ѿ 1ep1~H=W0-}i> _α8ѿ}q> eNʉ~X=PF]=2M=z=ſ=84:޿M˿;}\s;n; ;"{h;-X+;98IG;GMb;V˱m̿;g2տ;{(߲o;r'MS;둲 ȿ/;|!a;ղB^;Gsաh;䔦ɿ<=ʲ"<F`ҿ<60G)<\(P!}51:2>e:p$5ݛ?(z@6X.?qؾk5Ñ?rt>o?N)?ƴ?,I?=Y+?O?Xa0Q>?kP=>x?wT> ?}P=nm?N ~`A0<٘}>'۱UR}> {Q~[=ʧ>2zP=&<=UO=O<ʄx<.\<5\>8;{Ա.;Km;h];0;̒;wd:`&:7:&@+̿:Ա:~:LJ<;=F[ -;lڿ;JH; ֱ̿; }l;,h;>ҝ;pⱶ;$1­;,z;6daDC;A%O;MP |;[ F;j;|5(o;1 [U;*%O4;:TD;X+Y ;񁲅lt;˿<wrFD0<ǎ<5nSؿ <\γBNY\H\=\ Qax=u-{=f۳Vpy=)γ=r532L=B4y480.L9pzj5? VؿG_6?M?=h>>?_n>?t[>%)?|=?It^}<?󦳫KӼoA?o ?زZϽ6?5lAe?"49{?b?oYqxv0@)r09W:ؿk0zؾ d0[}\e0οR0$C1%H>W-+017L_|1NƿzE.|Ee>.#=|>&Y^}>g[-J~v^=! K)x$=q3ɿ=qǛ=)k e$Vr=\t=qI3PD}=1m/4q<]4{-ɛ&ك5\n}l~*5;5ru6q@sRZ6߉(,@6׿z+XL5ÿd:4 z׻jfP4m߾m3됾 Qp^cA"sq/cؾ&qEyOp?sXοl´q(Y`- <`e-Y걼\o-?,"?a0aB?uPP?z.RҾ)?|v#<P?}I鳯I?}賋c ?~4e ?~>Jm?~^6L?Gv(bq?`8_y0-Wu>tc%0Z9n/0P۲g;0˹^0X%Sˍ1 &B18~C嗿$1oGg;1t}1yM=-Y|>!JL{>9g:(|'$>0ۦX})>^Ҋ~1=ѱQY~C=吱:so=6!q+=>bޱ ݂=lƿ;BO;;>`7`z;:HsUs;7뱃z;5o;4-n߿;3;4`Xw;6*;8}a;=AY;BtP;I)UJ];QIe';Zʲ&#;fYc;sY ,|;žʿi;7 O;4;+; 1Sd;]c;f2^T@;< #j<+e^lh۵>eD?#;i,?S5޾օ?hrfl?qbw?v.՛cڋ?y`:@~?{o{s"S?|QUh?},?Գ6?~:L?ICqSmzv0=Yuo]0kow02ľ։hq'0eP_0x͒S1˿*r?1NܿLf!1pA澰x1¿ͽl[01M&~!=Ư|:>/ϰh{b>A+5X{]>8@Vzʿ|>za}>Hc;AK;@ɱ[<,;AI.l%;C~M;Fʱ/;Jn;;P6F;Wܿ;`rL;kN3";x{%ճۿw;Ob;] /(D;0>.i;_c6(;p²t;JS<#<"[;YA7ta<~ߐ=8J7cI>>x6I6?3g ſ@?Tָ:\?e˸2PdFG?o o6}xf/?}ф;$?~Lpz 0D>wuU0r4 ڿp0bRjh0|9_E0.2P11ؿ8TF1kq WNٿ |1àwn197n0:}>5l!{>8ӿ{!L>Fȶ-s{>=3P+Q|y>%t_g} >Z}~=w]Gg?t=2.տ=cR7=!]ڿ]<dy(<۰]T/bDۿk=~L=28v=@4e};{MCƿ|J[U"Eʖ+:>O<63mgDW#L* UR mH}gtxt Y޿<͵ئrY>u!;P?8=(?@1ƯF?YV̴O^H?g1 ؾ?oJnѥ>2e?tDz'?x>]PVK1?z&N6y?|fV ?~}Iz0E} v+0s(|p0.hK0rY]09 L1(+l;K. 
w13cV1 >|]$s1Z=:40v}T]>GDo{>> 羻z>Il%Gd{wx>?DZI|y>)V\ſ}S> B*\~=[L (*=4=s3qQ=.N,U,;wC;LV;0Gٿg; dlu;8˱4;}ɱ;rY7;h&'K;_g1;X> L;RFKF;N4[ _;Jkcſ;HE|e;HQ?;I m;K+;L;N ;T2;[wkVѿ;ew)r;sz;.r^;(Ql1;x,;'&m;95\P< Xmz<2|s`$5!x }I6NfܿeC/96\}LW56817"5-;<4!'=Aj3+&3BF(@O$I:-Ի;մ45+󴧚=ÿ+ളH|0nZTL=aF(9o"CznATnĴD9Ͽzy>Sd>{@??)kk#?Ks17N?^Ƚj?i^ Ny?qR좾O?u餳 f?ym"YE׾2u?|%{ ?~H罿{H0D쾌v 0q o'0Ś۶(g;0䑿Z0t"AF!1>3H1@o(%1Ң .z`1W,o2=0 ȿ|>8O{c>At5صz(>I{q)>@^?|f4>+$UX}{>NrX~ik=Kje>=G4aK=J1i;=::ibܷ=dhZഷ qA>?R@\Tb?ힴkl.?:F p?UF5F烬?dUm ?mNM(K}?s~d?wu`&51L{b>AAɽ{V>G{5>>5]|f>+N.}n4>LUf/~W={K쉿=*07Tt=i=Dw,=ymige>}_~C?%%.p3?IYr?]wK?iг ?qJû?v7dwT?zl-:-?}Dѳ PM]zϰ0AVž9ulZ0ogX(7mo0h`Ob0ĿPQ1-)CU@V{^>?4Ұ`{E>C?{>;)M|z>)7+CD}p>Lr~P =F&~=Ա35!kt=?Yڿ=K~=߳ |z#|9YG˴@ <U}1>㴅Hr:J>ٴ&3X? 6봽1*?8,a?T1!5䅸?eц?n澓?u˳hc5?yk.?|L "Qxz#0?Qt10oWO߿l:0B^Ѯ0-!F17,N5<1s1[sh)Tx14Fh=ye0P}ޜ>2LC|}T>(x2̿{>:=ؿ{(>><^{>6oe |>%8}~>9E|~R*=-.AU~_=41Ͽf=M5=Qp=xk<ːՓ;>KJ޿;43Vf;*b(;"vm3ѿ;0x;ޱU;J1; M%v; ;0Ͽ;L %;+-gm8;J9޲;|zT *;n;x;:<;rp=f\⚿gŴƿa𵷴A|gU3L#ilD`eqC:60FYu9X2iyž]1&`}$Hy!e/;##, :UB-~=ЃJ0 y}%>ezش¿iU>Ҡs$Hf?  
?G˴GQ& 1?^ԧؾ>?kٳa־S?ssm{q?x/(r?|5 $)XTz80>ﹾSsƜ0pEi[0;z=XA182271`5a1O{VyB1j=W@07#=0 }>x|\>%R >|>4,{>6ӓJ|4>/6|> c*}> Q8ʐ~^g=d7~=j)dF='X=TI bc=Yy޲%:ښ/ ;>epԿ;-_>;GҲt;4;I<s3<9<±#|5X_X5:/XR4YڿVb 2JV d,uXw\#pa_fEnd ;OPh=h˴|,¿ll[v]pYaAtk@A8+lw~{G(]}zѴ :+& ᡿9=[4Z}'z>BDkt6>}]4>p6)C?3ɴy p?UOۃ?gG/`?qivJ޾r?w11?|a 0uaUy0>d;rrX 0t儾)ey0ʿjNk|1i0HVY1qn`B61h~1!<٠0'~ڊ=a/i}>п|-> HͰ]|^>+|I'>-̽h|>'J-p}>)}<>ak-\~tn=୅.=J!#ȿf=˱t=UGSֿ-=!G֡<1<<ȶB_4<|!R<='<)<`t<c< v܃<$O#bE; ;ۘOa0;ʚκ;ư;Q.=;;DԿY;Y@ݿr;Iݿ;xN ;dY ;R[}Y;As%2;1M/o;;#(8S;pʱA1/;IC:SoQ :\ :l~j:hf:CH:V&I: :auL:J*; Eҿ;L۳$2T;\l^;Чÿ<K"-E<`5*Jf@n>LO{?g3:"?E:!J?`m!f?nv⾉s?v5>m<5?{~ վmyJ0>ep$0~y`_0,B=1@ʿ`Gk1zNO*1=|z@0=P06`~o=/}*>iLJ}%>vj|ȳ>!{mV|8>#5㰲 |~>]<}j>o }=ɐf!~=׭y!h|=?l=_=R+![= +5D rx">{Y>Cf>s>!?+m#iH1(.?Tľϧ?i䳕dr*?t־Y0OHR߿d1krZp1\+~h0hTt;0pĬu= 0\~= R/Y0}'=ۮϿ}>V{d}<4>8P}2>[u}cz>㖰⏿}>T~==ﰡ+$~=sl&=BIw= 6=M{=1ڿ<TfP5|شj6lP㵴y]gnbjdp侯XPtr̾=5,xt|Zʴ؊vʿxtzcU8ٳW{4[}UѾ]<~mQ쳽;iz3IX㳾<$Zӿ%=?z,|}>#Ƨt>Դ9Z?ôpg)???{09PO?aQz?r L_4[?zCľzvH0I.]e#0#+6>R-1) d}b1`oӿ{~=K1e620Y 3}y>M3,}> Vᙿ}z>߰.޿~!=6Eڿ~=[m+~=:VB0=vEu=yWði.=Coғ=zc<?5"|1u/5Dw(2hd5r0ʾ]5Kſo74nz0ʲ݉nrgR*oվhwY0q}SYrkFtK澙 -ѿu߾i(|wt,xnn(z`Up͇{; ೼U|H1J}۾pn~'%]7PCX;p:n PAKoڢ>1/L*?稴O̿F?Q1<m?l\oAt?xHrv0c- IWo0vP<d1H0Pvf"1 i~u0Ö~j08 Ǟ=)q/ XY=e/ę~=i.˫~=ג᮫տ~T=鯥J5~1'=r~.G=h~K=ͰBx~R=!}~t=ňZ=xs\b{=Z=eqϏL=5;[=P鰝 <f,<ְvpo8[OD\(i>Q$V=:2|?/^&%3?aO5?u7)}uCj02j54[1,o( 1 }}y 0tӿѽ C01'/%}=;>S/m=/kԿ&=.:~n=g~’=Sïؿ~=Wtׁ~=Ъ?~=6`޿~=:߰޿=(S=V)c=y$=Kr"=#wп =TM<İl<Uv<ñHv(4N ۭZ!56۽5d}#x>t Vb>"e?E;辷?nxZKNO71Ͽe3/1h{y60fty0ґ/ސϿr< / ʿO=0=H/ҿI=j^/ڿj!=l.5G=m<ܿ/Y=K2kڿ"i=EΎ"i=E۰G /=+M޿H=8~(ij=0=r֋2L=M?ǹ=)l= m\{<ݘapR<."G*|<۰4>˿Af;`F.;4N@@;Wb;Sð`oٿ~;9j3;iu;QK;<>e;'|c; @;TW8:M:F:ʿ:sڰ:Apѿ:BNɿ9k#8ReOȿNȰ$o^8(AD]ñtұ_+,x@-tv 9E2k:iJo;೓(%;!y;ʳ`;賕^: 1|"@4CSٿۋD5bP⽕5V~ 5h,||r* 5'{ѾG_4пzjTh3_zMVJkUzRųҁVz¾J4̀{a(AoԿ{37i|W,rV|U ZJ}I=}*j؄~*iX~óKnz~IWPBvӿ8Խ<y⽂9 M8N˳7"Pճ75m7;9+8c#<9Q/-=' ;dqL=mA}>~VXBw^>q Z?O{H?Y6&_cþz1}z:MĞ0[}%̽/lKآ/;U|/7f<./ky{= i/&`jn=1q.߿=L}eU=`^=m?#d7r=s+yb=s2$u;=k㱰/{=]VwS=I;tC2=1Er=)U{@~3_2Կn𐶿u#(^=k,hFm#H^u2:ۿ:1afӠ:߳: 9: :6dٻ 2Vj 
34=Y%4݄g5)~451~-,4-} "4vG}8þs3Zn}*핳+ڝ}Eq}}tƾ=} ijB}设qt\~%j0A~aj~i]R~@V 3@Ik)m轈,"+i[?A-ϿDSh4BT>wCؿ< l<'A[=$`F^=2}e><&ҿw>/yUE? \j # End: Data Binary 4 # End: Segment mumax3-3.10/test/testdata/binary8.ovf000066400000000000000000001666611371432437400175570ustar00rootroot00000000000000# OOMMF OVF 2.0 # # Segment count: 1 # # Begin: Segment # Begin: Header # # Title: Oxs_TimeDriver::Magnetization # Desc: Oxs vector field output # Desc: MIF source file: /home/syukri/workspace/oommf/std4.mif # Desc: Iteration: 5, State id: 20570 # Desc: Stage: 0, Stage iteration: 5 # Desc: Stage simulation time: 5.7528e-12 s # Desc: Total simulation time: 5.7528e-12 s # meshunit: m # meshtype: rectangular # xbase: 1e-08 # ybase: 1e-08 # zbase: 1e-08 # xnodes: 100 # ynodes: 25 # znodes: 1 # xstepsize: 2e-08 # ystepsize: 2e-08 # zstepsize: 2e-08 # xmin: 0 # ymin: 0 # zmin: 0 # xmax: 1.9999999999999999e-06 # ymax: 4.9999999999999998e-07 # zmax: 2e-08 # valuedim: 3 # valuelabels: Magnetization_x Magnetization_y Magnetization_z # valueunits: A/m A/m A/m # # End: Header # # Begin: Data Binary 8 @w!BҀLV!y.!&Sx@sBxa$6 'D+L@>]v%k|oD̽H@}3)&ƳGxvZ9@lyC&d~@(&,[nZs@<y&ߢÉlj@T@k0 'qJI?ɩ@\]':FR $@ 't[L7"p@_6(ɸw1A2Pf(mdJ֫g+i(_|OA2]@6/0X2cg(tq@c\ b)2h(D޵v@]fHh(Mho@Rvl,ni(LaY@V[ [Ѵl]i())b@FԿHi(lإ@L5D7Qvi(ܪ@UIA듳Ti(K{Gv[$^X*i(;=6ZoW9/\i(P=bUjb2"oIi(/:פs^$i(Xs=*lb i("P? HoOi(o@z3| $;ti(T7T?w i($v j!$7vi(`?wCScSi((2VKn;&i(ZpȰпi(8Y&Li()ug$! 
i(|Haj˟AnN$i(<ԉ.S[i(2-,tYյti(+ O} t_g%i(Dі\_Kf>i(Հ>W@X[nNKi(2Eu@fOc+wLSXi(@΋U:{i(eQ@T6@.8i(v]O @Atdi(-Oݓ@Pki(0آ@Ӫ-#qni(>x@ 3۳ei(ym@dI i(y@m"'4i(:0@dqgQi(X@/:7(]ci(F}~3@OSȴg|B-i(Z@,|, 0^h(W@Qh([o@Ό#a AWh( h'@W U͵#h(6f@h}Ig(@`G& @̎`h(`@KD_2]7h(묝@Ɉ!d׵h( s->g@(־؅pi(X2dV@Fflfטi(t #@h(A4\i( b@n)"Uai({24g) f(d')k)[](vJ(lR8>a:(7>!-!l ܋@8'P0W e$,eR"" pRm@lG:[@?| (@+Ƞ$Azg]@!MYT'AtO ׺W@lg0;(A-fEvgӷ@A\(A`Fjp@lr;h(A+g.`Fմ@sT~h(AslL@>z@L\(A9@\@B)D(Ane9@L@r8'(ATu8W@GM&@ ͣs (AG/@"ANߐl'AWhAoO6xρ'Aa^ A^;ˢL0'AAtЩ3r&AQ 5X1A6\{$ASUA6ܮDJsi(0A5r'$AyN2[w&uļ$)Y@5V!I!.s;!"*c#@$i}4#' #8 @ԊJ$Ak.$_9Uz@DS$}`Mc_6@|,V%sC K@M2ҳ%F4ҡG+N@e%/haI%ڽ@}Dtp&̌;l#@eD' Ee ,Hí@0(ƁWz4@|>'\h(eVV0ՠ#ye(jnF@d<:ᡲb(AXi@gd od(4@R&g(;cJ@Iͩ}̶Poih(17@hЕgyZi(ҝ,x@G0KWi(LM&'_O@_n%VӼpi(k;,@ kD|)%ri(T:q@!{Pi(}Z.eoFch~!i(B }AHi(ɟ-l0i(DF^w0"$zJi(f4ʍw(o!Di(,v~hq-29?i(Kٵxp07i( ܢW,#5 pi(ӟR$E';Ri(Zi8)ۛ+")i(A+@Ѐ/Yi({Bʓt~34qQi(ZXK6#_PQ8,7Mi(fzƖϨDQ<2Ui(tU vdS>@Evi(2V$DW#i(Н@8H6}i(l^* Lqqi(q*Pki(' b=pTTi("]%_ X(Ai(3q ܓVŧf];i(B~労b6i(hU4Syh[~Gi(OZmi(h]6_;tɱi(Jy!S{cvi(n0^ahOr;iUi(fF@I$vwi(0.'s@@򌔳¿i(X@ฉ0N$i(ס뤖@R>N i(7@kaiZi(iN\kǝ@tz˳1i(a­@'Oi(eo@c6Ri(צ@[¹i({9 @utF%gi(F@Nzy5 J~i(U0@>z(k3Hi(/di9@0fqh(#x]!@b(Z2]h({"Ǎ@ ˺T:h(kD"q@] nvg(}LBU@!5~َf(4̴@%жtѫf(G(q@J &Jf(DF@WQ9(?c`f("'@w⪴ 1f(3hT@@@݄۶(sg(]V(@mD`ih(!Z@NsXxa i()˷w˽@ƮpL$ii(櫬{SIeg(VN#,1qbg\((zwX/3(ejڛ 키1y'iI e)+٨Ǩd#:#_h5@qa ߢK@&Dzr!@w;@d(R'@K۰A|Ho%R}@w $AXyC@e.'AfNq25: &geך@mU/(A_>9ѷþ@>_(Au 1@"Ph(Amw@0L@(O(AC~@pf9@! 
"(A:Rn(Q@wrd@ma}'ACkAIoYn\ 'A3\]-d AsѷA=b<1ci(.<"ˠ gfOi(cD缟j^}i(8)2nXi(/ r/r uwi("~#}wǨvi(X+mz{8Zi(Y($gi(YFDIu$i( ;W9խ|i(kuMiV,ŽCi(Q; r)]si( ІKQڋ'욳4i(9%;e1z:ϡ4Gi(Pl)i[w9i(YeC@Smi(K^: s@6`*i(}K@e*tų>fi(Z9@anѳe:^i(%!@|]߳,Gi(H,@ ^i(-n@ei(H|<@7]yٝi(7="@#D;oi(B{:4@6e(h@o( AY^'UhY@TݛAYlqb$2f@$AZ:^-85@ GRu'At>E W)@_b;(A*d=y7@mi(A7贫@9G∲@0la=(Ap'K@Tl8@?}P'AsssAV O'A#[ AXp(vQmL&AyAawARAE%tB&A&.ft A[n%A+A`fL6ݑ$A =\JAlW,ﮝ"Aen]mAEy0 MA2!6#AVJr@a|NAv]gj&A!!0~HA.'ATP_Ipm@*L(A2<"ahayM&ē@Ol$3J)Y\@ _Q1eA#۪X/b@)Lc f"c@T!KA[SW0!K ?@=؞") I.s ^]@ "q3@th$iJĽlWpq@ K&Ɗ!sK@_wx-(gx.(pzQg(Vf @mFJ;X0Q(1U@(voiRɽ3+N(vpc@:'/_W(m @ #a ̐_( A@׳jJd(Hb@q;6έX g(;B@{JU$gh(>n @N!I#i( 4*@kcѴ i(̭@iEJ|ּi(IU!}Q@!9xv-i(t)~Tji(WKAb 9?ɾi(@VŴNnr&^wIi(wrޣB`F=lÒIi(+e̤ =hX]LHi(fLAΤFg g&i(C uL(i|Ei( VY#J0'msFji(S&(/pi(GVDs%tlbi(7v(ܠ&gxgi(td!-Lme|p'i(:PJzQ*i(["Fi5teGA: i(lwC݆/vi(}'K5-ϊ,i( Z5lWYlۥ=i(FX7(>}%i(3yEa<Жݫbi(.&sŴrBCi(|V{h[Ro]i(,!sեbi(m 0#-qc{ i(x|ޏ;] i(:L pzhBi( =A]1}i(yij@hɳԈƁi(ӳ~@$ [ӳ6i(1Ay@Yr~?ݳ^i(s5@Kcc|3i(X#ԕ@Gڌ di(`C@!f Xi(|X*@K RtTi(@@t:+i(mL@)e\[Oi(pV'$@{ i(- fZ@l_Zzoi(q/.=#@8_Q,+i(np;@ I ]5:h(S+@:Xɵ*'h(̹ (@a. 
nHRQg(^Ji(@pYֶ]HE)Cf(vyh_@I:j&zie(zeu@wns#cc(̊ @c u:b(A><@ uh˸NTΟb( @MO_c(/0@qb¸kMd(Cبe@q@  _f(iZ@xѳGXh(3aR@@w0i("SSܦUX}'d(Rݴtfh^ ,H(ވIgO"!Tt'.Ux <.醂+Rw%f6@#*,"A@pN3u&ϐ$%@c|;I'7:6@mӰn#E_(ɏ~20@ +h@ĖnF(Y}@ߪ A% ^E?'t9?@οAc#%ifP@Ħ#%Aլ!cD |@-q'A@#E@h e(A+Z|>@Բs!(AM#D@0q&ޖP'A4 AѮĈ6_&AkbA3Fį 8Tg%Au`{A&N2.@Cͯ:F$AW]1A34Y6L"AK@3X?R(AaTs/"S@lbDb(A&xg (K~R&or|y@ ĕ'%vE׸@>q8}$yP)@fO#"4#+D"f@F`8g"x 1@+pD q/!Yr8)@_CZ{!ā|!įT @0S'"1^bU@{_N$.,I~3 @^&^1IW`\PԵ@(@H(՟M(m ^(E@at 5 B(L@t7UUyA(P#Ӹ@ྀnN(Jo:@}j޽K]YZ(cJ i@^j/b(8zo@k,#7Df(R)Y@ җsU Zh(a@`,TVLi(N@q8@t i(bl~@d/FRTi(fM#@P_g,!1i(v!K@ms/zi(]䒇5k%i( |t{a@𵣏Ri(s@vG;شڴ`i(vH f@Ё%pi(*2@'|zXTh(Kl@{Q&kvg(o"@:f(NEc@ /te(ι/@$Eط=Gc(^@${ujywb(~_x@>w#\a( t!I@HfO`(b@Mba灹Yw]a(#\Y@+CO`c(Z?i@fMţuz~f(-@z ;1b]Zi(t: @&rJ4ZSh(-=[q M0fW(@KO,ZyjV(AˁdoBxu&סvu1DN@g7"mKt 6lx'@=)0Bm$^,%+V@a꧄i'_@6mѼ}s'ƌP}@LzG(2i@_v^@SGwe(B@ꥬ@&(nƐ!@K|| -A#E&_R@bB75Ad#"/AI@Ξ̥%A/d_L@@y'5(AqM$s] @v'AC>'s A[Lp(A&AWC/A+#Ȝڽn}$A96 AfVkZ"A2]Aq»2n=9 Azp<"A,e;A_gp$AiZ !FWATp&AFJہ{P3ٽx,A'qa'A1^&}@?R9(Ay1iNR'@wD;](AwϩHXM@O5rne(AZj+8X$.25f(AB&)qk  o7'@8?aT˪G&P@9fZ~]%<=hz@+wB _Xt$q yP@$ :5#锽:@0@=Z:n"zR@@08 t]!Kn@Мs]5"uA 9Ok@ |$6KD@b;R'&p wű@i_(! }wQ( y:/@?]aN;1( 0@@HV4(֘s@qD(f1@7Z4*sZT((PR$@5yH` ݻ^( 3t6@Ѻ|<Sd(~@_?ןGg(SsPK*@v¶j h(*@&Qhy6i(+@)3[j3i(pxް@ᵳ-dLi(L̘İIci(?Jin i(yTaóx1Ki(=4+-o\kɳFG%i(KMB{P@ϳ=3i(pESaa@vz ֳZ{i(jw@pFK޳a/ i(Z1]@G 2Ri(Y%@.{t3i(U0~@]wAvi(3hK@tW? Ai(_4 7;@ϺBG7=ri(D 5@x=5Xwi(ڐ@TݠQ|i(jj@,vzi(춽ַ@q:ަ9i(}@MV@sVi(m;U@~L8Gh(Ol @. bih(M@ZDFJ%,g(Z@dx8Tf(S4@՝`rSOld(pl@&c-Kb(TjXJ@ .0a(Vʢ|@s|3$L_( @DL_(RQ?B@lyHeֽ`($@\m jc(2'蒨@t2?ESY0g(/n @%ֶʋi(Z@];?޴OX.CAc(F? 
|!P{2(Kc¹'"I-@L (& @,%wybI(Z@Gyf(S채@@ZAg6(u:^@!ZOA=&cwBf@:ګAmEo"YHe@{&Ar[@ûG'AGsAfƧJ#AFAQg[k> AqEu07"ApaloAb$AgTa^Aқ&Af >JfY,A#u['A}5]N&>@VW;2(AJO}Vd憘N@XT \(AR fs@e(A`w]ve(A\eNtTe1&d(A)0h30*yyHc(A=<ऌ< i=<.x')qm@r._ ٸ&/Sv@>|&H;A%hg@WԜ N!9%;U@rafv$JH;@q 'YU#nm@ RBFV]"QV"G@a1" "g O$m@vL$-Cݙ@{?!'C p& Co}v@?Yi(Ccc{ &tVAuIA(w>@ٰ+diU"(#R-h@+3]'(/[@IJնwt;(pb$@;,tyb%/M(tb@z[(>@ m,=b(:3@7uGDf( @@s(VC xh(IA:b@:[<Ti( K@9y@Ti(Qӊ@R'i(뾛v@*CElV,$i(t@ aQti(xiv NJ`@OGܳi(U7߀VB3~ų"i(jEn-Xdi(%MRxi(nSk8 r?ai(,1s3i(A@¹i(]D}܄ꍰmi(_ M͐!`4voi(šLJIIV#b]]i( `T,ubi(I6L"i(ߺsdI4^2J?i(>oOJ!58]i(lQuг{i(RLpI'ߜ՘i(mt#u-8d³.Qi(q A}ų .i(Ÿ.uݲhmʳƗi(hscϳQ8i(;5Æ <ճRi(zc@5۳mi(4(&w@\7Qi(tZE%@wj5^i(Z@b 3`Wi(5S$@:wQ]i(?dTc@l'<W.3i(T쎠@2 %i(/"2u@Q16PtTi(Ϋ@%sRŰi(= @}wwe~4i(DNt@tbM٪:i(Е @'4sRi()`@MrʹA h(Y2uΥ@}n2YQh(X@@! =aAd#Ā=@aRG#A/"AzqtŅ@SW"(A*(lׅM9@8CS[(A<{3ٗa@e6f(A3Ϩ\b(A֝)|8P](ANAbw%/l$Z(Adt":[~2Y(Af|v<ߕ cZ(A.cЭ<:y:E^(Az" k`_5a6z'+@"4f>n|'"r@1%'\M]X&?-=@k{S΄Vje+ݦ%36@JF2Q$yş@-q9t#Mm@w,c"pa{@ZI" ˴+  y@;_%-%r @?'xy@|7edf(BO @Vĸ2/()KIx@<}2eQQÄi('y9j^:óPEi( ۳{Ԇiųi(ŊY<~! 
??ȳ2\i( *xP_˳$i(^ q(Tϳ>gi(Eڿ-rdh,Zӳgi(CŋO5IOس7N֣i(]sa@~޳pqi(x/t@i($@Ǽp7oi(U&@`@-nsi((]@}7߱Mi(,u+9@ÝC ~i(8Ք@lSy]i(=@iB:C2_i(>SD@ 1N4i(M@\s7i(s|Wc@U\ɥ`(1i(㣬t@yD=Ti(0<@nȄ Bh(h@i-oAFh('~@TLHu^=g(5ň|@%KRUe(tqI9@(Sķvc(K_.@Zc ;jta(qD+@ nejB_(8g@E"Nv](€!@Z=&Y](hIj@MitT`( 4z@$N d(BQMs@L #i(pSjɃ@BBϴ_JB@d(:٤qHG2*(jXjO=g'}-Ήr@ ~['$PŊW8Wh}@f69A .4"Ěj@:K|$T9(@ʴܥ%xS%@7Cw3&럗@µ$Ћ&t& .S@Ml}Ш8ֿ&UBJ@7H;$;+A&0@,@ꍚs{&.a{.@t&ZW-@}BKk&20V?V@cRDC?%60@"CNi Z8@Pl%GnBAg/WI%AY#И!:q Yb/A'Asґ巃{^a'AزEҌ?'A./Ҩ2k\Yi8'̬'(AԖh!cr !(AiYvS^-c,(A(ŕWĻ?T}-ʟ6(AIY+ooEA(AwS܃#F[M%L(A|2\JzxaJ;W(A +hzhӾ' @;hk _cCa0'@~>@?ƈ2:H7&ܥa]@j ;zٛ%qqV@oܺ5ht%Dx_v @\3{T'`Z(i`@}2,F (e^@ /뱧(pS ADI( NA-Lp)(s@|O!XA({Z+&@U$u&PS(='@;㟅^(@ sq~3d(b@-4Цo)SIg(A7dJ@Hh(%\78@^NEZii(D@xF+zM'i(ɜF@M-_i(j~@V@_i(_I a@)ԣ"F޶i(~ 9# @Dfxq#i(aj@Gi(G[j=i@9"iI i()̲@dN9ܳ i(d^ ׳_i(sa*xYjJЈԳT\i(fq$)bҳWi((Gt-&ϳki(7AxRγQi( z M̳\bi(y@ }*|.˳hji(c*2~qWi,ʳy1-i(~t7K7ɳeAi( hLɳi(7pfdɳRi(|T~!2ʳ_i( 9)|/`˳vέi(lyPͳe˵i(MA uMOϳGi(1-p[fҳi(왛c(ճ p)i(8;6EdٳLi(Z˜W@R]޳0i(ض'o@Ai(8YM{@gi(J0@vpz'i(!c:@NFTG i(@Q0@ۙS6l-i(A @eٰ]ri(? @|wG+dsi(|x@I F;i(C쿻[!@r Uekai(W8@6Fa 3i(0@M@ ,]8Z[i( Y@pj;fh(έl@ԀBojkGh(>-w@fPXIG3g("4¨@+Me(PJxt4@Xַ|c(Ϗ@yw.a(3W@LsQvH<^(]5VD@(JJ](I0@(:,](|7?'@ }AWc`(Ub5%L@1ʪStqf(@&hNi(PG%l m]V(&xc_g'taWJp@Mw&0J T|;@ ι">iQ]f@V Sgu@FVhTro%'eEU@-2M᫴Ja%z~@L7 ώ}%3˓&@ ^Ppv!p%UwR@M 7bfK%O7@wwZ$!xR@88;N"]"IOfZ@xG3="1C 9 Z^ @-R& ko:kޑܔ'/ANq]1"RAO,rY)XkgIm#AU|2+O8n%Ak>}7&A6&BrAΡ~9s,7x"(4~G`@zH;(Y@֚3)nNwM;CO(?_@vK½=[(.*@"es@b(ԔX@I"';ޙf($w+@ V-fh(ڤz!@.K0Ϥ;i( RO@ÛOyd7f~i(ur@L<}ڴWi(JQu;@'7b(])M<@E}q+q6h([@E!V7>#]Ng(;6 9c ϰ6_7(~I==VE~_{J'F A+0e@YGh$}M:9<`'@ܥ!!r UR@EkO&"Rܱ@ta(#'F#@s[LpK$Ò2@ ϣ@{$ZF0@ْ??ЦQcr$V &@(8X&\U?6$>O@k+#8-*@;>:bk`QX"cDJl@a I|!\ќul@:1"2=B4(@(m$ }^'X@KMfY'5dɆ #xG@w6&J8P(2^@hoFt%]MA! 
,!o AA)HW3$A=HrEW ;ZҠ%Axƶrvc?)@&A~= ߋA0,F-'A]܊ xM'AeIɄ> r]'A{j]S>k'Arއֱ+Q`H"(AԉY[0׻zEB(ABY@ļT9{኿8'M>#S@f ͮX'^|@=)NYc 3&p&@?G=H &*@drnTeN,%@طNc#c#@M^2}!!j!J,@Sǟ$,v'Wx/@^i'wz6'5@'oJi(TN3t{THw:(Pf@践5m!^7e (VALE).'ix^_A z:e( $uAj_ix+81(;'u@R2']HY-Y7(sE@__^K(Z@$ dY(6x[@?(wa(|@1e 7=e(#p@2۟h(`ͺf0@uzhi(S|EK@guԐs놁0~i(y3@k0 }i(R~ί@N V04i(d[r@vhi(qv(Aaѥ@?C?Ri(Puo@&%jhi(r^Y7}N@5&W]y@i(@yzI!i(h[<@w i(='@#3i(B/p*e@ze<.i(H}@BF/ai(9, u@!54i(j@lwi(:0:Y@ѥ\ Di(-c%@/iݳ̳i(dTn0/okڳoՆi(uM+*ck ׳~oi(ߤojҹަֳ#i(ճi(*ؔqӳ]Ai(>q5uӳbBi(>'qMiӳ]i(ꡎo6ioaJԳ(,i(D@qj\@ճEi(+.Jc׳'i(O\4:SNKzñڳ)i(hA3@)޳(hڔi(Obb@U2&li('#$r@73l"i(l]@Oapi(.5@0}7^*a#\At@qǯu_#F@M3K~Ԩ##{I@JNkY "(,@WQ {Ľ!/@"+!9r (]|@T%"?9R@n$^Tͷ( 0@CU"&,ވ]~Ӥ@A M#p's=ZBˇ@,ܗh(I@2od85'XHAn{zpr$d"-yAhhi8_xW ~v"AlwqL$An,u{0x=mz%Aƒi xt) oe&A2%+<2$rvD Sx.'AYqRxCtv~ `C'AxYPX'AEw~uCC|f (Aה-dZ =*S7(A>GTdj8 'gmY@X$ 9]'l$@|e[v&*OC@R_M%tWo@'JZ,]>$ H@A`Q_? V#p@ "p > D@fGEb%[*:nC@8ɡ(4WZVb7UZ?a5d(:5@^T)Qp,(Q0/@̊:DH(u{AӥzJW'~AMTI(Q }.AȤ47͹(>}(@ dvxY\3(3x@RD;H(ϭ@D'/}azW(@-+T`(Qgz@mqι<)e(v"@MK 7g(? @CM2ܿ?h(+ln@DZ׵ ^i($Ⱦnz@{ݠ8 =i(X^ @Q δi(A!?@)DW1fi(5f@hW\ i(*ŔХ@p?R1xi(e@+aQi(zA@/i(`͗@GvBtZ"5i($+Q%@Ooճoi(fp 4wԳ0i(On#qT@Գ!lai( oHpJ6?Գvi(nGMm.ճii(m{}hm[A!׳<㨳i("B|`EKnmٳVi(=:#Efܳ< i(]xV=S@KfoF{i(՛Wn@QZ.i(kX}@q7di( ̃ `@x7;i(-}(@bK %/{i(I@;bǛi(D1@MT.?Ui(߯@ĵWlEi(YfL@kϜi(|@S3̵JX*i(9%0*@/t{d퍋h(PS@>ng(:2X@THwZ*ֶȇ)e(@+z@tfȔԤc(*W@ 歸Fjea(t ]@V"g^(F;@s&K_ ](#6z< 1'v@x#ё^$ # Ty SP@.fh%})vwdE>@~ w#bCthE,@S,#"8V ^кm @#G p!{e!ZV@!D4! P!2{g@qL!Z~ 8!;Ԭw]@]^|! wS{n@ p("jO =@j"i2|@XZ##T]-L@M:w$L=EEƀP@'o e%"N.W;@X&6sspz o@=?')]@ Lq*@Vmc'J*f!. 
M@pr!c(JE_ߊCӦ?Tuvb<(qy@QY`žof'vwA:o/-$h$tcȳAh3ipR XYd!A*8Z BH$A޽3F[{#j oē%A5M^AK8*7Tz&A@Aie-"'AJ8Ir Wuن'AшIaV% R}VI'AS* TB (ApWCL"DwV'7NR@l RV';}j@ձ!9&FWe@kS85D2ڕ%€@n?nckM$ <#@1E NZo!Ooz@O($1< B@?4'L j @ֽh(fxvomHtU$/K(MN@q@T*(Fb@̓Z5̦o'A,lU'{kBtAى&(۶"AIB]u(l oz@FDv4Jr I/(~ #@v1i7I D(X&+@]sgRAUT(GI3.&@ln*ȼZZ1^(gs[@3Wp}c(a@/*t▸pVWf(c)Y!@g; 6s[[h(rvg@&: i(34@Z Pvi(N4 @\{1ѣi(hsvAT@s?)bȴJdi(u@f﮶蓴Zvi(s^z@}XƄp i((~^@NTWNi(DN6@+Ejji(#%Uy@O6G잱i(?p7@{)Wxqi(SJ@_is ]i(ͪw61˘@ O"i(Η@:ha i(ħPϐ@6`Idi(HV@5i(@@CY 06i(4pi|@ZpXUSsqi(;nsq@kx–i(m +`@T =i( t?Y-~ݳH-Ui(K1"\<ٳii(HàLjEla:ֳi(fB4r Ѿ1_ӳ_i(lv_B ѳw!i(/##,z!Cϳ6xi(Q|L%4>γi(, }Iͳęi(Y~.ͳci(0]i}t!ͳZi(zb?l`ϳtqi(.FWuiz@ҳIi(&Xk@0L׳Pi(d(PEoZԟ޳tTP^i(w9@t@!Gl0uai(mJ?w@E}ofi(JHrQ@{]Fi(s<@,_?Ei(KqsA@txJi( `@Ͻ7ʴwP{=_i(*p@0#9s =h(0끸@3y̵ g(| U@bz܆f([\?[@Pd0Gd(9 s@ # WYzb(og_Y[@P9WL>h' F@ )/ ֢J'HnB"Q@bɒaݪ\X<&qY1@~dL14nBy% @9 >ݲ#?,@jOK "T$\ B=^@~v C&;.@$n@()%(Q }g(J*tc@ȑנ0@( 6^@*dӿ-t-M:(6@eDŽ^^ӊ(Ro'A5Dpc_\'/.A$(O]JAu^X\㯮(lş8 @ϕtk:/(r y@Ûu94x:%D(igZ_@0DYRBm#lS(09/ap|@\;`9so](c@N wuPc(44t@+ sʸv^f(&rv@udHK%h(P0@r(cG`{h(vm⚟B@1gȯ_i(\bc@2q5 ǔi(K@E@ɬwi(n=O@Β#1;ӿi(R@/:oi( 7>@pk$2|i(JIF@*:hVi(V}@EWRi(2NF,S@'pm72ki("Κ{@P8*4Ii(Tɝ@~\3Ui( ?@ ԥi(l6@;u` i(쪖}@__Eد|i(tgu@?o?i(8@Jo ;Oi(3 x@@Ԇi(Ρj@HjYߦi(ޯI@u޳+i(m݃[Whٳ|i(B#Jkp*\hճki(PАtWѳ(i(zv6γȑi(uJ-`˳"Ҁi(trJ+ ɳJpi(䏚ߢZ>2dz^Bdi(#F{Bų\i({Eų(O]i(BH-݅|ųhi(d=a跄QƳ]ki(N?L  ʳ&|i(YqFy<)ϳ i(UaTduٳ\gi(E?@ s@b^v3i(v2@=nQ9i( E~@zˀ%%;i(WoX@Lv)[0Wi(`or.@D؄駴O^zi(lg7@󱞍ޜ i({#@'oT@h(]u@>QcPf(D)$s@q?) e(ȥX@`u0b(WfM@s(H)Ua(xnP@` hҗa(\@pGSkOe(/0~l&@6÷ai(cU`³zJ T(j!ن#' I=@&eYIw-H@GR}W%6gf(&65@C(1#^ږ@b wu%Gd @1ļ &$1+Y[@`|פ&"@Z*hݵ@b#]4'EPX%;U@[''2yqb> LF˙@b'("t<P7Y(K$ g(w_@>Z_)X9&(,@8E·)'545A^.īI<$nAjw1)06!fL!A12<7U}Z*;iGn1#$Ayr,UIl6%%AIkta +&AxAy@G W<'A8Ie:6S%'A^'D&N`},` (AjgJ%q퀝-?'tO3@ e 9'!b,@Fdm/c&A{@io~ ߲%q@N6ݶ͚"9@@/#0n@zaA'|7@ ek_@ZZ(O#y1'Vy_(sn@]0JEu9(-P+5@KSm#(gT9&@]ʍ({^8AoI(g=A*G|D;\? 
(ɷArI -6N$(O8)Ns@k[D|Ǜi1(ڟT;"@(VD(_Q@Eϲg: S(`z@C\ x#\(RoE @1jźBb( T@Y(E~8f(+E@܈[wg(jU*@p)'Th(θ, @y;/Rε2VJi(*r@LMIQȊɅi(l&~j6@Ke i(B@Mo´[;i(6X@4N1뙴j[i(gA@ZYc|DLi(-W@|5ܤe,xi(J1FǪ@ϰS7Xi(#2@ C͍Zi(Ȃd@$4i(3R@!+%'wl-C}@݈ ˛%XX:a@SsL&9F>6 @2&kePҶ@y7'02Wg=J @Le~v'"0 >_Cb@v&'d uʑ@av#(*@]Dڋ$V(z5Cyc,?gi(j&[o듦ɤ7P(#j v@I j(B' XS5AS!tJW&A&zA_ /q2eE#RA3Oyt mmT"Aޗow f'O %Awa9biK&A\ؕw$)'AdMVP-R('A#܂9+;_h(A`FzJ'-b}@9ג!'\@r& @X%&P>iz@*멂~$0֝eC@ӓ9!M.J!q@Xה%*p%D)a@gx'ܡGDՒZW4Ki(>uq˲V:V(x@={ӻNb5(tb=@{ȬNb(0H8@xm"f(יA?@`f˨g((&@>~dh(zM@mқ)¶6i(CZ@/Ygwi(~mu@P|i(E(@,϶ԴJkDi('^@$,WhV6i(C!Գ@O ֊9i(@Œ8@ri(F䯐o@?]ti(.cZ@Kxq{Lݛ!ii(td]#@[^D<QJi(QR@jʃ.foi(@@!Hy!Qk~]i(x3Ju}@mSޏi( r,l@m i()7ҽ@&Iu,Api(9@Y׈@vi(I`}@BSC,Pnti(<&s@,b:>i(2^K>V@ |ݳGvi(H)KZvXv' ׳B( i(պqlг^mi(5|Ƭo$˳i(ڂ޼{ųLWi(;I6Xw&- W#i(yV5xJռi(Qݳَ7丳 i( &<Ư{N`Lxi(Sxd&m1\)Hi(3?%l'i(ƩLhYiF$i(^wZLi(՘v?ðɭi(Ns4ɍL@i(S i(m 'SqRҳ9"i(\O@U ǤBoi(c=@4InTXi(L{j@+1$\Rʫi(L8tx@6vw Cbi(S5{'@g_c2͇h(% @B->mѵRg(U ;~@Y㜴X^mf( w@5g0d(@k(SCd(?@Əχ7GWOe(Գ@ & i(+; @%`,$xe()rO?uHC2(,WoE.'6D#O ~lY@5f&0lt;늹@tGb%eۍ!C@Lp w$:bd#@iV߇$`y͔jJƆ@~F$-O4}u@'u<$罪2LR@Җ%6cͩ&\@Jl%4Rh߯B@_%s(N@`hC|1&tzWC @A& e*I(@R&?E@ں%S'jŸQy \(ɩ@"O'ȧ̼C+]4@o'Գh -;y@f,( 2ZhJPӛdT(?D ﭖ܈xh(n*Deg@ Hb( 1&@}G0-@&)(r[:@-&à(^ke'Bt} AQhԍH%idƿA<m\Ur! 
<; AacY]nYib&S5$Ac'{ 89gB%Ak `eKPB+p &Ah2w [ Y#Q'Ať({F]'ABtI?cL^v'#_@S$@kr[|& ~W@Lw 83%ĩ>@T9LM#Ai@FNE#cۖa:~@@H_Mc 'nz iME3q@ĘJ(s8d`|/{h(Աt@&Ԛ'j3P(܉D@PӴ~0]5(k"@3:Nf^!(5}@s|Ư(]B@p0K'(Œ@*3%Dqڇ(MŪa@D/Tɪc7((=-@yQXgQw7(\3@7춬@U,hG(V1m@oꌅ T( /=B@hAXpӼQ'H\(9@TŅb(b@^8e(XoRY@-ޢg(.5i@Sjz=Rh(<0}@f9&i(rdi@Ux*ki( 6$Gv@ nǃi(c޸@b?i(@p궴~8i(% "@M@T]L•,i(# l\@cF\{m/^)i(J9կ@aJeR\i(hƗ@MR.F2i(#`ͧ@5ByAi(=lX@)@2+i((@ ̴$%i(uAl@qrGbbi(Y~@$k b+Bi(?w@) 6#wi(%/u@!T7i('`ρ@HH.dyi(3xs@0s˥5i(tYmvS@XQK۳qJi(n}avdYӳSSi(Ӥwug%c4̳1i(26MUƳji(Ѳ@cSc*i(7=6%9i(oy~JS$i(v>0{.i(IJRŤ"yCi(L-8\ >yi(Jetޗ%ӆS/i(+D6))vgϡki(pZ[V $ i(<(ęΠV`i(nn`I Gi(?dq=Ui(  YPд3Ci()摣iȿӳܙHHi($:@#ayPri(b+#1@S33m}i(}j#@تǜi(4\䶿@ETݑmi(~}z @ƈû֥[h(5O@ɒʇg9!Z<=g(X@?̑|e(]c V@~^ԩcB re(dڗ6@$nf(AC}G@d?;Ii(~N@j!Rf\a(Hhtim Vج硽_ (LG3CBՇQ9n'To ~t@&Ix&=˧OÆ@ j%q9fa0q'@xb-D%Y]kjQ9 @>'%׮t H_9֊@eE>%Xm8os@ڥUMt%J#L(@E%][A-?Ⱥ@p-R& 0b䵼@idY&L/Jo`@;`&Ly' 6ŵ@vʳ&LEE@ G5nE'{. {s@|c'޴ 冢@FxHl'Dyy?5@9|q(A34F<[0J;\4( b7rY !՚$CT(lBoKd{;Sf(4َx馎i>'&_:h(pe0@Soo%Ѩ)P(lh)t@=v]'L.XA:5&6WA% O,l#h2 `AUx9$ = ٛ"AԩDV.^,S6%AA3c4#Έjs|&A-m /( .d'A^$G%'A2vҮnS'}@O&:&Y<@_uIS}AnL(󿂫@*ܽ fߞ8( qČ@6)Zx((fg@r‘핤 ( ~_Z@P[n[`(v@P#2N$(@ j8h/(9%@x14Dq<( >s[@SZGVhz1J(@HګiU(4z@Qlv- 1](i@Bb(v@uRe(g@@e7i3 g(y@)x ~sh($*@3&C- i(?ER@ ܁"/bi(S fN*@x*"i(\U@qX|:0rWi(jC=@~ȥx4i(3^@)Pmi( +c@![|;xi(wۊ@`i(>P@F4Ui(+×Ozi(OĜ󇛗= hcxi(Hෳ-Fa i(E4?QftF%[( <6(~}č(z,=EĜ'u0v6k' Ix6g%@Ë{&ע8O0߲@ v@ &~SL@n{%aL|m@}%%\46@ײ19%9CWn@h@x &drJG@ANJ&&,eY@`,Fi&-ᅽ8U?@&?Ӱ}/n 8@V\ '"@#!@_ pH'񤍹Y w@7d'-Ӟ9 V&ȣ@z'S;Y@j' ; Ye-T7@3X,(:|O( h~);(,dס$?%5T(հ`\5糥Mb+od(^J%D cti( =,"@dg La(v׈{[@*rUꖹ]d9(A@i G\rcV'QYQAv;,hYq &j|xAo/yȒV"W An $Aϣ C]-@^$&A2˂b 3 Cd8'ABLrfEg'Afk;lx [y'_[@AKGl&6@2a9x"h$rΉ@3: 7#50 b\<@ '. 
DBy1@nOE( Qm})i('p5ғ@^3=dE i^^(( @mg.Ҡ5L(BY@t^QG<(;f@@6{Y1(՘&@x`Bj+(|z@R]j8P*(mܫ@7ƀ;z.(< @cU,9#Jd7(}fC_@Di Y~sB(M`@Kyտ,M(VOx@ѫˊVW(Q7@``4|^(Ex@{P)6c(Ćn@êɸU49f(պ~@9@-ܲg(^2;m@2Vaަh(XHH7@NUi(\?t[@6^i(?N@++pi(] @lkuGO~i(8@oׯ)̘1i(X%@,"?i(^@8>>Gi( ~@c=h^,Vi(mj|@,TiטCi(Y>O@9]AQL i("2CB@+,07E7'i('^@snF!j_i(}@/XY,Mi(ٜ@.?ѽۤi(zJ@t@i( @JՉ@&kAi(sfMB@``#{i('$[[m@Uٳ!i(IwTB?Trгi(-qqD!dzD/.+g(Iz@ ÂZ'in{ Rj @'8IN1 r@M1db'ީ-"S @?RЛ']1yI炸`@b*(F>աA'NZ%F((AP?Hpgȏ>VuB($5l7;؅9?V(Ϯ}*B#.EFm*c(@m+Kqpii(B+CXԳ"vU g(o@Kn:kW(9Ou@}CIkB(ysKG@Ù _'^2 Ac)Qbb%o2AQqts C~d D"AzlYGa7%AG@Z%Aq9I/=ʽ&Av)R |%*G"X'Aޖ* E`'H<'@!XN>,^@s-\%q@pW' @V &mҗFVDc(HT; j'aA}_Lg(?vY@iLB]C E[(~)6F@C)璼s $N(ѤՑ{S@7/B(+=@傸%E:(™@O @#h^ 6(cȳH@j@͝5(m%@#]h9(J@3z|?iv?(ȋ݈@ icH(CL @InZ?Ҿ%HQ(Ej@PԯIA6H%Y(yi@z H_(c8@,64c(iq!($@^օy[f(2@gfwΉg(I`S@͊ltܛ$h(:Mx@@b ìi(7K]@7́vr^`i(5`@T#ٴ;i(K(@,ˡi(aW׺@o-q>9Ufi(ADWZ@)j3H%i(lYy@ ʙysi(@{7qab{i("i$@t[LP-i(|@@c9ri(-o" @hˉe(i(@w/T]i(>v@ś:8 -wMi( i@i(rȷː@)HeѰ~i()^L%@5* f|i(jz@ΰڳ`,i(yGa@\ гwUi(RW`Ƴi(xnxNؽ@i(IʃH񾇃v[jXi($ U(Ei(r=h,OY]i(II>LGӞXڼi(+$Fq@__՗}i(eyP 8 ޚFg(sؖ0>?S(`װ2JB8 ( zG͆dl(k' S V@u'ߗl/?riuH@I&KIMO8 @2Ļ& b "k}[@ &޲NK`uZ@@z&'D%UN,L@SM?'zM%y@: E)'t3rc7Ѯ@cQP'Uj չ;a@2~v'@S# Vr@9'CyϓrܓȠ@^Gpp'e@dM;@͟f'-N Aˆ@?(C *]XP@CT,(Al}e;Jb;|5(kcC:YW`DI(љ8G_>oyGX(b[v%!b(Vt@_xv|-|t+h(нk Lx_Ci(ar,i@T'A ?c(6,2h@0VnǸ[iL(j{@ЈC}sЍ($¯AH 4l&*A4bE-y%#&ShMAi_i9(DP#AT"M"s0{*7lXp&Aw^ʘ^_5'A/bGo5"I$'gd@ 5oIc$Ŭ7@O]#C\p#d>.@ғ='3R-< 1d8a:v5"'N(͂gp酧si(c v`yʱI$Ie(A @=| `XI.[[(#>m"(Y@p38FQ(Bpk@$dn6J(W\@W8fiTZD(oBP@t\ro=A(nʃ0.@_xHܿtQ@(H>@W+\dC(/S@;͋6H(xck@Jr]7TiO(k~(@ ;ncV(W_@QE<jߗ\(X:!Z@1VĺEda(k$?@|&m]Nd(b>@Wrd"$8@f(,cS@\!"n h(3D@ϑZ2oh(xd%@ǵЮK ,i(8@{H[@vii(ƫ@_%n i( A#B^־@(+JӴ_' i(̩@*tǼi(2L@ѯԸi(r;qݲ@zf hB_ſi(\>d-@ͧPCʇi(,n@Q;s[ni(<6 tu@{')Wmi()@!VBi(]q(@̤O(Ti(á)@cipi(m@gçYvi(g hȌ@N1V?Ui(?}|@0w޽ճvi(Gd߾q@b !˳a@i(_3"a.\}4 i(Jׄ-qت=ⷳ Fi(Az i(֊B@L)'S¦(3i('f8`ОX|i(p;- Qe 1 i(3AԏTi(c%Tљ+|wi(7"ji(-֐E1?z'yСi( B=ti(u@"N!;m}i(6y瑤cg@Kji(AD+VTassi(+ ̣k\i(C!$Ƨ^꫼Y i(*L>X9 i(`n5[[i(A6$brZc@i('sPwi(W'|㍳Hi(5{ɯui(M\!M@<83^"i(:~L@ۥg9mi(Euu#@FglӤ|i(gVV@nVi( 
v~@h֋4-)i(bWĠ@VFyKi(ZBDA}S"f({op%!'XrKoQ(Fn0#/EZ{S-~(ЅFG4؀yhݫ'kܸ 1@_Ure'ſr 'zB@KԔ5'`N;|KbNŭ@F('$њN A@RU 3'srjx@F90H'BJh\ v]_c@vs]c'O,Z ^o2@wx'bT BZB@Iܢ'},@kM'=x/V$Eۙ@$Qw3'0:Md"T.@ +E|'- W츁@~p(_:ɄW$-9nwT(4 <0d]vo/(+| 0cRNA(k9?M|?#8ry4O(id(rk}@Alݷ}طw](Hhl@҆66=*W( @hϻ)AQ(b@lW4CN( 7 mI@ LXTYAL(:@@v+L(Uޘ]@o#]s$ N(a[@JWlTQ(#@V+'MV(&^@sSz+D[("q@d~cU_(xq @PcϹIc(Y@Ut^+Ze(E.@2zDK3-cg(oh@:^e6J]h(@y@bk] г ch(P@3n爵Gi(E`p17@xb(}*:{i(y<5@vi qʚi(8-wm@mϫ =i(1ӷ@>₴Y;i(,@˰FdbiY*i(~Sض*@T]1TG;(i(bc77@\['0+%Si(A@k Fi(I$佤@HKR 6qIi('MԴ@PMFݒi((}@‰9DGi(KR7@v_߳ׄi(NӴ@E@ҳz`i(U͋G@hdzȒBi(w} w@ UH(^6i(hZ(S@`%8f2Ji([ʓph,J$ɨĐ0i(y|o~石:Vi(O_m'z\+i(ۆ$)C*Ji(O/oWytۇo}i(p$|fNIi(*u1-|"* UyPi( i0UvXr;p)i(g P<{kŢ 5i(d0!naId)i(o>G^ i(OD%+mNtWqi(I^&%CQ; i(%rOrKąi(hs=AEbi(|8ҪA4ni(LZɩ>vb?~Ui(+?i(CRD!**i((iFd5O`2]i(Mai(Ժ.~23i( ɨBt\ i(Uf@u3~dYi( Ad@T"Ni(HxOT@E;"[5ei(H7/y@k2%Pi( T,l@̧ƌ?'۲i( IBI!U]e( 4-tݯ}S(l{k]d%( R(H^'F&2aC' 7' р@5&'~sLS@a@n'77 v@]'E+G% e@%'xYbb M9@+'/ijxr5]f`@ uI״'HW&@"'`/3W'T@:'G4:@Z'F] 0@3`:(R^sF-ԏ[@T(-:WbuHK'"(\dBodbnx2(0 jڗb{+ @(9#QF 6&4L(:N5VH#wV(/D#r%j ](GB_ΩTc(%]Ŋ{Yug(8;0~PAT4Oi(T΀5i(N]@Q埴n>gf(t/%@U~`\(<}@\'\Vgr:( qh@{j W ;'̴cIAmɉQQ;ZV%+cSA&دK ;݊ e["A4>69;d&A)W1#.,o@(cca%ͺٛ@33#5]([C]ߦ_9P\(FW5ݣ$ {i(f!+w5hh(XD@qd;&z#d(T"b@agŁv}B`(:@_AT](LB@ j9Y/Z(a3@m3T^W(vi{@Mdj;V(0zh@BӻL4PV(Szg@?*ػn5W(m<+q@р73ώ3Z(h@&sX}](݈@a;`(j~O@#!s~c(/t@W[P`e(l)+bt@ћ7 X3g( 5l@ֿȶrh(V6e@Bch(*ip|@gu툵__1i(,T<@ Vpi(;O,@,M˴ti('f@qf`Ni(WU[K@J`:^ݼi(-?@@VGa,i(E9=j.oi(GTtY=e(,b/>i TM}X(txu/io /n:(PO ݂YeB(!C<Q)s8鴶'fX2Im~@C{]'V;|)ڃ:@b~'aQ'nX@,'j]_9u@,('\FO|N@ 9'AepSs@ޑ()yc(/7@R(er3_%XqQ+(Pc3v=;c'(wÊIp1h82(.<(͸ՄRD F(Q8mEyJ N("n.=!{|!V([sn--𴼥c\\(EƥhDj+q% Qa(q:ANlYԇ e(DvN?pyMg(EzjdwvT|@&ݻCߐv7(8j%|@uA۩^4'B A,^-&ʠ$c!AxFzaN/eX$A?eޯ;:R%vTe"@؜a'o;; &NEe}ue9U(=BIQrZ\Rg(ʊÈ^i(EN@hV4Z$*h(ގB@lҐf($ @y8nd(`|@}[b(JN3&I@ulPͥa(@PRX@c۸{`(ν#P@4luӌ6G_(Z@+Ld|_(@Ȳe`(F{k,@GՐ9a(&—@pg~eGk0c(Ӱ{}@0cڗ6d(R*k@񤌩ʷmDO+8f(' ы@K$?^hg(g@XeTGh(P}@6 ;_bh(_@=Bf >i(hJq:@Y'Ryi(q#`e@ %i(1C@\2i$i(9 ૢ@UXa8>i(Lh㖲@NԳi(yfЮ@rhd~Mi,i(Z˵t@kݳi(7Gc@{)ʳ2#i(F ^@L#ei(sd@vd 
y,i(U^@fa4mmi((9@VFUi(a9]@B? d]ni(2Gu@EA|~Di(⹋6t@='u2i(YYZDX@sWm)(i(E}-G^eE7!i(oW|tB\x^EA]i(73(SQZW)o(i(TIRM UQt\EUi(A3p Ri(.ZD4[W8i(\1@r05i(ksΧβ M8Zi(G"쁬G'Ϸ8f(23QnaJb_(L9-#P(&1 N,<(8}olj8wRDA-(*.O̘be7u%(/ש5$(ryH~G*'(0L|rG5+(9ӱo`)oZ0(iJ[Y{U>]=<6(ǧ8ݠ@gD<(ژ첔WuA(nmTjG(+s vfkKM(,AjQȸ]5R(LЃI{KW(I& bU[V>a[( o)0֧ʻ0_/_(k-Wh?(jab(I3ڄI0>ݗd(W_Q(xCf5}f(-ƛ&Oջ]h( MxY1B⢿lRi(yC+3} i(;#mlL1Ri(􀭱@"6 *Wh(N`BH@˵e(4IÑ@LZqb(\(eE@ØFQ;(W+x@ˇv1cڧ'=TA!_*c:0y5>:0y5>msB_ext_peTW; <;;ը;bq;J;;T;;;F;&;;^7;kٺ;4x;;;6;M;8;;";b;4;;T;;.;;^1;@;; ;L;;\;%;5;};4;n;;2;x:;8;;;;E֤; ;P;ק;V ;4y;f;9RQ;{;; $; ;S:­:D:(:LS9LS(D­S麭 ${;9RQf4yV קP E֤8x:2Ļnǻ4ɻ}˻5̻%λ\ϻϻLϻ лϻ@ϻ^1ϻλ.ͻ̻T˻ʻ4ɻbȻ"ǻŻ8ĻM»64xkٺ^7&FTJbq;ը ;^[;\;;jl;4T;:;3 ;ex;:K:\E4:p9p\E4KϺex3 :4Tjl\^[>寻0[K»tǻc̻F6л\|ӻk?ֻػ aڻwۻkܻݻ޻'޻T޻ǭݻ;ݻ;_ܻuۻfڻ6ٻ׻ֻջrӻѻ лdλҞ̻ʻȻD"ǻDŻdûo߻"$xJteҰ^Bũͫ;O;r;sM;-;;;`;;ڼ;־;~;;;-;s;;=;];;;;';r; 9;;;;;";;W;;Jl;o;l;;W;*;X;޼; ;.;;T;x;;k;;M߹;;7+; ; 0;t;F{;b`b;IhG;!+; ;b:#:A:696A#b޺ !+IhGb`bF{t 0 7+M߹kǻͻxһTֻڻ.ݻ ߻޼ỜX*WloJlW㻪"Ỡ໙߻ݻۻ 9ڻrػ'ֻԻһл]λ=̻ʻsȻ-ƻĻ»~־ڼ⺻`-sMrOͫSЭ;!;;;ʃ;;m;Ѝ;✽;M;;;Y ;/;KW;;g;;; %;G;Ac;w;́;;Sn;J;;ƿ;GP;;;!;} ;;));P;;&;;I;f;;E;"c;~;; ;F; ;S;劼;; ;NԞ;;0y;q;adU;FQ7;*;"::t]O:ޛ9ޛt]O嫺"*FQ7adUq0yNԞ 劼SĻ ˻Fһ ׻ܻ~"c仛EfI컎;&P))} !컒GPƿJSnỡ߻́ݻwۻAcٻG׻ %ջһлgλ̻KWʻ/ȻY ƻûM✽Ѝmʃ!SЭ˯;-±;9;ŵ;@ӷ;;7;';Q;ނ;=;;{:;Y;;p;q;;;to;';>;Z;;8;;;j&;0;=;;)>;2;;; ;";@N;p;ҋ; ; ;;_;y;';Wq;;[;^G; ;0ؾ;a;b;;dُ;i;d;D;E#;{i;:uB_:tC9tCuB_{iE#Ddidُba0ؾ Ȼ^Gл[׻ݻWq'y_  ҋp@N" 2)>=0j&8Z>޻'ۻtoٻ׻ԻqһpлͻY˻{:ɻƻ=Ļނ»Q'7蹻@ӷŵ9-±˯;ɳ;ܵ;N;5;G;|;T;%;N;ң; ;e;;5B;*;4;;W7;һ;@;;E; ;J7;;;cQ;>;x;g;ן;^;N;L; l;"<.h<<2f<<;N;;;P;;p;$;u;;@&;9;;';3;j$;Ev;XS;/;} ;:L q:292L qǺ} /XSEvj$3'9@&˻Իuܻ$pPN2f.h" lLN^ןgx>cQJ7 绽E廦@һݻW7ۻػ4ֻ*ӻ5Bѻλe̻ ʻңǻNŻ%ûT|G5Nܵɳ;)ȵ;9; ;Z;ݞ;;D;P;a;Յ;x;;;D; O;;Ġ;R;;e;z;S5;;;gQ;;ŏ;;;A;\<<ٙ<y<`;< <V<֣<7<;;:4:94غ>dRΪTԳPͻػo!⻅}`g6Vd0qC7֣V `;yٙ\AŏgQS5ze仮R߻Ġܻٻ O׻DԻһϻxͻՅʻaȻPŻDûݞZ 9ﷻ)ȵO;*;;>;;4;]S;;B;N;^;;ԥ;4Y;;;k;;uh;gP;G>;0;&;;;;;;4\gPuh㻀kݻڻػ4Yջԥһϻ^ͻNʻBȻŻ]Sû4>*O^c;;T;R;;.;;N;;w;;z;,;;3;0;js;c;g_;Wf;w;*;r;;a;@N;o;;< <<F<î;S;^;`;0;:@:9Ϲ@0`^S>îл<߻@Nz ' 2(5|{n s % k PF 
o@Nar*wWfg_ớc޻jsۻ0ػ3ջһ,л;zͻwʻN;ȻŻ.ûRT^c.;9;';[; ;{d;;w;fY;;;;!;;";;9;:;[};;(;;;+{;;<(;;$;|;ޠ;;;;;;I;>;w;v;;;1;;T<@軐I廋;޻ۻػޠջ|һ$ϻ̻>ʻҌǻĻU» п@X);;;`B;W;4;\p;0K;7;g4;C;d;r;;>8;Ф;$;;^;;;I;[<)]o <+"<#'<'< (<'<'<%%<>#<: #%%'' ('B>'U&b(%#+">o }fiEdw kxf)][I^ﻴ$Ф>8r޻dۻCػg4ջ7һ0Kϻ\p̻4ɻWƻ`BĻnC;-;p;a;;K;t;;;; ;;k;;V;;;[;4;#;);k<<<;< :<9<7<5<1<+<7%<<,<{;:':8X7{5d31.4,b)F&S$Y!r SnV [īuﻎ_θ)໇Pݻڻֻӻpлͻ ʻǻ8)Żx»׿[; ;y;4;;A;;x;;YN;;7;;c;P;7;:;iZ;;x<4<<\<9 S S oj5C 9 \4xiZ:7Pc滪7߻YNܻػxջһAϻ̻4ɻyƻ Ļ[@;p;^e;M;,J;y[;];;;};;ؤ;^;3;%;4;b;;<}5B<Fx:6s2B/+x,($q!6<6n  {"rRkﻫ^:řaݻٻ{Rֻӻϻ̻ɻǻ.Ļ|;];S;O_;;; ;t;;t;T;7-;%;<;u;;OQ;Y;E;);');;z;O;bD;fY;};+;k;$<'<+<۩/<3<7<W<<;@T<_Y<9^<'c< i<3n<|6s<\w< [|<<7T`6OKJyE;@W<73۩/+'>$ QIGb * ZYpk+}fYbDOz仧')ݻ)ٻEֻ>һ9ϻ̻4ɻeƻ;;;;4[;;/;;q;R?;,;:;k;;^;;;T;<@<-E< xJ;< <>î;<:<>î >;xaE;B e1;56\!2-)%"Mp.jT\G scYUF@=iܻٻԢջ'Pһϻk˻GȻ;;(;Do;;@L;;;tq;7h;l;˾;%";g;b;"<V<*<\<]Vyn&)·䕃3w"o#hXax$[TN&HIC>95/0E,(("9$,t x` =@  nJ"bg%"˾l7htq廉ݻ@LڻֻDoӻ(л̻ɻy;!;\!;Kt; ;l;;;;;a;H;o;e;<DT@9G4x/+&%"61sIY = +%h@^nxur޻8lڻDֻ&oӻл)̻a;7;~;t;E ;;K;;;;E;;;}<DT@9G4x/+&%"61sIY = +%h@^nxur޻8lڻDֻ&oӻл)̻E;x;P;,;;R;;;;b;I ;V<*<\<]Vyn&)·䕃3w"o#hXax$[TN&HIC>95/0E,(("9$,t x` =@  nJ"bg%"˾l7htq廉ݻ@LڻֻDoӻ(л̻ɻG;k;;'P;Ԣ;;i;=;@;F;U;;Y;;c;s<<<;<@<-E< xJ;< <>î;<:<>î >;xaE;B e1;56\!2-)%"Mp.jT\G scYUF@=iܻٻԢջ'Pһϻk˻GȻ;;;;4[;;/;;q;R?;,;:;k;;^;;;T;E;);');;z;O;bD;fY;};+;k;$<'<+<۩/<3<7<W<<;@T<_Y<9^<'c< i<3n<|6s<\w< [|<<7T`6OKJyE;@W<73۩/+'>$ QIGb * ZYpk+}fYbDOz仧')ݻ)ٻEֻ>һ9ϻ̻4ɻeƻ|;];S;O_;;; ;t;;t;T;7-;%;<;u;;OQ;Y<}5B<Fx:6s2B/+x,($q!6<6n  {"rRkﻫ^:řaݻٻ{Rֻӻϻ̻ɻǻ.Ļ@;p;^e;M;,J;y[;];;;};;ؤ;^;3;%;4;b;;S S oj5C 9 \4xiZ:7Pc滪7߻YNܻػxջһAϻ̻4ɻyƻ Ļ[׿;x;8);; ;;p;;;;P;;);θ;_;;u;;;;ī<<<[< ;< :<9<7<5<1<+<7%<<,<{;:':8X7{5d31.4,b)F&S$Y!r SnV [īuﻎ_θ)໇Pݻڻֻӻpлͻ ʻǻ8)Żx»׿nC;-;p;a;;K;t;;;; ;;k;;V;;;[;4;#;);k<<<8;Ф;$;;^;;;I;[<)]o <+"<#'<'< (<'<'<%%<>#<: #%%'' ('B>'U&b(%#+">o }fiEdw kxf)][I^ﻴ$Ф>8r޻dۻCػg4ջ7һ0Kϻ\p̻4ɻWƻ`BĻ);@X; п;U;;Ҍ;>;;$;|;ޠ;;;;;;I;>;w;v;;;1;;T<@軐I廋;޻ۻػޠջ|һ$ϻ̻>ʻҌǻĻU» п@X).;9;';[; ;{d;;w;fY;;;;!;;";;9;:;[};;(;;;+{;;<(î;S;^;`;0;:@:9Ϲ@0`^S>îл<߻@Nz ' 2(5|{n s 
% k PF o@Nar*wWfg_ớc޻jsۻ0ػ3ջһ,л;zͻwʻN;ȻŻ.ûRT^cO;*;;>;;4;]S;;B;N;^;;ԥ;4Y;;;k;;uh;gP;G>;0;&;;;;;;4\gPuh㻀kݻڻػ4Yջԥһϻ^ͻNʻBȻŻ]Sû4>*O;)ȵ;9; ;Z;ݞ;;D;P;a;Յ;x;;;D; O;;Ġ;R;;e;z;S5;;;gQ;;ŏ;;;A;\<<ٙ<y<`;< <V<֣<7<;;:4:94غ>dRΪTԳPͻػo!⻅}`g6Vd0qC7֣V `;yٙ\AŏgQS5ze仮R߻Ġܻٻ O׻DԻһϻxͻՅʻaȻPŻDûݞZ 9ﷻ)ȵ;ɳ;ܵ;N;5;G;|;T;%;N;ң; ;e;;5B;*;4;;W7;һ;@;;E; ;J7;;;cQ;>;x;g;ן;^;N;L; l;"<.h<<2f<<;N;;;P;;p;$;u;;@&;9;;';3;j$;Ev;XS;/;} ;:L q:292L qǺ} /XSEvj$3'9@&˻Իuܻ$pPN2f.h" lLN^ןgx>cQJ7 绽E廦@һݻW7ۻػ4ֻ*ӻ5Bѻλe̻ ʻңǻNŻ%ûT|G5Nܵɳ˯;-±;9;ŵ;@ӷ;;7;';Q;ނ;=;;{:;Y;;p;q;;;to;';>;Z;;8;;;j&;0;=;;)>;2;;; ;";@N;p;ҋ; ; ;;_;y;';Wq;;[;^G; ;0ؾ;a;b;;dُ;i;d;D;E#;{i;:uB_:tC9tCuB_{iE#Ddidُba0ؾ Ȼ^Gл[׻ݻWq'y_  ҋp@N" 2)>=0j&8Z>޻'ۻtoٻ׻ԻqһpлͻY˻{:ɻƻ=Ļނ»Q'7蹻@ӷŵ9-±˯SЭ;!;;;ʃ;;m;Ѝ;✽;M;;;Y ;/;KW;;g;;; %;G;Ac;w;́;;Sn;J;;ƿ;GP;;;!;} ;;));P;;&;;I;f;;E;"c;~;; ;F; ;S;劼;; ;NԞ;;0y;q;adU;FQ7;*;"::t]O:ޛ9ޛt]O嫺"*FQ7adUq0yNԞ 劼SĻ ˻Fһ ׻ܻ~"c仛EfI컎;&P))} !컒GPƿJSnỡ߻́ݻwۻAcٻG׻ %ջһлgλ̻KWʻ/ȻY ƻûM✽Ѝmʃ!SЭͫ;O;r;sM;-;;;`;;ڼ;־;~;;;-;s;;=;];;;;';r; 9;;;;;";;W;;Jl;o;l;;W;*;X;޼; ;.;;T;x;;k;;M߹;;7+; ; 0;t;F{;b`b;IhG;!+; ;b:#:A:696A#b޺ !+IhGb`bF{t 0 7+M߹kǻͻxһTֻڻ.ݻ ߻޼ỜX*WloJlW㻪"Ỡ໙߻ݻۻ 9ڻrػ'ֻԻһл]λ=̻ʻsȻ-ƻĻ»~־ڼ⺻`-sMrOͫũ;;B;^;Ұ;e;t;xJ;"$;;o߻;;;;d;D;D";;;Ҟ;d; ;;r;;;;6;f;u;;_;;;ǭ;T;';;;k;w; a;;k?;\|;F6;c;t;;[K;0;;;>;^[;\;;jl;4T;:;3 ;ex;:K:\E4:p9p\E4KϺex3 :4Tjl\^[>寻0[K»tǻc̻F6л\|ӻk?ֻػ aڻwۻkܻݻ޻'޻T޻ǭݻ;ݻ;_ܻuۻfڻ6ٻ׻ֻջrӻѻ лdλҞ̻ʻȻD"ǻDŻdûo߻"$xJteҰ^Bũ&;`; ;;.s;+;;F;b;'$;;^;k;-;;©;`b;;i;f;;;;R;;;,;];t;xk;7@;;p;;D;;`;;%;,y;f;&;L;_;);a;y;;mٳ;w;+; ;˖;ݍ;D;t;ID^;mLG;A6/;;`7: ::(: aa9 aa( º`7A6/mLGID^tDݍ˖ +wmٳyaû)ƻ_ʻLͻ&ϻfѻ,yӻ%Իջ`ֻֻDֻֻpֻջ7@ջxkԻtӻ]һ,ѻ;ϻRλͻ˻ʻfȻiƻŻ`bû©-k^渻'$bF屻+.s `&W; <;;ը;bq;J;;T;;;F;&;;^7;kٺ;4x;;;6;M;8;;";b;4;;T;;.;;^1;@;; ;L;;\;%;5;};4;n;;2;x:;8;;;;E֤; ;P;ק;V ;4y;f;9RQ;{;; $; ;S:­:D:(:LS9LS(D­S麭 ${;9RQf4yV קP E֤8x:2Ļnǻ4ɻ}˻5̻%λ\ϻϻLϻ лϻ@ϻ^1ϻλ.ͻ̻T˻ʻ4ɻbȻ"ǻŻ8ĻM»64xkٺ^7&FTJbq;ը ;>ADōGOIK{MNOOOON{MKOIōGDA>>;73]/+~'A#ri&K k ^2!ǭݻֻ лɻBû-L#Eڦ֜"抻!ц₻2~"vo^iEbJ\&;V^dPiJVEM@ ;]/6j|1g,(nX$!GP%۩).,. 2P7ɻLϻDֻT޻Jl} Nٙ]% 9 Y!p9&*B/37{ <@CbGĞJ}MP\KRPT)pUZV|V|VZV)pUPT\KRP}MĞJbGC@{ <73B/*p9&Y! 
9% ]ٙN} JlT޻DֻLϻA>ɻ»d%ڶZ;KE zs܃l9e-_ X7RL\GA(45)9\[>CF\I/O$>U[bho`Xw&~7w%m\y٩^2}» Ȼϻֻ'޻oLys frS$7).s27W<]@-E&HI= MPSsVY{[b\B]^^B]b\{[YsVSP= M&HI-E]@W<7s2.7)S$rfs yLo'޻ֻϻ Ȼ»2}^y٩\m%7w~&`Xwohb[$>U/OF\IC\[>5)9>(4TV/j*7&!7V" '+$o0Mg5U:?ERK\QW`0^dTl7zs+{YŅ\cPs͢a41g>^)ǻ\ϻ`ֻ޻l)) l`;0 /}A!F&9,u16;;@E xJN*SWƔZɱ][`̈b1dPeOeOePe1d̈b[`ɱ]ƔZW*SN xJE;@;6u19,F&A!}/ 0`; l ))l޻`ֻ\ϻ)ǻ^g>41a͢sP\cYŅ+{7zsTld`0^W\QRKE?U:Mg5$o0+ '"V7W:#k'k,s16;sA9Go vE&4,72D8R>BYDKJPUx$[J<`0dlEim_psu v7w7w vus_pmlEi0dJ<`x$[UPKJBYDR>D8724,vE&>o '#p V.h@N;&Wkܻ%Ի5̻eŻ9`+ͻRGKC-w5{zrjmc[\yUOHB](= 7`2XM-m(S#E=B<@4ϴw- I$$E(|-03J8O>;DiJPW ^e^mu)~iu둻3o󐼻û}˻,yӻwۻ*从p֣w 7i+"0a(. (5;}5BH`6O[U[XaMglRqUu=yt@|~ss~t@|=yUuRqlMgXa[[U`6OH}5B; (5.0a(+"i7w ֣p*wۻ,yӻ}˻û󐼻o򮻋3둻iu)~u^me ^WPiJ;DO>J803|-E(I$$ -wϴ4 YcCg$Y)b.3|9X?vEKIRJY`]}hpxJրpDS3 +*Y0n4ɻfѻ aڻXIҋ2f7;Q IxV#[*18o?FL1M>T2[xa#htnt`y~aoa^Z?Z?a^oa~`yttn#hxa2[>TL1MFo?81[*#xVI;Q 72fҋI컜X aڻfѻ4ɻ0nY*+ 3SDpJրxp]}h`JYIRKvEX?|93b.Y)g$CcY _+ =TjP$;).k42:=@ۏF,MT Y[bj@s|>l|ΑeErİm򷻤~nǻ&ϻػ޼f 4p {X]b(%,,d3:yUBIQ_Y`wh"oלv}Wt2i\ߊ ` `\ߊit2W}לv"owh`_YQIyUB:d3,,b(%X]{4p  f޼ỡػ&ϻnǻ~mİrEeΑ|>l|@sjb Y[T,MۏF=@2:k4.;)$PjT=+ _ѫD < #;yt$ )/4:B@2G$WN΂U3]d5mu(V\d O3hsխ!aĻLͻk?ֻ ߻ qC{N ~@`'U&-{5i=EM+V9^f.o3w~ojl1 ܏MM ܏1ljo~3w.of9^+VMEi={5-U&`'~@{N qC  ߻k?ֻLͻĻa!խs򦻶h3 O\d(Vu5md3]΂U$WN2GB@:4/ )t$y;#< D ѫI.*| ގ49r2$;)/4 ;A@HROӿV^If `oIsxU쐻і *Kٱ2_ʻ\|ӻ.ݻEN0v 5|hB>'$/X7?H2QpZ'cm#v΃ER JyG`ɫSSɫG`yJR E΃#vm'cpZ2QH?X7$/B>'h5|v 0NE.ݻ\|ӻ_ʻ2ٱK* і쐻UIsx `oIf^ӿVRO@HA ;4/;)2$r49ގ| *.IckȲP 5#-)."46;AfHPP,W7_ch\qzqg;SR7䟻Sצ82x:)ƻF6лڻ"c_ﻀd% < '108gBgK<UN^ iN*s?}䕃 e=[~QddQ~[= e䕃?}N*s iN^<UgKgB810' <% d_"c仝ڻF6л)ƻx:82Sצ7䟻RS;qgz\qch7_,WPPfHA6;"4.-)#5 PȲkcYށƜKbn std"E(Fo.4 ;A IGP`X*`iF!sG|:S?̛.8aûc̻Tֻ~y컏6V 2( (0':CM+kX1c3nB[y4E·5b;*@FQd ^j\wł@؀0i顯A 4KṼKṼ4A 顯i0؀@ł\wjd ^Q*@F>;0'v<' UWq ׻ͻ»oYF. OgX$a=kvR2\ey6E֤w0 ˻[׻${-zX6>##6Xz{-$[׻ ˻0wE֤6y\eR2v=k$agX. 
OF׉>6/ )"}y iet&# ݺֺXкtʺú740Siiĺʺ>HѺ/غmߺI'm>" U6 &-4fD  +寻M߹SĻ^Gлuܻ}n@M : +7DRxar7й?y0I4,Ώż!μUռ]vۼ߼··߼]vۼUռ!μΏż4,I0y?й7rxaRD7+: M n@}uܻ^GлSĻM߹寻+ Df>lv$k`VhMYD<4-&6 U >"'mIﺸmߺ/غ>HѺʺiĺSi083U#\ĺ[ʺ Ѻ^غD+{ {Wn W#*t19E.BKT^iUu\쀻ɤGߎP 劼 ȻԻo!⻋𻀀XF QI(5PBgQiaչrNU<>Vѭz żU,мtڼEp㼐Wdd0EptڼU,м żzѭ>VGQ$\Tg[[sr&2 ק˖>7+0ؾ@&˻ػ?@NsP@U$1-?N_Ero1*AXμۼg66gۼXμA*1oEr_N-?1@U$Ps@N?绋ػ@&˻0ؾ7+>˖ק2 r&[[sTg$\QGk>+f6o.&Q_6,/U on$IY,LٺUѺ}ʺúF`#JeQmw]ǐKl,🩺X&\ЮSVºXɺ,кغwTOnbσ"N*W2K:CLM ~XcNp}V ݍ^[ a9Pͻg5ܻ컦hoS+Q:nSJB \o 5xe\E/Ǽ׼[FdY  d3d3 Y Fd[׼/ǼE\e5x oB \nSJQ:+Sohg5ܻPͻ9a ^[ݍV }Npc ~XLMCK:W2N*"σbOnTwغ,кXɺSVºЮ&\X🩺,lK]ǐ/䄺k+mJ͖֛i!OS2 FǺκ}ֺߺ~>Tw+t /4=$,5> HySS_k4yD\ 0NԞbԳϻ<߻d𻎶J 7%3SD\VAk፼ ]9!μἪF-o ``-o F!μ9] ፼Ak\VSD37%J d<߻ϻԳbNԞ 0\D4ykS_yS H>5,$=4/+t Tw>~ߺ}ֺκFǺ 2SOi!֛J͖+mk/䄺qAAy鸺ɺgӺݺ`@󺁕n hW`;&0{;mLG4Tb`bqij$R&>îӗMλ5a{%&%no8NMgm`@tڼtڼ@m`MgNno8&%%{a5Mλӗ>î&Rj$iqb`b4TmLG{;0;&`hW n`@ݺgӺɺ鸺|> Cܖ-eϋs쁺Kz@rijbyZSJLwF*Z@.G',r628E@> DӋK]RGGZ?bYjƌsB|s!RCJQ|)["PʺԺRߺDu@ Rc $A6/:IhGadUdEvNSߨ54ɻݻwrr(u=VtԼԼԼԼԼԼԼԼԼԼԼԼԼԼԼԼtVu=(rrwݻ4ɻ5ߨSNEvdadUIhG:A6/ $cR@ uDRߺԺPʺ"[)|QJC!RsB|ƌsYj?bGGZ]RӋK DE@>8r62,.G' FEIEυ! ',42h94@.bGNV}_hOr:|6Gm7LrȺӺprߺ4C3Z 3 !+FQ7DXSdw^_t7t;ֻP@b) EAl]E@ ԯX9X9X9X9X9X9X9X9X9X9X9X9X9X9X9X9 ԯ@El] EAb)@P;ֻt7t_^wdXSDFQ7!+3  Z34CprߺӺȺrL7mG6:|Orh}_VN.bG4@h942, 'υ!IEEF 蹐X/4 : '0;#)0-7X> F"FO3XIakIv+e^yFRcp~?ºκ)ۺS`7ex *E#/>S N`|t慻Eǵ4ɻk⻒'bAxaqttttttttttttttttttqxabA'k4ɻǵE慻|t`S N>/E#* ex`7S)ۺκº~?pcRyF^e+IvkIa3X"FO FX>-70)#;'0:  4/XX{@QOp^ǹιչk/ݹ uYtY7ͼ [8C 6!r(4;018ͳ@IۖS^)ciru\{w4s­ ºϺb޺"{i} "0\A#TLih ?A.λ ۼ>;_A`eA`eA`eA`eA`eA`eA`eA`eA`eA`eA`eA`eA`eA`eA`eA`eA`eA`e_>;ۼ A.λ? 
hLi#T\A0"} {i"b޺Ϻ º­sw4{\ru)ci^ۖSIͳ@184;0r(!6C [8ͼ 7YtuY k/ݹչι^ǹOp@QX{ҁcՓ똹F꣹jݩw$7ƶɽB6Ź͹mչL޹E|v] ~6X'09]CN[PKhIvDK#嫺Ǻغ~ )8 <QYQkuZխ4ɻ컜 # # # # # # # # # # # # # # # # # # # # 4ɻխuZYQkQ8 <)~ 꺳غǺ嫺#KDIvPKh[N]C90X'~6] v|EL޹mչ͹B6Źɽ7ƶw$jݩ꣹F똹Փcҁ $>&+m17)>ND5LƧS[Ddmk{v$DܚH1Uȗ iNSɹӹV߹,빥*wO ((\E4At]OuB_L q4Ӎ@ND#gͺ;H?&" 9 UxM->îĻĻĻĻĻĻĻĻĻĻĻĻĻĻĻĻĻĻĻĻ>îM-x U 9&"?;HgͺD#N@Ӎ4L quB_t]OA\E4((O w*,V߹ӹSɹNi𭹘 ȗ1UHܚ$Dk{vmDd[ƧS5LND)>7m1+$>& fPV0]f*eluU}Ba(2)^ EsA)Je๸ʸӸݸ->cu b_&A(0%:FLS aap6ޛtC2ɽϹ4] s+5f@Ycy3ź-踰ݸӸʸe๸JA)Es^ )2(BaU}ulf*e0]VfPfP8V80]8f*e8l8u8U}8Ba8(8288)8^ 8Es8A)8J8e88888-8>88c9u 99b9_&9A(09%:9F9LS9 aa9p969ޛ9tC9299ɽ9994]9 ::s+:5f@:Y:cy:::3:<:o;o;o;o;o;o;o;o;o;o;o;o;o;o;o;o;o;o;o;o;<:3:::cy:Y:5f@:s+:: :4]999ɽ9929tC9ޛ969p9 aa9LS9F9%:9A(09_&9b99u 9c98>8-88888e8J8A)8Es8^ 8)8828(8Ba8U}8u8l8f*e80]8V8fP89 9$>&9+9m1979)>9ND95L9ƧS9[9Dd9m9k{v9$D9ܚ9H91U9ȗ99 99i9N9S99V9,9*9w:O ::(:(:\E4:A:t]O:uB_:L q:4:Ӎ:@:N:D#:g:;H::?;&"; 9; U;x;M-;>î;;;;;;;;;;;;;;;;;;;;;>î;M-;x; U; 9;&";?;:;H:g:D#:N:@:Ӎ:4:L q:uB_:t]O:A:\E4:(:(::O :w:*9,9V99S9N9i99 99ȗ91U9H9ܚ9$D9k{v9m9Dd9[9ƧS95L9ND9)>979m19+9$>&9 99ҁ99c99Փ99F99jݩ9w$97ƶ9ɽ9B699m9L99E9|9v:] :::~6:X':0:9:]C:N:[:PKh:Iv:D::K:#:::::::~ ;;);8 <;Q;YQk;;uZ;խ;4;; < #< #< #< #< #< #< #< #< #< #< #< #< #< #< #< #< #< #< #< #< <;4;խ;uZ;;YQk;Q;8 <;);;~ ;::::::#:K::D:Iv:PKh:[:N:]C:9:0:X':~6:::] :v:|9E99L9m99B69ɽ97ƶ9w$9jݩ99F99Փ99c99ҁ9X{9@Q9Op9^999k/9 9uY9t9Y97:ͼ :[8:C :6:!:r(:4;0:18:ͳ@:I:ۖS:^:)ci:ru:\:{:w4:s:::­: ::b:":{i;} ;;";0;\A;#T;Li;h; ;?;;A.;; <ۼ<>;<_;<ۼ< <;A.;;?; ;h;Li;#T;\A;0;";;} ;{i;":b:: :­:::s:w4:{:\:ru:)ci:^:ۖS:I:ͳ@:18:4;0:r(:!:6:C :[8:ͼ :7:Y9t9uY9 9k/999^9Op9@Q9X{99X9/94:: :: ::'0:;:#:):0:-7:X>: F:"FO:3X:Ia:k:Iv:+:e:^:yF:R:c:p:~?:::):S:`7:ex; ;*;E#;/;>;S N;`;|t;;E;ǵ;;4;k;;<';/;E#;*; ;ex;`7:S:):::~?:p:c:R:yF:^:e:+:Iv:k:Ia:3X:"FO: F:X>:-7:0:):#:;:'0::: : ::4:/9X99 :F:E:IE:υ!: ':,:42:h9:4@:.bG:N:V:}_:h:Or::|:6:G:m::::7:L:r:::pr:4C:3:Z; ;;3 ;!+;FQ7;D;XS;d;w;^;_;t;7;t;;;P;@<: D:ӋK:]R:GGZ:?b:Yj:ƌs:B|:s::!R:C:J:Q:|:):[:":P::R:D:u:;@ ;R;c; 
$;A6/;:;IhG;adU;d;Ev;;N;S;ߨ;5;4;;w;r:8:r62:,:.G':*Z@:wF:JL:S:yZ:jb:i:@r:Kz::s:eϋ:-:ܖ::C: :|>::::g:::`@::n; ;hW;`;;&;0;{;;mLG;4T;b`b;q;i;j$;R;&;>î;ӗ;M;5;a;{<%<&%î;&;R;j$;i;q;b`b;4T;mLG;{;;0;;&;`;hW; ;n;:`@:::g::::|>: :C::ܖ:-:eϋ:s::Kz:@r:i:jb:yZ:S:JL:wF:*Z@:=&Y:`:;g:n:Iv:::.:\:&y:-ݗ::Ñ:i::Ϸ:c:2p::x:U:<:V:;};W;[;;8';4~0;R:;7sE;9RQ;ID^;jl;F{;0y;dُ;3;Ϊ;f;;p;37;l6;ld<,<)!<B2<0E:Tw:;+t ;/;4;=;$;,;5;>; H;yS;S_;k;4y;D;\; 0;NԞ;b;;Գ;;<;d;;5;,;$;=;4;/;+t ;;Tw:>:~::}::F: :2::S:O:i!:֛:J͖::+m:k:/:]ǐ:K:l::,::X:&\:Ю:SV:X:,:::w:T:On:b;;;;σ;";N*;W2;K:;C;LM; ~X;c;Np;};V ;ݍ;^[; ; ;a;9;P;g5;;;h;G;Q;$\;Tg;[[s;r&;2 ;ק;˖;>;7+;;0ؾ;@&;;?;@N;s;˖;ק;2 ;r&;[[s;Tg;$\;Q;G;k>;+f6;o.;&;;Q_;6,;/U ;o;;n:$:IY:,:L:U:}::F:`#:J::e:Q:mw::::83:U#:\:[: :^:D:+{: :{:W;n; ; ;W;;#;*;t1;9;E.B;K;T;^;i;Uu;\;ɤ;Gߎ;P; ;;;劼; ;;o!;;V<ѭV<<H:/:m::I:'m:>";; ;;U;;6 ;&;-;4;<;YD;hM;V;`;$k;lv;>;f;D; ; ;+;;M߹;S;^G;u;};n@;;lv;$k;`;V;hM;YD;<;4;-;&;6 ;;U;; ;;>";'m:I::m:/:>H::i:Si::0:74::t:X:: :#:&:t::e;i;y ;;};;"; );/;6;׉>;F;. O;gX;$a;=k;v;R2;\e;y;;6;E֤;w;0;; ;[;$;;{-#<#<6;6;/; );";;};;y ;i;e;:t:&:#: ::X:t::74:O::\:!::&::n:;ve; ;;; ?; ;$;+;1;٥8;@;$G;P;|X;)b;r l;ɇv;؀;Ɇ;;H;D;ʢ;;mٳ;[K;k;F;;p;`;&< <8<&<%%;<*@F;<0<'<'<$/'<l;|;Α;e;E;r;;İ;m;~;n;&;;޼;f; ;<<4p <{<l;;|;@s;j;b; Y[;T;,M;ۏF;=@;2:;k4;.;;);$;P;j;T;=;;+ ;_; ;Y;;;c;C;g$;Y);b.;3;|9;X?;vE;K;IR;JY;`;]}h;p;x;Jր;p;D;S;;3; ;+;*;Y;0n;;4;f; a;X;I;ҋ;2f<7<;Q <T<2[T;;D;iJ;P;W; ^;e;^m;u;)~;iu;;;;3;;;;;o;;;};,y;w;*;;p;<֣;J8;03;|-;E(;I$$; ;-;w;ϴ;4;@;<;B;E=;S#;m(;XM-;`2; 7;](=;B;H;O;yU;[\;mc;j;r;z;{;5;-w;KC;G;R;;ͻ;;+;;9`;e;5;%;k;W;;&;@N;.h<V<p <<'#<<>o <D8<72<4,o <<'#<<p <V<.h<@N;;&;W;k;%;5;e;9`;;+;;ͻ;;R;G;KC;-w;5;{;z;r;j;mc;[\;yU;O;H;B;](=; 7;`2;XM-;m(;S#;E=;B;<;@;;;;W:#;k';k,;s1;6;;;sA;9G;;^;;);\;`;;l;)); ; l;`;<0< <;41;a;͢;s;P;\c;;;YŅ;;+{;7zs;Tl;d;`0^;W;\Q;RK;E;?;U:;Mg5;$o0;+; ';";V;7;;!;7&;j*;TV/;>(4;5)9;\[>;C;F\I;/O;$>U;[;b;h;o;`Xw;&;~;;;7w;%;;m;\;y٩;^;2};;; ;;;';o;;;L;y<U;/O;F\I;C;\[>;5)9;>(4;TV/;j*;7&;!;;!;GP%;۩);.,.; 2;P7;<;A;\G;L;7R; X;-_;9e;܃l;s; z;KE;;;Z;;;ڶ;;;%;;;d;;;A>;L;D;T;Jl;} 
;;N;ٙ<]<% <9< <<;;;d;;;%;;;ڶ;;;Z;;;KE; z;s;܃l;9e;-_; X;7R;L;\G;A;<;P7; 2;.,.;۩);GP%;!;nX$;(;g,;j|1;]/6; ;;M@;VE;iJ;^dP;&;V;J\;Eb;^i;o;"v;2~;;!ц;;";;;֜;;Eڦ;#;;L;-;B;; ;;ǭ;;!;2;^;< ;<><A<>;<7<3<]/<+<~'HA @ A @ A HI\t>L:~AFMf͢KҎ;(::f}2 Id2ۿ?X3Ӆ xM2Lj_\\!DD4u?P(wZ퀿i4 ߋlv*--<;;[i^0"5BA^~M('JEBUUU۷oSrr2i4R*V)""vE#yJ:;;ÇTRRBGu… iDJr9jh4xbڵkSGGWx) H03[Ve___ᱛ7ofG---+#"޴iz[uWHʓIIIaiiiz m3??_XGzzSO.ٳgmɕDZ,NLL;K,auuubwlll,y ޖ'{Kpp0Ka4yJ%::{{{h4Wǎ;˗/O}9L~=O0؜H 䦦&vZZZ8((H2~ܹz̙#Vݻwꉊr \ ,p*ߟu: myr ۺy0~Ϟ=bv!yL&[ns@] ³q8_LUWy8q"gdd˗`0baN/^^hG ;t0W =vؠm$%%ׯ_%PQQ1qƹWLEEE,:3_3g۷ow*iٲe\VVVծX,޺ud{'Oƛf^tdqӧO;lʕlX$A@@:tkjjXqOO777s^^3fЉ ԩSfffJ̛7ϩa ͒ی#?^XGLL'Oϟ?]\Pg'Ȃ ?I]>}*7a7lxbjJjW3كl;e_V%W,„iii###%t:S}w7'HTUUE_oߒV%HFwrss)''ǡ>}/_.<oRWWuuuj@6mEEEQBB%%%JrBd0vJT*y欬,I{'Ay͚5v׽qF.\D DvE&婵UlqNh46 ݸ4=z.HxV[!S^w˓d6iÆ ͛aBVy\LL _Gݹ2=|xbyJŋ1| L!_Eiiwk׮ycE ߾}^pa ?8=$ &!U E(Ւq&#""F SL>z?=>!{? J=9b>|?Y=R>q|? M=|?'< >O|?|?Np <`>|?Մ; >|?x`q">|?$>|?5&>|?-̼=*>W|?E d0>{?_ L;>J{?TDK>z?[cb>;w?#܁>7t?d=>Mm?Ю̴>(b?[Ծ>N? >}6.?]|,H?>bF*Z?ӳ]>8^P?HȽwDa>̾X#*>7>"] <>°K՟Ӿ9>qdI==$rlg E#=*y(%ԁ|I4A~E'U~$ҋ ~9(Qئ~S*>ܽY~Ѯ}?ZH F}G:` @}T; W}xv:n}zj j}<Io}VA}CB}w-]t}E }Gt}`H ɍ}nM"b};Z}B 7tzu>S=]{?/==Y0}?P=M=}?[=>=~?==/~?K;<==~?m<n=+A~? R}? > }?. _y>|?顽#>{?7o;>z? "}X]>v?db>p? >we?ZJҾY>7S?- HL>V4?lh*E ?6R?EN?U>Oa?e3E"?@l+>.Y >ZFO׾^e>a `+>4pl=y2x,S]$=Q'|CI:0+Խ*{>Z=|?[==}? i==s~?Ź]==H~?j= V=~?3<=~?͆<ٲ=~?<=1~?G ;B{=C~?_Q̺=|~?˩%վ=*~? Њ=M~?Rɼ=~?~}=Ih~?[=}?5>y|?ɇ㽚>L{? @>+8x?j^9o>=r?N{>th?>X;>}LW?_->C:?&? ?By~?u>xNT? F ?옡3Y.,>kf{۶>W@9ݾ4^>3]ƥ7>7mIcs=>v$s1=kk{ %1=X}½`ѯ<~`Ţ ;Up\RZ#_*(Nbr9\8X|>e=|?ދ=5k=V*~?&= )u=~?_=2?=?0=F=F?=zc?<9=e?|.;硍=b?GBM=[?I=P?r}=@?³ļ!=j$?.=~?V=~~?;!=Ů}?ݽ- >v,|?4w5+>by?KWuY>pFt? ϓ*>k?ƾt\>[?>)@?"7?!l?d>,?>˰LP?7=nG?o10>AA>8e>zXԮW>k }3k>1u8j=vz9Fc=}*ʽ">=y~o%9<;1Ef:@;1*#;v.ż/!z2*X?!JlC;9sԹX1Tx+$92]eISŢgk3UhJh>D);Rb>٢4̻ԇpX/ڼk@Gf<\K;;0]x 0<^ƛI<ۈj9<y`N|V>*=G}?˫=ע0=JV~?x=9=y~?a=0C=Q?p=K=|?[Iz?ϧO9G>u?\󍾱>tEm?9>^?W>E?_\>}9?/9? 
>d9J?=H?Vn4@)?py>06>/3Syt>gvt0>+MsA=Vy:=p|J2Խz= ~@7.8= ~[Ors<{6 ,z; Ҽ*3 ?~vW"{ ;WIQr L;lbt*пL*'d<\`odcL<[ *Wtz!&#|>Uc=]'}?Ʀ=c"=l~?j=p=?lc=a=wm?j!=\ =К?5<&=9?_Q<*=?I<-=??K;s1=?\s6=?&==?mkqD=?ܷwQ=?Fh=iq?HZ=_?Tj=A|~?Rνľ=H}?/i >u{?FZ9>`v?r~s>c>o?J>a?ˈ>XK?! >&?3{3? >Fʛ?&>H?DldU87 ?YݾT>:>'Rz>4"M@}> &d:xI>"qmLRZ>i xJW={Pཤ4=}o^rw=~0=Qn6[xE?=r}=süoi=.Լ8 3==N~Ҁ>~d|G>|<1}?z{=Xg{?X<,>w?e> q?JDY> e?pj>P?:_|>$/?- ?:>BB?e>hH{j?G;?!>^J%q>*FL7ƾP>X_(6`>nXXb@)>v>z;|r=&}$t=]M~)H=*KBC<_];+T`yɻdOU!s9ҳ ]T37Ճ=E삼ݧMSףځ)%2w?,$r }H<7*uJ8*8< L<Fc<fV<ż%A<)os<Õra<~Һ=%jB>=Ԣjc?|=q{Ҩ=T.Z=\~J>Eo}>Q<4}?K=Ǡ<}~?=T<*?Wf=h<?`#=`y<?ax?/ s-Y>r?13>g?~վU>6U?%]>p7?& ? ?$>? ]>G?D(O>y?&?j>0>HѾ>}>H[v>kf)I<>t_*>Fyj=e|_=#}Sq=u~Oϩ ="`<|z豒:N8..6!b$JSҺ!޼bB2J-<#,A/X?/֋-/PE˲S$w"Nf7kS-:-Zc"&Ԃh}},>Gei<4}?>=1}<|~?=<-?h=3<6?Ɋ%=<?A3Šy?WdJM>/t?z>5j?ɾ2>SZ?з>T??" ?4?H8,?<dz>n+FW!?#dt=AO?{;,}?^ pr>5|Zݾ<>wU狦D>Fhhv*O>osK6W!>xs_=T{ϽR=}О=~^m+=ze0S Y<n닼DM;;i;;<;fr;)fPǕ;(oKjE]G\3aɻϣM ̷:6;h9\:'/I= #=C*k~ٙ>C}E>6< 2}?=Y=^nJ<~~?=&j<4.?)j=p<?x}'=Bג<?%<]2<?h<?G<?(=?XrrR={6?=#=w~?=A齀= }?K>>az?5VA>u?aw>|m?d3>^?:>F?޳?!? L2-?T>,C!?>OC/?151K#?7ųL>7*@M꾈>8Om(>GdM׃b>p}C1>\wku >zX޽= } =f~ooG=A'k<GF}y[> <-}?=H!<|~?UY=ɹ@<7-?l=a{? I6>v?3zt>^o?Na>b?侧>M? >+? t+|?T>?E ?@T>D"?GȽ6 ?7Ǿo1>8>pG -O >I``;u>uGnR A>zuV> z[-=܂|óV=D~a=&= I8~< Py :@< ~%LI<TdGW<H3m:r`m˻79F$x34c;bGJ<B@m:ỵ:qU=rݻa }*>;'}?<=z>;w~?=w<*?o=?<"w?1zf>Xnq?ZŤy,>^S?5 >5?~$ ??;?_>D#?oF;2?P{"17???+> N?ɾ9h>QO[lw'>MkbQ><tr*-J&>y2O>_{½&޼=}JO{=.~\4= \n<(*<ɡ/; K9wFq7Q!;&Hp<<]hp<CQa1;:W8B<{?bwʹ6$}Q9>Y;}?^\=+;p~?i=;O&?cs=<?[0=2<2?x?gh Y>-Is?W͎>Oi?^ǾĶ>]Y? 5>;>?ta?N?.6?~?>~C$?Gy{=?)?|+H) ? >Y5[׾1>mU3>gbs8c>CrA7'4>vwR >.{ѽv=`Z}hΖ?=U~EDQ)=|Qa׼+<ֲ:rd''} W>nW8;}?Z=꫃;yi~?BM=D;-!?x=g=<?4=y?2 XL>Jt? qz>~l?Zz>mm^?@>2:G?eQ?Su"?L 09:?>A,#?_>B^C!?On,W/v?羰vVu>*,徲H>`N>cSu>(o G0gB>^vy>[zi㽭S=|4t=az~U)>=_Do=<4n*<jq<8CŨ<7fˀ<+~;?cZкt3g;-)F<sҥD<K;5:q:[H<_ ;5 ;L-= z~x> ')}3>Rմ: }?HY=.*;c~?K=s;?4s{=e; ?{7=;?=y<8?p<<*?ic<?g{m6=B?p=Q~?=-}?C>z?@/Hc?>^nv?dm|>%o?>b?޾>N?U >3.?lP)?x>>' ?)d>vD#?. 
6?zľw?H$|>FJo>^%K>l XP>tZ$`!>NkyUj8=_|c = 5~hLS=>q =pS:<*;<N)e<ҭ~&<޿;quKYٹ*n֌b,M?sи;=ES<<N<wO$!;::I*}>A7}?Q="<:3`~?=5R;?j+}=0;ׁ?U9=6;?P=:/;@?k&=>]?3✽O=M~?нü=n}?0I k>Zp{?9>3>aw?}tb8l>zq? Uʗ>f?оH>zsU?!TUl>S8?'"v ?I ?:K?>'EP{#?e4; ?1ɞ"7)?e(>=˾>fxYJ>ijW_> rh2o,>Xxh={8M/=},N}i=V=@饼֣<i=<_t  ڵ<4W *W)}>d(}?Q=#)^~?oB=*;U?}=A;́?{9= ;?)=$$;?%<;%?7d\=r?‘n=9~?pA]=}?|=|?*'>x?.ao\>s?>j?>c[?k>UbB??b?55q?Q>sDY"?y={@l?mU* ? >qc3ھ>&S4>eO{m>poBH7>?w e>2{vrϽ=}N=~"%e2=z%[=s=˖<2_Ǽu<|]ռ><638k'}>/\}?!=0K_~?F=Jw:?8 }=?;?8=T2;?&=L;?.y?PzM>Iu?@>m?%Ŧ>{`?V>J?&.֧>t'?J0?H>CZ?7'>osD|?Hj32N?!K>'뾖>L|>am|>4nRB>/u_>n\zI 9=3(}ҕEˋ=$~T7!F=qɎ߼hG=⧼= =<3ؼí9Q#}>9B}?S=ܺja~?'p=`9?{={; ?M*7=ޫw;?{=ό;?01<܉;?Wvz?@*?>v?<-u>r3p?(">e?2=۾Մ>"R? >y2?.+* ?kV?:AV?y>[^G?㍽l9?%w?t`&O>C sľ>M\]k>ikd%N>*%tT*>}y&X=5||=z~LRY=R.=.HļG=Ws˼\=YCp<I,<9Lʼ9;!;;@:}>_uGY }?=e'f~?O=w98 ?rx=%;?4=ιy;ۿ?$<V;?ak<č;`? O<[;A?`;rt;)?:Pl;}?0g;[?IK=Uz}?Tͽ>bw{?2`2>x?/md>nr?LŜ>{h?t;/>OqX?iG>}v IW?Y;P*Ծn>cW}>EhuxM`Y>A\r8C">rxP =*|6경b=3~c tl=U-]*e'==+$=swJ =<g?<WҼƠv;|9C;[<#$X<3v><^x;r:I;c3F<^;6U:}>2}?~=m~?=1:%?t=5;?w0= B;?X0y?_\kS>LSt?t>`l?Ws>|]?hr>xE? >?a9?U>BI?/|=|Ex?5"c/?]>#1>)YQm^>!d3$~d>Zp'G+>bw={ýU=}}t~=V'ҖJ=xVZ-=Ӥ }=7:r=MP;q0=I;Kp=K;΍,=ʿ;)=K:Zm~>9}|>%}?=0yw~?B==鞙:,?ro=yxX;?x+=g;?]";?Uܻ;?50 ;?|g;!?6<?}x=!z?9NC>.u? w>n?4{|^>8lb?"eY>LM?;!>^*?4? ?r>'%I ?50>tPJ~h?+j7] ?ݾ!.s>V&?%>Jax>v`co>nXۖ4>^!vE$>z*ֽ=}}.]=~Vl=VW=)_H6=!o'=a !<z;f<+d9;{ 廼%;C:-$;zG<0%ˆ9 }8'>T1}?W=P~?W=M:3?7^j=g#x;)?&=";?S<;Q?oߒ<;0?Z)<<;C?;;?ס":4;?Aq:;? Իa;?F';?imR#;?T<?7ܼO;u<?Qp&<?!%846=;?Yq5Rf=B&?_$=2e~?dؽu=}?h4>z?A_5>Hw?r5e>^q?Ԫ>BVf?\Aϳ>gS?<>4?/˒??GE?}>M?勽? 
?'j#>8>=EC$;i>i\˔1z>_ki2=>2t/({ >z$=}ƛ竐=~Svnc=C">=T;A=^!;n=i;d蜥=/y;(+=7:g~>0]97\}/>:;}?*=~?=~;48?uf=ޱ;J?b#=k;?~{?W5{P(>Cjx?qyT>s?Cv>ʫi?x}Ծ>pY?0 >=?*̎>;v?@E ?2>qP'?j;IlF1 ?],\b>ft ?>2E;NܾM>:W)->hE|yE>%s 6>Cy=Ȯ|~-[#=d~jCn=$F2 G=s; |$=|hy<߳?<G;ɼY;ߴ];-=<)<i-kxp;19y;/J;<;Rc<%;ǚ< D; <D:< ;Xwƙ}><ɋ>}?r=W~9"~?=#;9?e=潔;*?"=Z;)?<;?Zy?cZE>t?Ow>l?]ɾԚ>F^?Y`>UE?t$s>ޘ?D:BW?b>Q?¡=xL ?cVs4@>t9>n2F>ERkC$>e[M>eqzE>Rxso uk=3|-='~π|x=C6O=a[-*=:(,<5P3'}a> <}?]=fM:~?G==;F8?g=m;ǖ?8$=;?p!z?[W$7>2v?ve>n?Pw>ib?86Ʋ>ݭL?>QT(?i>?1>sR ?B>辱>)z>LL¾ȋ>,AbxYT>pwoMUYI>SIw%={˽L=<}=~TV=3M:0=~2<W!9Q;R<1v;䉁c}>!87}? = :F~?=mb;4? j=z;?z'=i;?B<}U;?՚@z?"!L=+>.w?-YU>>p?f>pe?l->dR?bJt>t1?m:?\>>[@R?YY>U ?<۽RD;?Ll;eN&m> ->wFϾ>^h/e[>bmVfj3!>)v*R"={{Dݽe=}嘽=i~6fTf]=8H5=pv=-<)9<;f)}<;Ei<5<(=0;Xk=`;.J='YD;,^=+9j~>Z};> :V1}? =q:y~?y=ʀ;0?Jm=s;ː?G+=;(?~<;?YL<;D?ٽT*x?Ђ!F>ʫr?xv>"h?徔ԙ>ЧW?˞>9?_6y>I ?9Q?gR>N Y#?6K>~.0>( e>F @oݾt>=ZJ +a>,kw %>t.=}zWd;=O}#>=~yb=?!V8=bDH=6ew0F< ]; ڼ\;;h%I;ɼZZG;7<޷;ʺ }>_O:,}?9z=;|~?=-;).?Xo=7;?-=`;g?&<^;?z{?jg9>y?Nw7>Q"t?yz@d>qk?۾q>[?m>@?j27>l{?͒O3>h>G[0?<.QJ>zY6L> YW>^9꾤:>Ve>h$ヾ_)>sLF;(=y=a|ѳ7=h~Af=).f@:=TR`<869$<~i;CG޼FR;Y;U1ʼVH< ,ͼ 1</@;k8f;7<5*!a}>9j+}?=b:z~?z=Aq;>-?p=?8;ݍ?ɛ.=J;?cӿy?mU*>`u?ޞLS>ˏm?Ӿq1>_?)o h>F?8.>Q?qM>l>]d>|=V`>Nw}=.|>Ť>|2bC>yRUi>}f~(W,>QirG(=$ym =+|L½C=7~X:i=~zum9=)4~}[>T,}?=f:{~?£=:;-?*p=\;?6.=t;?<;?<&;?e/C?abԏ?B :a?yaѼ]N;?+XH܅<?W*<?_4=B^?Az=M~?(ýT=)~?0g=֧|?*=bz?)d>,ov?5C>JXo?;m̾Bu>b?9*>K?;$*H>%?/Jf>>] >>w[X>BDQ>@]$>Gy+>;N[Ⱦul>( dS>/>qT(=lx*ڻ=d9|ѽ=~hk=~ソ$8=7fzp<{3;o e;7żhj;tμF}Ɛ>c.}?>=K|~?cU=:&/??s=JIK?%`0Һ?u :?6ʼv;p?2Lx<? 
#MUw?e@5>6p?žc>"e?Yя>CP?Q6&@>A,?mIH/>G=>Z(^>D3>^L>hK >gھc+++>d$q- 퓚>sIҾ#n>@aYlNi1>o4c`h3>Qw^={߽=} l=@~<6=p(.o(;;}~>(.}?=}~?y#=G:/?n=.;U?Q,=K&;C?p6;?R3&fx?9{F(>'r?῾S>[g?w>E>$T?.p"Wr> 3?3{EH>>x] >/a>ka>P>ƾ2Wھ>M_%>cE)ݾxAp>X^I3>Anl>mvѯ&dh=i{Tf=ɑ}+i n=ؔ~Qp4=lMxn20}}>!?,}?y=o|~?8=E,9.?qo=#H;?`-=X;?;}<{4n;#?R<)N;?R`< ;?jGs?2[E>Oi?`z>^gW?;>,8?[B>8?2A]!>I0>ZdDj>շMU>̤]18r>>KX0r>@EWZ~q>@\͏d4>l x>v/V=}{͗=Z}}xco=ov~O4=c6MI }5>A'}?=[x~?=) 8+?q=Gg:?/=L;S?C|<];?<68;?_k<:+?<_9?;Î?%;#?sOw˄q?ҩ? R?gRp%??ZVS?A};<?8<?'F<>?L0Ck =? B=x3?Hi}=~?mBD=h}?;=|?G(=Ry? }>Et?88> k?Vtj>eZ?FK>z=??d>d?JT\>g>ryfq>iZZ8(>柾A=S>C){>B<+U"Qr>Y 85>9ku>sOuī8ko=z9.&ј=x#}Dƽq=X~5=7~〆 aD}># }?=s~?9=9'?/u=`e;Q?FV3=Z;?/ =c;?<74;?{?E馽yq=B~?|۽–= }?H =_|?A=y?]w/| >&u?yyP,>l?龥A[>o]?o>A?W?,*[$Z> >g>BٍRC s>b! ;>y7lr>Vsx5>i|G>|tGA=P-zqN =|Ͻ'9s=l9~A7=~ f$y}->_躑}?z= ,m~?&٧=:}#?x=7;?7=aw;?=$w;?l]<==;?t׆<:?><?ru?wO!>m?㾀OM>|_?>E?9i>M?iY e>2>i>g=au>xGPH>%@>o3/5r>sSľ\5>hI&>sI_=TySy=4|ؽu=~e$9=~1}[(>3}?=8j~?=: ?z=W3`;?Ž9=;?=0;o?f<L;?Y'7o?ݾV@>Sa?9nLfv>yI?7>f>l!? -X=շ>>9i> r=Qe>d5WLY>6*>.vr>Qx^˾b$6>{f蓾}>fr4 RI=Py~lk=y|xw= }><=~I+< 2loQ<k-/u;7H;"ռl<޼j|M<PLۼ0<ľ(;u*<N<%@<;<_? <Z<2)<]<d;= ;U,=LO;Xel=`Ʉ:*=0.Y=[q~(>s}>xs}?=eiJ:Dj~?=k!; ?; {=;E?9=(;»??=憖; ?$<:c;}?A[<[:?7;Q<ц?.t<5:?ӿ;G C?;rnԻt?hԺy?0Ż [?7K?׈$?]?xDIp?ؾDk5>c?uYurh>.L?A%4i>&?\Vį>>#jD>i >0g>6`/>*N Fr>\N3RҾs6>dK^#>qbZ=x|==:|y=~}b==~s$4t<*%iv&U;Oܼ;3l@G<U޼*<3(E;V@(<`B;׃<k2<:mxX}|>Ɍ9}?,@=&:n~?=AG;#?ڌx=;?{7=;E? =;?D:<V;?a<;9?3L<9?<~=?;Uxp?;3»??!+?uj0`m? &?j-<?=ݼ;!?>l s<?X7O<?lo-=5`?N=~?uȽ=D~?̙=*}?- =>{?Gi=w?t>8q?"hӾ8/e?\ \>ʔO?.1'>L-+?\T:>*3>`*j>)>7jb>0UAN>WܾY3Eؗ>5%q>Kؾ37>kOc> q[bH=e\x#2=4{%z=;}OýN>=~ܠ\<@}\<{Z;3;ۺ :;LVf;M=< #< ;J$&N}\>zJ:z&}?==V:v~?~L=Nj;)?s=Tw;Ȋ? 
3=r;2?y=v;?Ҡ r?Uξ ">if?u&P>K6R?r7.>(z/?W6R9>V>iBk>ViH>/CX?>ϾW7Ȗ>k!Ϩq>H:Z߾8>a%ߣRQ> pkZ=vw))={|=}ɽ?=y~j?k<_ u%eњ}>Rǝ:q3}?K3=F";p~?=1;0?in=8;Տ?-=;?L<.;0?H<2;?ױz h?̦F>AT?H+ }>n3?O>Z>Zi#>oe>m> \;>%þ;yƕ>I uOq>;Fyq8>`|⨾C >6o s=VwT/}=&f{\}=)`}н۝@=b~A;yBq;d(<~㼗<;t:?#<|6;~<D4=<*<;<-F<+x }>':z?}?=é+;S~?^=x%;W7?3i=;?}(=wL;?.Ni?[=>fV?m(q>7?MД>&?h>o>o=o> y*_>¥n?Δ>>Wq>C9>^Ʋ >xMnz==v95ǂ={{4=5}·׽,B=I~_VC }>2 :_H}?ɯ=*;~?=&;=<??e=;?$=e;[?<;?l4vj?xl4>X?@%ؿg>X]:?MKf>  ?͗gq$>fk>cGpͯ>{ fa>Cӓ>{O p>@ X\:>W\?P >gmQ =Qv=;=z> Z=<} ޽sD=/~Y;kXr;a7<2FU<玼|;,E"_1}<>G|:M}? = ;~?=ހ;Y??Bb=ݤ;z?s "=;4?{?ƻ;d:?IQ;LZ8?n&:к? FG?Gn?#=K?T]9ۺe?"vv:?}>a;? |_<?tR2<?J8=?䢅e'=v=?^K=Ͼ~?28 r=}?,X=Kw|?uO= y?Dc =t?g>6k?[T,>Z? #e^>`=?HJy>?of͗>e> qV>tфdB>k?nFΒ>!گ fp>z>:>QQ[/ʶ& >l$=-uA\§=z=|ÇE=~z<~:4i~0.|>l8O}?=Qf:~?F#=F;@?ia=a;?ʝ =3;? !a?$2?fV=?R?ց:?ϩ;s??ڼ.4eA\?* U>#@?F>?#e_>e>טq>9IƑ>d "#%o>G:;V;>Y70&>Lk 0=JuUGon=2z&ą=|WBG=Q}Ѿ,<~b;O"=ܓ;q<h=2;=A:+y<=s$d~>S:|>ƀO}?=gr8~?=t:JA?a=g;? =]/;J?V<` ;5?b<?>x>>=cW?99=l~? hٽ%Y=1~?_}=|?9FF=Qsz?V=0u?hWn=Um?`]`y>]?MM>#B?ZGDr>?ci>p>q>.=0h>`L|ϐ>hi4'_o>g8!G;> X:a>jZ=tLnά=y2r=w||^`I=}ý=ײ~aZ<FS^;˩'U; ? $;K*<>!3;Y5J;R)<,M;<< /<<S<I<~@v<%x,|iL>TL}?ڎ=_~?=-!:=@?a=;?F =([O;H? <jJ;o?Εh?ػꈻ?Q#`Lh?\?mb:?9y ;~?oμ,)a<?e{ -_?G3F> E?Av{>?Hb>>5r>4l=0jҠ>szO>% _*n>(5Tk;>WVj$ľ>Ϻit=%= D;j=m;f=j ;m)7=Co3:a~WI>L|>KdRG}?=ɼ~?+=)=?c=Gt:c?r"=y:I?Oۃ`?d>>qSG??s> O?`>W>rR>8=l;3>cRh>>I@v~-D"n> 2~w;> T:Ⱦ>7hk=PsW=QGy2$n=|g O=}xzν=~,<"4tjg <.,;8 <>YP)<Y2;f;5IdMd2<;D<(@:|$>Z?}?`=ct~?OQ=:EC9?0g=%?X%=(9j? T=}b}?w9z=kE{?{}X=_@w?ruw{=po?(G} >a?$7>nuI?#=4l>!?-_>V>q>=n>LM]U%>a0m>/[p x;>R̾U>^gC=Z0s\=x(ԍ=W{* 2S=}}Eӽ#=}~1Ì<**'(l4N:|*>@һ6}?$=T~?< =x4?4j=@K?(=>:?r<Ą/?h<)$X?pY<ڛw?c ϒ=w?n8;=:gp?B^㾏>c?d0>TK?B;d>$?щ]>˗>mq}>K>b|o>>7Wԍ>3UNm>,| y;> QnAѾ>f] =rDan=xu+ა=ڸ{TX=^}VؽS =i~8Z<hq!<30i4+<˽A<A0R<ѡ@<Î 4;iMZ3;h|L>1 ^.}?ʍ=G}~?k=wq0?q%m=a?B+=f%?< "?f<9.?;-a?1;$i? 
vT?<]?:?м<?6<?{,jM<)?_=<?8a=11?^R$=~?ߦ;=}?T1![={?+sAv=%x?(D = q?|k߾e+=9d?çp)>mM?9\>-(?[m>>/;q >5>pB>G!kZ>&⾙Y6m> )k;>BOuվ=_>eu`y=!r%@f=axi/=I{ h_=@}ݽ%=V~ҨB`<U:vd8a<<%,**;I}>/(}?͏= x~?d= ,?9n=껩?-=)޻?ۀO?6T>5+?Y$Zא>n>2pz>3>qq> \n>Hپ9Jm>&6F*<>sMtپ:>>dK+=lqj=5xf2{|=\{yz g=R!}Ƚ὜O=A~8Ȱ<I{GY<~Gp;K-b;}m/>vTa!}?NQ=Ss~? =<(?fp=+s?^.=n&4?<2?e<Ia?Ol?_|ּm;?ʧB<?&+3z<:?YZD\/oQ?$y4@\L>@:.?fX^>k>79pϑ>H>\r>'^`ҋ>8JѾz;^tm>#̸K=>Kݾ>)c*&=q~n8=w5蛜=,{ s=_}wf&=,~/#<~yUA6{ <}0p>nבD}?H=;Km~?=- #?sdq=r?R/=?2o?s<;y|?5;B?c;.?';Lᶼu?.`V۾1?jW¼?ֳ ?FS?B₼R?+q?쯵s x?ټ94?L ;?+w<?Y\<?6M< T?e<~? ! =2 ~?~%r&=k|?c_T=>y?={"s?Ӿ;=6g?O u>OS?b82+jC>)1?Vv>%U>XoЊ>s\>Os6>ĽK`}Y>ԲȾ#6>]n>C tIK>bg=p]r/@=ww8 =|zS=*|q轮7=8~m~KE<}o>G}?=e~?=Y$?q=?/=ѵ?mO}Loo? Rܼ ػ?t:?a+<?W<?z5K<%a?%/<~?;콩<,~?%! =ՠ|?W^7=y?cwg=)s?Ͼ_=h? >x!U?/h9>T4?lTm><?n,>q>tt_>ɛbt>6y@p>=7pB>#Go>aAd=ov=0w<;=zۋ=|'M=}# =~|]ӌ<}y>6}?|=/tX~?=K?q=s?/=&?s6?S'Od>`8?!n>U>yu@9>Cghd4>ηB6s>HF>2#F2辻>E`>FoAzU=v=7e={~z=!|p콕j=} 鳽L+=~Ȃr =TܚAk=!=-b=h=oC =<=nL;98=?J<\6=^˕<$=kX<}M>ʪ(|?=&:A~?=)"~?Bq=, [?'/=a 0? @<%?<<+?r<3?~#{]?%Rb:Xw?ɪ;?Ў <)g~?M׹t<}?UFv9?7QӖY>%! ?Am$ԃ>> u>"f>pVDmZw>%yM>@D;2뾙&>m_UL#' > nvI}$=\FvS?N=+z1д=yL|=} U=j~`6=9"@605=wU4^D=5K=]>=m17=S}M=^; g=ݕI<}{d=T<Q=w"<"G=]E<F=f<[A=<+:=;h<3=]C =`-=+ua=y9%=f =*=9==CM~>$=|>\ec{|?Q= a~?b=g^~?8q=F\B/?:X/=l1]6d?e<`a}??P;:{|?Jʃ+x?k愽qu?[փJv?0>!,{?t?c?J?ۼe8)}?p99?'"ͼ?OO? ƒ^u? ?:߽x~?S\4&}?Q@.s8uOL> ?elg~><>*uЍ>jg>xFǫ}>sV>9PB1>Q^ a>|mn;>uC@$=(yK={ l=5j}Ug=q~a/s=@ =T4s=Fb`=Lb=I`D}=XbQ v=̺J=r;& =UD<=iU?=W=5~=O=&~6>J=|>-p|?==뙽}?e=/7w~?xp=~?@.=K?<漚&?y<\f,?[r<'*?AR%i>?M~a=>g?k~t>>*?v^><:hW> ̞Ge>@c>G@Z\/t'>lo>tA>^y+=h{΀,=|1=~[=~w:=~^=~9TX=~y|=7$ߥ=Y$F9= ;p=~a>=|7>ӽ{?X=Aӽ}?5B=HѽQ}?do=l8ѽ8~?|-=ҽk~?rA?j g>>Ryv> 1<٭iZ>IP>8 v> >T(X>BR[P#o@>Ȃką/>sAz!>Bx?c>z<>+|3=[}| =Y~O7x= :~ U3=0~]=&~ ==J~-=,l~7 =K~;=~< 8<=~\=?{B>[z?L =u{?m=8|?n=a|?,= 0.}?<?}?em?Ij W>>bvD>4%=h>j~>r1I>M [ >og;K'x>Y c>i؇T>rSAG>=cvGr :>x&*>szͯ;>|Bz>|4&>|? 
>| >|Ds>|.;>>5}5J>>|;$>|3<#>ٰ|'|ڮ<ă>|M<>|}v<>7} <>e"}?=v?>}\=>}՜=q>|D= > |S>Be >!z# End: Data binary 4 # End: Segment mumax3-3.10/test/testdata/oommf_ovf1_binary4.omf000066400000000000000000000072321371432437400216560ustar00rootroot00000000000000# OOMMF: rectangular mesh v1.0 # Segment count: 1 # Begin: Segment # Begin: Header # Title: /home/arne/wd/les/ingenieursproject/oommf/test.field0000-pass00000.omf # Desc: Field Index: 0 # Desc: Applied field (T): 0 0 0 # Desc: Iteration: 0 # Desc: Time (s): 0 # Desc: |m x h|: 0.39767182754665648 # Desc: User Comment: # meshtype: rectangular # meshunit: m # xbase: 1e-08 # ybase: 1e-08 # zbase: 1e-08 # xstepsize: 2e-08 # ystepsize: 2e-08 # zstepsize: 2e-08 # xnodes: 25 # ynodes: 10 # znodes: 1 # xmin: 0 # ymin: 0 # zmin: 0 # xmax: 5e-07 # ymax: 2e-07 # zmax: 2e-08 # valueunit: A/m # valuemultiplier: 800000 # ValueRangeMinMag: 1e-08 # ValueRangeMaxMag: 1 # End: Header # Begin: Data Binary 4 I8|-+=^'R6X2ȱ?#R?8Z> \Rޘg> Oƾ`yZ=Կx#>3??;*;0?zN3.?tH>\>?Do?=P??wjo >IZu!>X0C> ?ve>gbl> ޽|Y?gךֻ?[Ϊ8>PzUt?Kӿ/>bi >2خn/1#,>UQ>Sb?Znˏh4?V?v:νVbӞeNſG?ZѾ<ǿ&BL?e/>`߼>|">g¾Dk<)? FV!?_>g=E?t۾y3&%l>^ fyahR=PH =M>›fh_@_ЅU?%@g?:ofk6?0>?KV= ?q?Nf=O#?ܾ?RA>I?ql@0[>l> h8=Ѣ>޿|=Y=\?Yk?ltwA>Ժ˿OeB3%>ij ?1'y?'¦G9.?) 6|)9;b >06(?Yn?[Y6>X?Lv3 ~>bdݖ=?L_>̿&)0f"O<?JV>K"?7"m?s tX?틿d%WG|?oD1Z=L@?{K?G"?o>쒿a )%?I? c>>+Ŀ?hf~?cs>#?i@>ӱy?xʽiW4?:={?.P?Q>P?A>hR4^z,?a">۾da.$<>* ?3pϾ̿.'!ֿ:MT>?o >+οNa땿Jo>hm k>?$B? =?Yfi>لt> X? ? d>n>W{?w?-^>4Kÿ6El0=Ͳf?~F|>a>!?]?LMs=_=k?_;ԽiD?b>hA߾GDB>F!?&Yl>̍+?%Q?AE?!>%?HJwB?tqԻԹ£,?h:H@>od>4?_B3>/ cy$>?p>T?x]}?zGgv5>6=>??n*u?vfkR >9hm>ki2]?5y>2iڛ>e L>\L?ZF?>?'4;>D>V2p>͆?%"? &6>: 8@?o>? <@T?/2?8x=Q>$IFZ >y eh>= {?v=?Q|?(? ??:?A) Ծ%?$^qW1vf>G6=fb?tpa?ys>\ED=-m?x|=R~0&>q 2K2|W?>+[þ R;ۅ?ǿؿ?8m,?Hܾ]>@] V>j 3?g)ښ=E`i>Wifo ?U>?6@>͡?_?  }m>4׬,> ݿ2?4m7&#bD!?u=ϔ>? ?9k=`+?v>==m ?o&?![4W۾=L?cx:`~0FǾi?g?+B>nȿu5=|G`>7RQ? X>O?8>?@_?.I>pֿY,Դ8.y?sLyP6t >WQb(z=>4? 
>._?0D>冿?.&n>|?(,??&>Ὰ΋'j?f0߿o]s:M?+ >wJq>ɾ=(Tc;,?Xԫ>J>ܢb 2>>?$c'/??q=F ʿ7N?4|e(ο67>I>s?jy"󁴾}XX?Z?0>ȸ3?UŔx8>2B?$bX^?t>"2}z=ەG}?Zx>B>H*:?}Z>? 3^?0L`?lJ>>D>q0?dZ>$xI?Z>H|sfof>pKQ?x;?/{8}ӿ2?6"g-?vc3< ߾&>1>"rO+ >GHt1p ?a4?)?"a>˿ 很?J`?wMx\>C>0x!?to9p>W=i?&ʿ=>'R>t7ֿ(X?d'o?+<?dxh>T7?.PfP<ƿr >cNF\>.W߽3()b??ļ?kZd?\j>:>i UZRC!Fɾ,a?hΊ[WYh߾L!=Xb> />?I?5 A?T(>4?.>N>?N=?VOZs>5*z\{x?vW1>@WJ?rԾ4Xl>?>x-?6R7/\.pO9^iD? 6 # End: Data Binary 4 # End: Segment mumax3-3.10/test/testdata/oommf_ovf1_binary8.omf000066400000000000000000001666771371432437400217050ustar00rootroot00000000000000# OOMMF: rectangular mesh v1.0 # Segment count: 1 # Begin: Segment # Begin: Header # Title: Oxs_TimeDriver::Magnetization # Desc: Oxs vector field output # Desc: MIF source file: /home/syukri/workspace/oommf/std4.mif # Desc: Iteration: 5, State id: 20570 # Desc: Stage: 0, Stage iteration: 5 # Desc: Stage simulation time: 5.7528e-12 s # Desc: Total simulation time: 5.7528e-12 s # meshtype: rectangular # meshunit: m # xbase: 1e-08 # ybase: 1e-08 # zbase: 1e-08 # xstepsize: 2e-08 # ystepsize: 2e-08 # zstepsize: 2e-08 # xnodes: 100 # ynodes: 25 # znodes: 1 # xmin: 0 # ymin: 0 # zmin: 0 # xmax: 1.9999999999999999e-06 # ymax: 4.9999999999999998e-07 # zmax: 2e-08 # valueunit: A/m # valuemultiplier: 1 # ValueRangeMinMag: 799999.99999999977 # ValueRangeMaxMag: 800000.00000000023 # End: Header # Begin: Data Binary 8 B!w@!VL!.ښy@xS&$axBsD' 6@L+%v]>Do|k@H&)3}xG@9Zv&Cyl@~d&(n[,@sZ&y<ȃߊ@T@jl' 0kIJq@?']\ RF:@$ ' L[t@p"7(6_w2A1(fP۽dmJ(i+g@]2AO|_0/6(gc2X@Ԛqt셥 \ҝc(h2)b@vDf](hH@ohMvR(in,l@YaLѨ[ [V(i]l@b))HF(i@غl7D5L(ivQ@ܼAIU(iࢨTG{K^$[v(i*XZ6=;9Wo(i\/jUb=P"2b(iIo:/^s(i$޵=sX bl*(iӑP" ?(iOoHz@o |3(it;$T7T w?(iޫ vȷ$$!j(iv7?`SCw(iScV2(;nK(i&pZ԰(iۿЉ8&Y(iLu)!$g(iޘ jaH|$ZYY<(iDT2Vz4,|'mm(iݳb|V+`f۳(i@ &.$W(i}Qy42.v R(iڜ?ݑpΕ5m,R(i3.5"f59O(i߇xc}WW=Dt7(iWECZA((i"T{(}ED9e(igzƱKU#I(i>}6NnA(i$Լ@W>Մn[Xܨ(iОKN@uE2w+cOf(iXSL@ӿU(i߉{:@Qe@6T(i8.@ O]vtAƭ(iυd@O-P(ik@0(inq#-@x>ۑ3 (ie@myݥd(i I@y"m(i4'@0:Qgqd(i@X(7:/(ic]@3~}FSO(i-B|g@Z 
,|,(h^0@WQˏ(h@o[a#ν(hWAǪ @'hϳ U W(h#@f6}hə(gI醾@ &G`(h`@@`2_DK(h7]@՝d!(h᠆@g>-s ؾ((ip@ȔVd2XlfF(if@# tA(h(i\4@b ")n(iaU2{gˤ4(f )k)'dّ(][)(JvRl(:a>8!>7@ l!-'8 W0P,$e"Reà "@mRp@[:Gl( |?@ݻA$Ү+gz@]A'TYM!Ot@W A(;0glfŽ-@gvEA(\A`@pjFA(h;rl.g+@ՂF`A(h~Ts@Lls@z>A(\L@Ј9Գ@\A(D)B@9en@LA('ٻ8r@W8uT@&MGA( s A"@/GlNA' IEuAhWOoA'x6A ^a;^A'0LAtA&r3A1X5 Q6A${\AUSJD6A0(isA$'r5ҳ2Nywܺ[$u&@ӈY)!I!V5!;s.@ʝ#c*"#4}i$# '@ 8$Jԡ$.kA@řzU9_$SD`ᗩ}@6_cM%V,|s@K C%2M،4F@N+G%שeh/@%Ia&ptD};Ì̚@#l'De eE @æH,(—0W|@4z(h\'>Ve0V(ey#@Fnj:Sd(ivEV2D$(i#WH8@(i}ш6lL *^(iqqڠP*q(ikb 'Tp=(iT_%]"X (iA( q3]fV(iߢ;Bb~(i64UhhyS(iG~[ZOm잕(i6]ht;_ɕ(iɇyJǒ{S!(ivcha^0nrO(iUi;@Ff”v$I(iw@s'.0@(i‚@XN0(i$@>R(iݨ N@뻲7ak(iZi@k\Ni욋zt(iױ1@aO'(iܡ@oe6c(iȐR@ׁƳ(i[@ 9{Ftu(ig%@FyzN(i~J 5@0Uz>(iH3k(@9id/f0(hq@!]x#Zܐ(b(h]2@ύǣ"{ٲ (h:T@Қq"Dkn ](gv@UBL}~5!(f@״4%(fѵt@q(G J(fJ&@FD(9QW(f`c?@'"w(f1 @@Th3ۄݟ@(gs(@(V]`Dm(hi@Z!XsN(i ax@wˇ)p(ii$L{(geIS1,#NVgbq(\wz(/X(3NJje 'y1 Iiٳ+)e#d#:@5h_ aq&@K@!rzD@;섾w(dܹ@'RA۟K%oH|@ơ}RA$ wX@۟CyA'.e :52qNf@͚זeg&A(/Um>_@÷ў9A(_ɤ> u@嚀1żA(hP"@˩wm@L0A(O(@~C@9fpA(" !@Q(nR:@drwA'֗}amAkCYoIA' \nA d-]\3=AsA'P<A W2-^BA'^A.'dx/A&'zA"夗E!D2A&&l&AtV8YA$yЍ1AoA!¯j7A `$Ӟ"DAA%򴽛 ?2'AoA'ӣ#B3DLka71J%w:h@RXBxx#\  @{tD_ Ϝ!)q6@︱w"L4-1! 
(*A!@Wo#ÿW9#H@ȕ#EA$C@π$!ds~*@ƐHU$N'I#2x@ĝ՜%ϫŨ}h@ D&& k@oS:(O؈N,z^q(if\%fK%(J,(]Px @褘r,z%< ĵ(ZKG@`C/zw (^@-5aM _(cJ@ŷpdg~/K(g s@X{R (h9@Xdלfk(in"[ɂ@N_#1t(i"l@fha`y+(itU@2)bWQ(i:ld/My(i}Bz@E#(i|\Qp.Bc2(i=Sf(ib˹]JIXL8(ik!Iܾ{0(iL\e xKq#X(ifGpMy(i׿Ű%Q/(iؓH"w(V#F5#"(i]Zr B(i6@^B\n(ił3+uWrb>(ic1<"<.fg (iODcj(i}^8n2)(iX /ڀr/r(iwu #~"vʨw}(i릆+X{zm(iZ8(Yg$õ(i¶FYID(i$u W;(i|9ViMuk,(iC ;Q])r(isQKЖ '̋(i4z1e;%9:(iG4i)lP[(i9w@CeYS(im@s :^K`6(i߄*@K}t*e(iޱf>@9Zna(i^:e@!%]|(iG,@,H ɱ(i^@n-Э(ie@<|Hy]7(iʝ@"=7;D#(io@4:{Bb,Ep<(i=X@tԑw\/J(ieF@5ABш0(iQRh@Ĉےf} =(i.@{|b (hx@e'L(g9@U͹4`Ar(g%ߙ~m@iъ; h(fEm8 @X`;W1w(ei@U4g`ê;(d7@|Eu dB(dmP@7Nr)=0˿(dIA=)@ЏFPp(em@Gza](fȒ+T@din dDr(htz2n@|Nz6:a(iņ@}@۾hA (o'ֳ^Y@YhUAכT$bqlY@f2A$㧢Ķ:Zȝ@58-^A'uRG E>t@)W A(;b_=d*@7yA(im@7@G9A(=al0@K'p@8lTA'P}?Asss VA'O銝A [#(pXA&LmQvAwaAyEARA&Bt%A tf.&[A%nA+Lf`A$ݻ6AJ\= WlA",Am]neФ0yEAM A#6!ޥ2rJVAN|a@A&jg]v!!AH~0A'._PT@mpIA(L*"<2aha&My@֕O˸$l@\Y)J3 #Ae1Q_@b/Xƪ cL)"f@c!T!0WS[AK@? K"Ǭ= s.I )@]^" cA<@x(#' .w>J`R@3q$htWlJi@qp&K !@Ks(-xw_xg(.(gQzp@ͼ fV;JFm(Q0X@U1Riov((N+3@cpv:(W_/¹'@ m a# (_@A j(dJ@bH6;q(g X@ԴBɡ;UJ{(hg$@ n>!N(i#I@*ٴ4 ck(i @JEiɪ(iݼ|@Q}!UIvx9!(i-)⌧t~(ijTAKW?9 b(iڻnNV@w^&r(iIrwl=F`B(iIe+h= (iHL]XΔALfgF(iد&g Lu Ci((iE|YV m'0J#(ijFs&Sp/((iډDVGt%s(ibl(v7xg&(i۩g-!dt|emL(i'pJP:Qz(iܝ*5iF"[AGet(i :wlC(ivɱ/5K'}-(i,l5Z lYW(i=7XF}>((iޚ%y3ɿA˜hu (bTN@ M(c_O@0/bq(dMk@eC@qȪ(f_  @ZiGx(hX@͚@Ra30w(i׌SS"}XU(d'tRhf(H, ^Iރ"Og'tT! 
xU..<%wR+͂f@6#",*@ֵANp&u3@%$;|c'I@6:7nm(_E#@02~@h+ (Fn@}YA '?E^ %@?9tAǿ#c@Pfi%A%#c!@| DA'q-#@@EA(e h|Z+@>A(!s@D#M&q0A'PA ׏4ĮѵA&_6AbkF3A%gT8 A{`u.2N&A$F:C@A1]WǍ6Y43A"LAlEksA(R?X3"/sTa@SA(bDblgx&( &R~K@y|roݳ %'@ոEvq>$}8@)PyOfԍ#4"#@f"D+`֧F"g8@ѷ1 x Dp+!/q@б)8rY!{ZC_!|ׁ@ Tد"'S0^1@Ub$N_{~I,.@Ņ 3Щ&а^`WI1^@P\(H@(Ġ(M(^ m@Ea(B 5 t@LƬt(AyUU7@#P(Nn@:oJƣj}(ZY]K@i Jc^(b/j@oz8,k(fD7#@Y)Rs (hZ U@a,`(iLVT@тN@8q(i t@~lb/d(iդTRF@#Mf,g_P(i1!ѥ@K!vsm(iz/䘑]5(iݑ%kt| <{햺(i䱘Z(E휨-(i:fP ")(iڵx[OKZ,6_(i6knF)3(i,hl!ݱbI(i۔bpASL1c(i~(iR@sؙ;Gv(i`@f Hv%(ip@2*Xz|'(hTȗ@ϬlK&Q{(gvk@"o:(f@cEN/ź (et@/E$(cG=@^ȯ{$(bwyju@ꓤx_~w>(a\#@I!t fH(`O@b˷abM(a]wY@Y\#C+(c`O@i?ZMf(f~zu@ق愤-; z(iZ]b1@ :tJr&(hSZ4[=-0M q(WfK@Z,O(VjydAxBo&uv@ND1u"7g tKm@'xDZl6B0)=%,^$m@V+ia'@_m6's}@}PƢLب(Gz@ںi2@^v_(ewGS@ۺB@(&Ԯ@!nA- ||K&E#@鯲R_A57Bb"#d@IA/A%̞d/@L_A(5'y@$Mq@ɜ ]sA'vA s'>CL[A&A(pA/CWڜ#+A$}nA 69ƯVfA"ZkЁA]2qA 9=n2A"<pzA;ِe,A$pg_բ ZiAWF!A&pT{JFA,x3PA'íaq'1œ@}&^A(9R?Ni1y@'RA(];DwHϛw@MXA(enr5O+jZ2.$X8A(f5&B kq)'7o @?8&G˚Ta@ּPZf9%]~@zh=<_ Bw+$tX@Py q$#5: @:@0ʍ"n:Z=@@Rz 80!]t@ЧnK"5]s Au@kO9$| 6@DկK'R;bp&@źw (_i掟!} (Qw腞@/:y a]?(1;N@0 @(4VH@sq(D@1fZ7(TZs*4@$RP( `Hy5(^@6t3 <|Ѵ(dS@~?_(gG@*KPsS³v(h j@*yhQ&(i6@+j[3)(iƉ3@xpj<(iqv@QzT2ϧ(iߠ<@kw9`jiU(ik虫8nZZ(iG" !(:X&tu(ihpy[0y(iMxTG4(i*.Ȩm:(i*655Dai@(i3hnچxR[vj(iin5Bv08(iݟ$( , ϋ(i~x*ΌQ@ݐ(ijsQ+*~v(i3ih'6(icUb4TO"'Đ(iޓ1n TV駯(iĪNIgH u(iL:1&3(i'cFQ3&(iWzr> W(i߄Ld-ĘLcI(i߫J? 
ni(iͯy̆öaT(iK1xo-+4=k\(i%GFP{BMKʆ@(i߻3=@aaSEp zv(i{Z@wjKFp(i /a@]1ZG(iR2 @%Y{.(i3t@~0Uw](ivA@Kh3 ?Wt(iA@;7 4_GB(ir=7@5 D5=x(iwX@QT(i|@jjv,(iz@޽ޤ:q(i9@}VM(iVs@@U;m8L~(hG@ lOb .(hi@еM%JFDZ(g,@Zxd(fT˄8@4Sr`(dlOS@lp-c&(bK@JXjT Ԋ(a0.@|ʌV|濩s(_L$3@ D(_L@B?QRyl(`eH@$\(cj m@ᨒ'2E?2t(g0YS@ֹ n/ŧ%(iދ@Z?;](cAC.XO ?F|̊(2{ǒP!@-I"L( @ &w%,(Iby@ZG(fy@ۄS@ͬ(6gAZ@^:uAOZ!&=@fBwcA:"oEm@eHYA&{r@[A'GɐAsGǎfA#JAF̃gQA >k[A"70uEqlapAoA$bՐgA^aTA&үJ> fA,YfA'[u#]5}@>&NA(2;WV}OJ@NdVA(\ TXR@sf A(ev]w`݂A(ee\eTtNA(d&103h0)Hyy*A(c<= <'x.<=i@ؖmq).r& _@vS/H&|>%A;@ghW%9!N @U;far$v@;HJ' q#UY@mn 썌"]VFBR@G"VQ"1a g" @m$O$˖LvC-@П'!?{ &p C@v}oC(iY?{ccCVt& (AIuA@>w+ٸ("Uid@h-R#+(']3@[/JI(;tw@$bpyt,;(M/%b@btӬ([z@؁>=,m (b@3:u7(fDG@@Ā V(s(hx C@b:AI[:(iTة<@K @y9(iT@QR(i'@vEC̓*(i$,Vl@t (iߨtQa@`JN vixGO(iߜߑ7U~3BV(i"]O3ؠ-بn(iށdXM%R(iއxkSnܴr 8(iޕa?,s1(iޥ3A@(i޶­D]}(iȊmM _v4`!(i۫oIJLښb#VI(i]]` u,T(ibI6(i"LIds^4(i?J2Oo̐>!J(i]85luQ(i{IpLR'(iߘ-u#tmd8(i߲Q.}A q᫑(i. u.mh(i͗hݭcs(iƜ8Q;< 5(iߩR@czۺ5(im˭@w&(4Q7\(i@%EZtw(i^5j@Z`3 b(iW@$S5w:(i۽]Q@cTd?<'l(i3.W@첫T 2(i%@u2"/61Q(iTtP@Rs%(i@ =ww}(i4~e@tNDbt(i:M@ ÕЁ4'(iRs@`ٲ)AͣrM(h @ʥu2Y2n}΀(hQY⻷@@X<= !(g[`M@Xa*y~_(eY3&@aJ=bA("WS(*@9MlA([SC8{<@a3A(f6eب3A(b\õ۝)8|A(]PۨAN%wbA(Z$l/ە"td:A(Y2~[vǾ|f f4"'|n@r"\'%1&X]M@=-?S{k%+ejV@63⯀FJə$Q2@ԟy9q-#t@mMw"c,@{ap"IZ + @y %_;@µ r%-'?@yx(fde7|@ OBĴV(/2@xIK)4g<<(O'LQAr¶5D("@x@<"Zؑn(1[@^:(Gk*@eG:#(WJljH@*G(H+[f)(`lªyL@_&r(eoSS@@*ze} (g]}@_~xbQp(i[ñ@lܹ_(i@_B&Շ5SW(i`a@̠gtw<,gQ(i׎O@@G"-m&(iC@E6f(i߂H%@p,[Rm6)(i߾^iQc"Ӣģ(iߦ.Y|2 ʚ:(i߈$MjXWŷ 5(it *@ʺjCKl5(ihG @ ,Ld F(ia={FFbpSֻ(i\sf \+(iYnəfHY(iWWmqAVչ|(iXqwC<֑(iZ۷[r.Yc:(i`5qM}b,(iiʔAg(iu1}>e2(i߄QQj9y'È:^(iߕEP{ Ťi(iߦ~dr-EZ,h(i߼g5OCOI(iߣN7@as]~(iqp@t/x䖃(i@$7p(iޏo@&U-@`(iݰsn@](7}(iM@9+u, C(i~@8ySl(i]@=Ϻ2C:Bi(i_@DS>N1 (i4@M쯴s\(i7@cW|s\U(i1(`@ty(iT=D@<0B Ⱥn(h@h-i(hFAo@~'HLT(g=^u@|Ŧ5RK%(eU@9IqtS((cv@._K cZ(atj;@+Dqn ߞ(_Bje@槤g8"E(]vN@!ڼ&=Z(]Y@jIhtiM(`Tג@z4 N$(d @sMQB L(i#@jSpBB(d@BJ_ĥ:2GHq(*ׇXjOj'g=-}@r$'[~ Pף@Ƚ}hW8W A96f"4.@jƚ$|K:@(9T%@պ%SxC7&3w@ָ旟&$Ň@S. 
&t}lM&8@׳JBU$;H7&A+;@,@0&{s@.{a.t&š@ح-WZB}&kK@V?V02Rc%?CD@ݔ06" iNC@ψ8Z%lPABnG/gʆA%IW❶#Y q:!A'AԨ/bYߑs{냷A'a^Eت؎?A'ۚk2/.8iY\A(''c!hA(! rvYiA(,c-^SۻW(}T?A(6-ۣI+YA(AEooۃSw[F#A(L%MJ\2|axzA(W;J+ zh'Өh@ c_ kh;'0aC@>~ԟ@ƣ?&7H:2@]a j%力z;@Vqqo橹%th5@ v_xD͚R<$ʔ@ӑδ&  C"h8O@WA!"͔tB!@b~@˴%s&"Fa@Ye(7+(Z`'T@`iΎ}( F,2@^e/蜠 (A SpDҏЙ(IAN ۱-()pL@sO|(AX!@&+Z{u$U(SP&@'́=;(^@Ѓqs (d3~υ@bo4-(gIS)@Jd7AH(h@87\%N^(iiĒZE@D'Mz+Fx(i@F-M(iΡ_@~jV(iٴ_@@a I_")(iݶF@ #9 ~fD(i#qx@jaG(iߌ@i=j[GIi"9(i߰ @)9Nd(i߹ ^d׵ (iߺ_jYx*asԉПJ(i߸\Tqfb)$(i߳WtG(&-(i߮kxĎA7R(iߨQzͶ ̃M (iߢb\} @y.|*(iߝjh~2*c,iWq(iߚ-1y7t~ɇ7K(iߙAe Lh(iߚp7Ƀdf(iߟR~T|2!(iߥ_|)9 `/(i߭vylP(iߵeu AMOM(i߻Gp-1f[(i߽cӛՄ((i߸)p E6;8ٓd(iߦL։@WZ]R(i߁0@o'(iA@{MY8꧍g(i֘@0Jzpv(i'@:c!GTFN(i @0Q@S(i-l6@ A]e(ir@ ?+Gw|(isd@x|F I(iȇ;@![CkeU r(ia@8W3 aF6(i@M@0, (i[Z8]@ý̺Y ;jp(hf@lB(hGkjo@w->IXPf(g3G@֨4Һ"+(eM@ܪ4txJPXŵ(c|@y(a.w@W3vQsL(^@f]G~ThVF%or@ՕUEe'M2-%aJ@ݡ~z 7L%}@&3P^ %p!vp@ձRwUb7 M%Kf@7݆Ow$Zw@Rx!;88"]"N@ҀZfOI"=3Gx 9 C1@ ^Z&R-:ok k'ɠA/N"1]qARӬYr,OkX)A#mIgԳ|U+2A%n8O>k7}A&那rB&6 J9=H9~ߡ("x7,s@`G~4Hz(;տ@Yn)3(OC;MwN@_?Kv([ʧ=@*.eƓ"(b@s@X'"I(f;@+w$V (hf-@!z0K.(i;ɤ@OR dyOï(i~f7@ru} @7V!E(gN]#֌ 6; c9(7_6=I~EV='J{_~Ɗ F€@e0+A$ԾhGY9:M}@'`<!! 
r@ΦRUOkE"&@ѵܔRat#ɚ(@#F'Lٯ[s$Kp@2 ${@Ȇ@0FZ??$rcQ@& óV\&X8($6?Uވ@O>+k#Ֆ@*-8kb:>;"XQ`@lJDc aɡ!|I@luѽ\"ݍ1:2@ˀ(4B=$۠m(^} @X''YfMK Ʌd5@Gx#(P8J&6w@^2oh%tFAM] !!,A oܷ)AA$3WHWErH=;Ҵ A%Zrƞx?cvA&@) =~ A'-F,0A] ܍A'MxޕۄIer >A']ێۑS]j{A'k>ۇޱrQ+A("H`Yԩ0[A(BۻEz@YB֩{9T'8@S#>M fԦ'X͉@|ے^YN)=&3 c@ן&p?& H=G@օ*Tnrd%,Ne@컺#cN@#¸c!}2^M!j!@,J$S'v,@/xW'i^'6zw@5(iJo'NTt3(:wHT{@fPm5( e7^!AV.)EߑL'A_^xiÇ (e:zAu$ xi_j(18+@u';H]'2R(7Y-Y@EЮsџ_(K^_@Zd $(Y@[x6?ߘ(aw(@|e1(e=7 @۹p#ہ2(h@0f`zu(ih@KE|Ssug(i~0@3y˽ 0k(i}@~RV N(i40@r[dhv(iԪ@aA(vq?C?(iىRО@ouP%&(ihj@N}7Y^rW&5(i@y]@ʶzy(i!I@<[h؍(iތ w@'=˚(i3#@e*p/Bqؾ*(i߳A]q>˨u5(i߳Bbq'>ӸiM(iߴ]oi6oԌJa(i߶,(jq@D@\(i߶EcJ.ې+(i߳'S:4\OڱzKN(iߩ)@3Ah)ַ(iߔh(@bbO⎋2U(il&@r$#'"l37(i@]lO(iޓpa@/<.Ш(iݍt@;͓=cz(i۞gn@w/~g5ӓ(izC@!vϣ_.+y(iЧVJ@s`9XP&+(i…@+Q*3d9(i)e+@m* V (is3̢O@.PhQ(iO@Ș7=܀=AT(hk%G@Щk#p/b~`z(gV%w@JKO^_2A(e~М@h!ktϗlH(cn6ar@Ҵ;~ w-B(`Q/@1)oqG(^f+ͤk@;Ϊ@ad ,#4(]YC@gV_^(^l:-@Π$$0(c @} (i6C@o-Aq,5g(_@gv; ё ': s|&NZ@uPv$N Z 2k_N@z2j!< *] ܗy@ζw ̰ C j"a3b,@5hDrhb#`@5>vK^7}0#a*@ҞtA\qѪ#_u@ҞFK3M##ԉ~@I{kNJ" YӜ@Ѥ,( QWǦ!{@Ч/!+" r9@|]("%T9?@ʋR$nT^@0 (&"UC],@~'p#M A=s@BZ(h,@ΥI܎oƜ2'58dAHXpz{n$rAʆy-"dhh Wx_8iA"v~lqwA$Lٌu,nm=x0{A%zx i )tA&eo%2r$2<+A'.xS DvRqY ~vtCxA'C`xPYA'X~wECuA( f|CZd-ݑ*= A(7SG>jdT'܁ 8@Ymg $X']9@؁$le|&v[@לCO*Rۘ%M_@oWtZJ'$>],@㇊H _Q`A#V ?@қp" p@D >%bEGf*ױ[@Cÿn:(ơ8bVZW4?ZU7(d5a@5:)T^(,pQ@/0QD:(HA{uzӦ'WJA~TM(IA.} QÙ(74@(}>xvd (3\Yʿ@x3DR(H;@㰭ψ/'D(Wza}@۝-(`Tͅ+@zgQqm(e)<@"ӱv KM(g7@ ?2MC(h?@nl+ZD(i^ @zn$8{(i= @ ^X Q(i@?!AWD)(i̓f1@f5\Wh(iԫ @Д*ְ?p(ix1R@e+(iڭQa@Az/(iՕ@ͯ`tBvG(i5"Z@%Q+$ bSa(i߲opfw4 (i߱0q#nO@T(i߲al!pHo ԛ?6J(i߳vmMGnՌ.(iߴih}{m!A[m(i߳<`|B"mnKE(i߮VE#:=ܗf(iߟ <@S=Vx]ofK(i{F@nWղQ(i.Z@}Xkq(iލd7@` x(i;7@(}- Kb(i{/%@Ib;(i@1D?.TM(iU@ߢlW(iE@LfYΙk(i@|3S(i*XJ@*0%9{tã/(hd@SPϯВ(gn>@X2:*ZwHT(e)@ۻz+@ft(cԋ@W* (aejF@] tV(^g"@;FK&s(] _@F<~ w(`@@呃[ (ej4@ۀm'=O5(i5tYײ2KN؉(MGN zB㮝'G4@!%B2"9@ST#JV\s@#&!̻M {@#1nȏ !ϣI@sN8 0_4;"DR\-@r &/"X{2h@2· OcY"(ߺ@ NJ u&'_!Ag c@^Zv!k6p z!h@},C (i<#@Fdf((i/ob@8I&7V(i!@6L(iZ@fcy'9+(iΕ]@PAnX) i(i/@{1BVi(iONs@!X2 l&(iـV@B#%V$f(iV$@F \^c(iFKOG@zf d 
(i4w@S7#bMR(iҥ@"(i~%@@B1{ͧG(iB@YFլ[(i89q@{{G!] ѥ(ina@rMe}kJ(iߑ@czarVL7I(iߦx3@BEj)2qL(i߱ *P{jۂE(iߴb/- ؍*ި(iߴ%;k.'*v(i߱mmq,4/X"&(i߮sh-#`(i߬VqtY2-ZR((i߫yݨuSݱ 1L.K(i߬Ot&ESҠN@(i߮;Ur'tӹU(i߱o3՟]D(i߳re ?-P[؍Y\s(i߭@@PS%hf.wv)}@>Ed#w ~tCb@,Eh"#,S V8@ m^!p G#!e{@VZ!4D!!P @Ϟg{2!Lq!8 ~Z@]w;!|^] @n{Sw"(p Oj@="jű@|2i##ZXĸT@L-]$w:MEE=L@PԦ%e o'N"@;W.&Xpss6@o z'?= @])@*qޓL'cmVJ@M .!f*(c!rp_EJC(4%L(iߖ} ,I(iߕ~YǍ.ۺ(iߘc}i]0ε!t(iߟZzߌ`l?b(iߩqtuWF.@zi(i߲IkX&L0@(iߨPEP(dޟZo(i^PTt@tԾ@9wlG!(iau0@w?Jmo}E(iۥf@QrHJ{ư(iF]@`>'h@F /) 'Jَ֍@Q"BnHab&<X\@1Yq1Ldŗ~%yBn4@Վ 9#ݦ> @,?" KOj \$T@ڴ^=B&C v~ȉ;@n$@.(%)(ЫQ (g}@ct*Jב(@0@^6 d*Ƚ(:M-t-@6^DŽe(^A'oRÄpD5Ǥ'\_cA./$(ľAJ]O^u(\X@ 8ltϢ(/:k@yΏ r49u(D%:x@_ZgiBRYD0(Sl#m@|pa/90;\(]os9`@c N(cPuw@˙t44s +(f^v@vբr&du(h%KH@0Pc(r(h{`G@Bmvg1(i_@cb\5۱q2(i @Eԫ@Kɥ(iw@O=n1#(i;@Ro:/(i@>7 kp(i|2$@FIJVh:*(i@}VE(iRW@S,FN27mp'(ik2@{連"*8P(iI4@T塰3\~(iU@? ̓ (i@6l `u;(iј@}E__(i|@ugt?(i?o@8oJ(iO; @x 3ԥ@(i߆Ԙ@jjH(iߦY@IށuѦ(iߵ+W[mh(i߶|kJ#Bh\*p(i߯ktPڔьW(iߡ(z㩛6v(iߑu`-J(i߀"Jrt +(ipJ2>Z(idB^F#B{(i\{šEκ(i]O(-HB|(iha=dQ(i߀k] L?N (iߡ|&yFqY)<ݣ(i߲ aUٓudT(ig\@s @?E3v^b(i@2vn=(iم9Q@~E %z(i;%@XoW[)vL(iW0@.ro`D(iz^O@7gl(i ޿@#{o'(h@T@џu]Q>(fPc@װs$)D)?Ջq(e ֨@X`(b0u@MfWH(s(aU)@Pnxh `(aۣҊ@\SGp(eOk@&l~0/6(iaUc`(T Jz!j'#I @=ܿ&wIYe@H-%W}RGfg6@56&(#1(CNU<^@yY/\j#@N&PL+I@ɚ^5d# Ly$nkT@Isz# *Tni@־A#i$miy@ M:w3#r( z\,@ǘni%$SΖ D~)$@ſƤ$NNp"@Ï>O%uw bG@ d& ĥ1$@[Y+1&|`@"Ô@h*Z'4]#bXPE@U;%''[žbqy͂2@FL >('bP(&9X)@,E8')Aӛ545.^$ @}C-% XX@a:&LsS9@ 6>F&2뿭ek@ҡP'7y2Ɨ0@ J=gW'v~eL 0"@bC_>'&v dЈ@u(#va@*D](V$zcyC5(ig?,o[&j(P7@v j#(j I'BA5SX S&WJt!Az&A/ _#Ee2qARO3 żtyA"Tmmعof wA% O'܌͂awA&Kib9ڬ\wA'۪)$PVMd-A'(R9#_;+A(hF`ʺz'J@}b-9'!ה@\@ &r&%X@zi>P*$~@Ce0!9Ӿ!J.M@q%XͶ҈D%p*@a)'xDZgDG(iK4WZņu>q(V:V@xӦ{=(5bN@=bt{(Zo@8H0x(f"m@?Af`(gƨ@&(>(hd~@Mzm(i6)@ZܷCg݌Y/(iw@um~P(i|ك@(EϚ,(iDkJ@^'VhW,$(i6@ԕ!C O(i9@r@8(i@oF]?(it@Zc.L{qxK(ii!@#]dt@V>K^2| (i߽vGZK)H 'vXv(i߻ (Bqۮջl(iߦm^|5$o(i߃{(iWL6I;-&wX(i#W 5VyJx(iӅٳQ7(iް <& `N{(ixLxS\1m&d(iH)%?3l(i'LƽYĆh(i$Fi^w(iLZvט?(iޭɬ4sNL(i@ԕS(iߵ qS' mR(i"9@O\֤ U(iʲoB@=cɹnI4(iXT@j{L\$1+(iR@xt8Lwv6(ibC @'{5S2c_g(h@ 
Л%m>-B(gRԡ@~; UY(fm^X@w g5(d0@徥(k(dCS@?Ϗƅ(eOWG7@ݨԬׯ (i &@ ;+,`%(ex$)І?Or(2CHu,oW'.E O#D6@Yl~&f5l0@;t%bGte@C!$w pL:@#db$Vijy`@J$F~O-@u}4$ČML9T#@iA#ENF:ac@@~ń' cM_Hzn@q3EMi (Jʏd8s|`(h{/@ҁtݫ퇔&(P3j'@D0~P(5]@"kN:3(!^f@}5s(|@B]ˆK0p('@¥%3*(qD@aMT/D((7c@-=ŢXQy(7wQg@3\7(Gh,U@@m1Vo(T @B=/ pXAh(\H'Q@9ŨT(b@阶b8^(e@۞YRoX-(g@i5.S(hR=zj@ΐ}0v܅0(i.{RJI"(iCy8-L \(iy>ޜteJ%(i/S))6D+gv(ikpV[Z(i $ (<(i`VI`nnG (iǩd?U=q(iPY  3Й(i߾Ci)ꊿ(iHH@:$ya#(irP@1#+b3拑S(i}m3@#j}(i@\4TE(im@ z}~Ϩ(h[ֻ@O59gʒ(g=_H t@…֍9%>EemX@so8%tMUL#J@(%EA[]@?-&R-p0 @b&Ydi/L@`oJ&`;'yL@6 &vL@EE'En5G .{@sи{'c| @ʅ'lHxFyyD@5?(q|9F43A[<(4\;J07b ! Yr(TC$Bl{dKo(fS;x4'>i(h:_&@0ep%ooS(P)@t)hlv=']AX.L:&Ѧ5AW6ʒ %#l,OA` 2h9xU= $A" VD^.A%6S,#4c3AsjA&|ھ- (/ mA'd.^놮G$A'%2̮Ҽv'Sn@ع}&O&:@(L@ȫǢ*(8f @q 6((xZ)@gfr( @Z_~ n[P(`[@v#P($N2@µj (/h8@%941x( VGZS(J1zh@H(Ui@z4vlQ(]1 -@iⅭB(bЋ@vꦕu(eR@ۓg3i7e@(g @ԙy x)(hs~@*$&3(i -C@RE? (ib/"@*Nf S*xŲ(i"@U\:|Xq(iWr0@=C鿿jȇ~(i4x@^3@c+ [!ӳ(ix;|@۫w`(i@P>4F(i߉U@r<(iF@Bk)W(iOhHޏq(i߻mz< Em(iߍiI' |(iHgxA_(i3uƫ(jo(iމmywp G-։(iQ#O(iݕ;5cT}"(i{ $^sH(i܇1P1H(i < jКdAr6(iۧh)l'(izn(iݍzOOh =(ixcH(i߻<-@*qyrwА(i~@~V@ x (iL@lAS1RM7(i-#\@sē)@D(i[-(@"I38=(hʵ^K@̶ƕsIL/(g3i@=&*EV.(g[M@ctN Wy T(f` @]3 @EZuR(hA]oP@іwK8aF>(i 4؞EfQ?([%Ft͂ ~(6<(}E=,zu''k6v0 @%g6xI&{O8@0& @v ~@LS%ƛ{na@m|L%%}64\@%91βWC9@h@n& xrd@GJ&JNA,&@Ye&iF,`-@?U8&}?@8 nҽ/' \V!#@"@'Hp _ Y@w'd7 9-@&V'z;S@Y܏'jY ; @7T-e(,X3O|: ((;)~hd,%?$(Tݖ5Ւ5\`(do+bM^D%J(itc @",= d(aL g@[{vUr*(9d]@Ar\G i'VcAQYQ,;v& qYhAx|jy/o"VA W桻n A$׋ @-]CA&$^ڂ2 3 bA'8dCrLBfA'׏gEkf xl;'y[@[_A&lGK@֦6x9a2$h"@Գιr#7 :3b 05@<\' D .@1yB(EOn )}mQ(iߐ@5p'Ed=3^(^^i @ (ײ.gm(L5@YB^t(>8(iG@~ h=c(iV,^@|jmT,(iCi@O>ŦYA]9(i LQ@BC2"0,+(i'7E7@^'!Fns(iځ_j@}YX/(iM,@ٽ?.(iݤ۽@Jz씺t凉(iޛ@@J@ k&(iA߃@BMfs{#``(iߦ@m[[$'١U(i!?BTwIrT(iqq-!D(i߷c(g+./@zȦIñ (gT<@'=(;v(i7v2ED@G#~|?(i8.T8qjC0 '(V4`%3_ [W( jjd `vL&V'xŔn @=l^G&~4F,G&@zR6<&pB4=MA@G3&D̃ O@lJˌ&Gb4[{@? 
&fT<jV@&-ù@=0&Ç" s9@x&\*G?:@s'(p啪"@Π,۹'Z>)w {ni@ jR' 1NI8@r'bd1M-Ωޱ@ S"'ᤛR?Iy1]@`(*bA>FZN'((F%?PAgpH(BuV>5$;7l(V?9}E.#B*(c*mF+m@qK(iipꋴXC+Bv"ԅ(g U@onK(Wk:@uO9IC}(Bk@GKsyҶ'_ A 2^ƆQ)c%bbA2oѦqQ stϝA"D d~CGYlz%7aA%Z@GAږqȧI9A&=/ڪ R)v%|A'X"G* *'`E@׹'<H H<%(*/@DN^ &j!>)}@^,>NX%\-sq@'W…p V@ ңm&(cDVF ;TH_}Aa'j(gL@Yv?]BLi([E C@F6)~)C(N$˸ s@S{դ7(B/@=+(:E%@h#@ O(6 ^@Hc(5ͱ@j@%m]#(9h@Jң?|z3(?vi@݋ (Hci@ LC?ZnI(QH%@j߉E6AIP(Y%H@iyz(_H @8c6,(c4Ƌ@$(!qiփ^(f[ޢy@2fg(g҉w@S`Itl(h$@ΘxMʴ:b@(i @]K7v7(i`^r@`5竓#T(i;@(K,(i@WՂaq-o(ifU9>@ZWDA3j)(i%H@yYly (is@ʼnaq7{(i{b@$i"L[t(i-P@@|9c(iեr@ "o-(eh(iؤ@ՐT/w(i]@v> 8:(iܳMw-@i (i@˵reH)(i~@%L^) *5(i|f@zjΙ(iъ,`@aGy \(iUw`WRƫ(iɬxnxN(i߭@ʫIH(iXj[vU $((iE=rO,h(i]Y>IIӍGL(iݼXqF$+__@(i}e Py(i< 81Xm}t]1(iaQCDxymY(izmJ})(iَa*eazx?4(iذ(''VmsCɍ*W(i5zРosfK(iו 33Έmm(i׻4Y|nqf(iحC(()th~(iڔ~0ڨ. "(iA2|  \(iߜVp6徹BU(i޷@occ(iԁa@0}ic1 #(iX@g?z53(iy.v@QrA(iq^ @/Y0e(hip@:ޟF(hXް@ ~XɕID(i0@-i>(gFֱӖߧs?>0(S`ɤ8BJ2( Gz d'k(l S @V'ul@Huir?/&IMIK@ 8O&2 b @[}k"&į KN@Zu`&z@%D'@L,NU'?MSMzˆ@y%')E :7cr3t@ނ'PQc jU@a;'v~2 #S@@rV'9⏓yC@ַr'ppG^d@e@;M'fͥN-@A (?ͯՉ C@PX]*(,TC}lAJ;e(5|;bCckWY:(ID`8>_G(XGyo[b%v(b!x_@tV|v(h+t|-kxL (iC_@i,ra'T(c? A@h2,6nV0(Lêi[@м{j}C(ЉsA$H&l4 A*Eb4#%y-AMhS&Ԕ_i9iA#ƶPD("M"T*{0sA&pXl͒7ڨǞ^w^A'5̼_ژ/"5oGb'$¡I@҄dgҺ5 $cIo@7#]O#p\C@Ո.>d'= <-R3v:a8d1(N'"5gp(isv cy`(eI$I@ A` |=([[.IX@Y("m>#83p(QF@kpBd$(J6n@\Wif8W(DZT@PBo\t(A=or@.0nHx_(@Qt@>H딉W(Cd\+@S/ͤ;(H6@kcxrJ(OiT7]@(~kn; (Vc@_Wb"drW(f@8$@Sc,!\(h n"@һD3Zё(h¸o2@%ګdxϴ(i, K@8[H{(iiv@@« n%_(i@^B#A J+((i '_@*(it@L2у(i@ݨq;rh fz(iȿ_B@-d>\Pߝ(iˇC@n,;Q(in[s@ut 6<)'{(imW@)BV!(iţ@(q]O(iT(@)ԡic(ip@mYÈg(iސv@h gN(iU?V1@|}?սޡw0(iv@qdG! 
b(i@aa"3_4}\.(i ȱq-J=(iF zA(iߝ )L@BS'(i3(f'`8(iޭ|X-;p1 eQ (i 3ԗA(iTT%c+(i܄w|묠"7(i۝j-z?1E(iڡy'B t=(iٔ"@um;!N(i}y6gc(ijK@DAaTV+(iss +\k(iտ$!CY^(iՇ *X>L(i 9n`[[5(iך$6AcZrb(i@'s(iݏwP'Wڢ|(iH{5ə(iݾu@M!\M38<(iл"^@L~:9g(im@#uuElgF(i|ۤ@VVg(iVn@~v 4h(i)-@WbFV(iKyADBZōS}(f"po{rX'!%(QoK#0nFS{ZE/(~-Fоy4G'ݡh k@1'erU_ r@Bz''5KK|;N`@Nb'(F$@A N'3 URjrs@x'H09F \hJB@c_]v'c]sv Z,O@2o^'xw Tb@BZB'I⻯@,}'MҲk/x=@E$V'3wQ$dM:0@.T"'|E+ -@W (p~:_-$W(Twn9 4]d0<(/ov+c0 |(ANR?9k8#?|M(O4yri@}krlA(]w}@lhH66͆(W*=@ Ϧh(QA)@bWl(NC4@Im 7 TXL (LAY@@:(L+ݞv@]ޱU]#o(N $s@[alWJ(QT@#V(VM'+@^&+zSs([D@q"Uc~d(_@ qxϵcP(cIݛ@YtU(eZ+^@.Ez2(gc-3KD@ho^:(h]J6e@y@ ]kb(hc @ʤPn3(iG@71p`E(bx(i{:*}@5Ƽ<y iv(iq@mw-8m(i= @1>(i;Y@,bdFˏ(i*Yi@*S~GT1]T(i(;@77cb0'[\(iS%+@A k(iF@$ĦI RKH(iIq6@M'FMP(iے̊@}(D9(iG@7RK_v(iބׄ@NҢ@E(i`z@GUΗh(iB@w }wHU (i6^(@S(Zh8%`(iJ2fhp[$J,(i0ѐ|y~o(iV:_Oz'm(iߑ+\)$*C(iJԏo/OtyW(i}op|$(iINf|-1u*yU *"(iP0i rXvU(i)p;gk{

o^(i %DOWtNm+(iܲq&^IQC%(i֫ ;r%KrO(iՉĘ=shEAϣ(iԘb8|Aҏ(in4ZL?bv>(iU~+?(iRCD־(i**!i(O5dF(i]2`Ma(i~.(i32tBݨ \(iݶ @fU~3u(iҙYd@dA "T(iN@TOxHe5[";E(i@y/7H%2k(iP@l,T ?(i' IBI(e]U!-4 t(S}{Ɍl]k(%dH(R F'^'Ca2&7 @ ''&5Ls~@a@S'n 77@v'] %G+E@e '%ք bbYx@9M'+xji/@`f]5r'Iu WH@&'"/`@T'W3':Ó4G@ǧ:'湹Z]F@0 (:`3s^R@[-F(TW:-ub("'KHoBd\xnbd(20bj (@ +{Q#96δ F(L4&V5N:H(Vw##D/ j%r(]BG_פ(cT%](guY{0;8TAP~(iO4Tڗ(i5@]NQ(fg>n@%/tU𑭃(\`~@}4d;9A&)#1W@o,.%acc(ͦ@(]5#33]C[(\P9_ήF5W(i{ $!f5w+(hh@ΈDX;dq(d#z&@b"Tŷga(`ӉB}v@:A_(]T@BL9j (Z/Y@3a3m(W^T@{ivjdM(VȌ;@hz0ӗB(VP4L@gzSع*?(W5n@q+9Ej=(io.GYtT(e=>/b,i(X}MT /uxtˠoi(:n/ OPȷ (BeY&߈(7v߆C@|%j8Au'4^A B^,$&-A!cˈxNazFA$Xe/ߓe?%R:;ʯ@"eTv'aز ;;oeEN&(U9eu}B=ʆZrQI(gR\(i^@NEVh(h*$Z4@Bޝl(f@ؒ $yǨ(dn8@|`}(b[@I&3NJPlu(a@XRP@c(`{@P#6ul4(_G@ZdL+(_|@襁eвȼ(`@,k{F9G(a@ى­&~gp(c0kGe@}{c0(d6څ@k*R멌(f8+ODm@ڋ '$K(gh^?@geX(hGT@}P; 6(hb_@˼_fBЧ=(i> @:qJhR'Y(iy֋@e`#q (i%@ŬC1i2\(i$@ 98aXU(i®>@㚼hLN(i뾳@fyƍdhr(i,iM~@tZݻk(i֋ܽ@cG7){(iٰ#2@^ F#L(ie@ؿds dv(iݬ,y@^UƬ4af(imm@9(FV(iUϩ@]9a ?B(in]d@uG2~|AE(i֜D@t6u'=(i2@XDZYYmWs(i()^G-}Ee(i!7Et|Wo^x\B(i]AES(37WZQ(i(o)RITQU M(i<\tKDZk(i2gjGEw߰4(i'< ;?N9s(i^I4d-:z'w(ivCt53(iU*~0o(iݽAU -M-* (i ^X% (ibI?n }9(i۟ijvL(iJL+n~߿(iZOqmF=_#(i<[@ {2dV g(iQF_ 0L wK5(iףZ-kB(i03\p΢(i7p) !(i׏A=%x%o+(iئUP*&W_(if_f^l(iܝ̲C3* B:(i( +hh>Q&(iUʁE3AR p(iԥZ.[4D(i8W1©\@(i50rskΈ(iZ8M Ĭ"G'G(f8Q32an(_bJL#-9(P¨د1&(<,N jlo}8w8؟(-ADRO.*b(%u7e/($5yrG~H('*|L0r(+5G9)`o(0ZoU{Y[Ji=]>(6<Ǭ@נ8(o ֭0)(_/_0W-k?h(baj(3IIڻ(d>0(Q_WfCx(f}5&-O(h]YxM B1(iRl3+Cy˚(i }lm#;1L(iR@* 6"(hWȣ@HB`N(eϒ@ܑI4bqZL(\(@EeFË(;Q@x+Wvܾ'ڟc1AT=_!$e\c?d>?'%澅P*]?9"N??8kf辗w, A%= wQ=>~G"J?$F?(YZE#a~o>md?)> / <Σ m.h > [1] The expectation values <...> are calculated by taking time averages. The sums Σ... are taken over the different cells. The input temperature is chosen to be 177K. We allow an error smaller than 5K. NOTE: The exchange energy in MuMax3 is shifted by a constant with respect to atomistic simulations. 
Due to this difference, we need to add the following constant value to the divisor of [1]: shift = 2 * (Aex/Msat) * NCell * ( 2/Δx² + 2/Δy² ) */ package main import ( "github.com/mumax/3/cuda" . "github.com/mumax/3/engine" ) const kB = 1.38064852e-23 // Boltzmann constant func main() { defer InitAndClose()() // Prepare the PMA film Eval(` SetGridSize(128, 128, 1) SetCellSize(4e-9, 4e-9, 4e-9) SetPBC(1,1,0) Msat = 580e3 Aex = 15e-12 AnisU = Vector(0, 0, 1) Ku1 = 0.6e6 Alpha = 0.1 Temp = 177 M = Uniform(0, 0, -1) Run(1e-10) `) m := M.Buffer() h := cuda.Buffer(3, m.Size()) mxh := cuda.Buffer(3, m.Size()) cs := Mesh().CellSize() Vcell := cs[X] * cs[Y] * cs[Z] shift := 2 * Aex.GetRegion(0) / Msat.Average() * float64(Mesh().NCell()) * (2/(cs[X]*cs[X]) + 2/(cs[Y]*cs[Y])) // update the time averages in numerator and divisor of [1] in each step from now on divisor := 0.0 numerator := 0.0 nstep := 0.0 PostStep(func() { nstep += 1 SetDemagField(h) AddExchangeField(h) AddAnisotropyField(h) cuda.CrossProduct(mxh, m, h) divisor = ((nstep-1)*divisor + float64(cuda.Dot(m, h))) / nstep numerator = ((nstep-1)*numerator + float64(cuda.Dot(mxh, mxh))) / nstep }) Run(1e-10) temperature := (Vcell * Msat.Average() / (2 * kB)) * numerator / (divisor + shift) // [1] Expect("temperature", temperature, Temp.GetRegion(0), 5) } mumax3-3.10/test/timedep.mx3000066400000000000000000000004731371432437400157220ustar00rootroot00000000000000/* Test time dependent parameters. */ c:=4e-9 setgridsize(32, 32, 1) setcellsize(c, c, c) Msat=860e3 Aex=13e-12 alpha=0.2 m=uniform(1, 1, 0) anisU = vector(0, 1, 0) f := 1e9 Ku1 = 1e5 * sin(2 * pi * f * t) tableadd(Ku1) run(0.5e-9) TOL := 1e-5 expectv("m", m.average(), vector(0, 0.9909376502037048, 0), TOL) mumax3-3.10/test/timedep3.mx3000066400000000000000000000006221371432437400160010ustar00rootroot00000000000000/* Test time-dependent vector parameter with regions. 
*/ c:=4e-9 setgridsize(32, 32, 1) setcellsize(c, c, c) Msat=860e3 Aex=13e-12 alpha=0.2 m=uniform(1, 1, 0) setgeom(circle(32*4e-9)) f := 1e9 A := 0.01 B_ext = vector(A*sin(2*pi*f*t), A*cos(2*pi*f*t), 0) tableadd(B_ext) run(0.2e-9) TOL := 1e-5 expectv("m", m.average(), vector(0.6773565132629695, 0.7201919931496306, 0.02121575360227688), TOL) mumax3-3.10/test/timedep3Region.mx3000066400000000000000000000007721371432437400171530ustar00rootroot00000000000000/* Test time-dependent vector parameter with regions */ c:=4e-9 setgridsize(32, 32, 1) setcellsize(c, c, c) Msat=860e3 Aex=13e-12 alpha=0.2 m=uniform(1, 1, 0) setgeom(circle(32*4e-9)) f := 1e9 A := 0.01 defRegion(1, xrange(-inf, inf)) B_ext.setRegion(0, vector(0*t, 0, 0)) B_ext.setRegion(1, vector(A*sin(2*pi*f*t), A*cos(2*pi*f*t), 0)) B_ext.setRegion(2, vector(0*t, 0, 0)) run(0.2e-9) TOL := 1e-5 expectv("m", m.average(), vector(0.6773565132629695, 0.7201919931496306, 0.02121575360227688), TOL) mumax3-3.10/test/timedepRegion.mx3000066400000000000000000000006261371432437400170660ustar00rootroot00000000000000/* Test time dependent parameter with regions. */ c:=4e-9 setgridsize(32, 32, 1) setcellsize(c, c, c) Msat=860e3 Aex=13e-12 alpha=0.2 m=uniform(1, 1, 0) anisU = vector(0, 1, 0) defRegion(1, xrange(-inf, inf)) f := 1e9 Ku1.setRegion(1, 1e5 * sin(2 * pi * f * t)) run(0.5e-9) m_ := m.average() expect("mx", m_[0], 0, 1e-4) expect("my", m_[1], 0.99090, 1e-4) expect("mz", m_[2], 0, 1e-4) mumax3-3.10/test/timedepRegion2.mx3000066400000000000000000000006431371432437400171470ustar00rootroot00000000000000/* Test time dependent parameter with regions. 
*/ c:=4e-9 setgridsize(32, 32, 1) setcellsize(c, c, c) Msat=860e3 Aex=13e-12 alpha=0.2 m=uniform(1, 1, 0) anisU = vector(0, 1, 0) defRegion(1, xrange(0, inf)) f := 1e9 Ku1.setRegion(1, 1e5 * sin(2 * pi * f * t)) run(0.5e-9) m_ := m.average() print(m_) //expect("mx", m_[0], 0, 1e-4) //expect("my", m_[1], 0.99090, 1e-4) //expect("mz", m_[2], 0, 1e-4) mumax3-3.10/test/topologicalcharge-skyrmion.mx3000066400000000000000000000012731371432437400216310ustar00rootroot00000000000000/* Test topological charge calculation: for bubble/skyrmion S = -1,1,2... */ tol := 0.015 setgridsize(64, 128, 8) setcellsize(3e-9, 1.5e-9, 8e-9) // aka CoPt Msat = 1000e3 Aex = 10e-12 anisU = vector(0, 0, 1) Ku1 = 0.65e6 alpha = 1 m = blochskyrmion(1, 1).scale(4,4,1) steps(500) expect("Skyrmion number" , ext_topologicalcharge, 1.0, tol) m = blochskyrmion(-1, 1).scale(4,4,1) steps(500) expect("Skyrmion number" , ext_topologicalcharge, 1.0, tol) m = blochskyrmion(1, -1).scale(4,4,1) steps(500) expect("Skyrmion number" , ext_topologicalcharge, -1.0, tol) m = blochskyrmion(-1, -1).scale(4,4,1) steps(500) expect("Skyrmion number" , ext_topologicalcharge, -1.0, tol) mumax3-3.10/test/topologicalcharge-uniform.mx3000066400000000000000000000021061371432437400214310ustar00rootroot00000000000000/* Test topological charge calculation: for uniform state S = 0 */ setgridsize(64, 128, 8) setcellsize(3e-9, 1.5e-9, 4e-9) // aka CoPt Msat = 1000e3 Aex = 10e-12 anisU = vector(0, 0, 1) Ku1 = 0.65e6 alpha = 1 tol := 0.002 m = uniform(0.0, 0.0, 1.0) b_ext = vector(0.0, 0.0, 2.0) steps(50) expect("Skyrmion number" , ext_topologicalcharge, 0.0, tol) m = uniform(0.0, 0.0, -1.0) b_ext = vector(0.0, 0.0, -2.0) steps(50) expect("Skyrmion number" , ext_topologicalcharge, 0.0, tol) Ku1 = 0.0 tol = 5e-10 // changed by Arne, 5e-11 failed on GTX480 b_ext = vector(2.0, 0.0, 0.0) m = uniform(1.0, 0.0, 0.0) steps(50) expect("Skyrmion number" , ext_topologicalcharge, 0.0, tol) b_ext = vector(-2.0, 0.0, 0.0) m = uniform(-1.0, 0.0, 
0.0) steps(50) expect("Skyrmion number" , ext_topologicalcharge, 0.0, tol) b_ext = vector(0.0, 2.0, 0.0) m = uniform(0.0, 1.0, 0.0) steps(50) expect("Skyrmion number" , ext_topologicalcharge, 0.0, tol) b_ext = vector(0.0, -2.0, 0.0) m = uniform(0.0, -1.0, 0.0) steps(50) expect("Skyrmion number" , ext_topologicalcharge, 0.0, tol) mumax3-3.10/test/topologicalcharge-vortex.mx3000066400000000000000000000010761371432437400213060ustar00rootroot00000000000000/* Test topological charge calculation: for vortex S = -0.5, 0.5, 1.5... */ tol := 0.0005 setgridsize(64, 128, 8) setcellsize(3e-9, 1.5e-9, 4e-9) // aka Py Msat = 800e3 Aex = 13e-12 alpha = 1 m = vortex(1, 1) steps(500) expect("Skyrmion number" , ext_topologicalcharge, 0.5, tol) m = vortex(-1, 1) steps(500) expect("Skyrmion number" , ext_topologicalcharge, 0.5, tol) m = vortex(1, -1) steps(500) expect("Skyrmion number" , ext_topologicalcharge, -0.5, tol) m = vortex(-1, -1) steps(500) expect("Skyrmion number" , ext_topologicalcharge, -0.5, tol) mumax3-3.10/test/topologicalchargelattice-skyrmion.mx3000066400000000000000000000014001371432437400231670ustar00rootroot00000000000000/* Test topological charge calculation: for bubble/skyrmion S = -1,1,2... 
Based on topologicalcharge-skyrmion.mx3 */ tol := 0.005 setgridsize(64, 128, 8) setcellsize(3e-9, 1.5e-9, 8e-9) // aka CoPt Msat = 1000e3 Aex = 10e-12 anisU = vector(0, 0, 1) Ku1 = 0.65e6 alpha = 1 m = blochskyrmion(1, 1).scale(4,4,1) steps(500) expect("Skyrmion number" , ext_topologicalchargelattice, 1.0, tol) m = blochskyrmion(-1, 1).scale(4,4,1) steps(500) expect("Skyrmion number" , ext_topologicalchargelattice, 1.0, tol) m = blochskyrmion(1, -1).scale(4,4,1) steps(500) expect("Skyrmion number" , ext_topologicalchargelattice, -1.0, tol) m = blochskyrmion(-1, -1).scale(4,4,1) steps(500) expect("Skyrmion number" , ext_topologicalchargelattice, -1.0, tol) mumax3-3.10/test/topologicalchargelattice-uniform.mx3000066400000000000000000000022301371432437400227750ustar00rootroot00000000000000/* Test topological charge calculation: for uniform state S = 0 Based on topologicalcharge-uniform.mx3 */ setgridsize(64, 128, 8) setcellsize(3e-9, 1.5e-9, 4e-9) // aka CoPt Msat = 1000e3 Aex = 10e-12 anisU = vector(0, 0, 1) Ku1 = 0.65e6 alpha = 1 tol := 0.002 m = uniform(0.0, 0.0, 1.0) b_ext = vector(0.0, 0.0, 2.0) steps(50) expect("Skyrmion number" , ext_topologicalchargelattice, 0.0, tol) m = uniform(0.0, 0.0, -1.0) b_ext = vector(0.0, 0.0, -2.0) steps(50) expect("Skyrmion number" , ext_topologicalchargelattice, 0.0, tol) Ku1 = 0.0 tol = 5e-10 // changed by Arne, 5e-11 failed on GTX480 b_ext = vector(2.0, 0.0, 0.0) m = uniform(1.0, 0.0, 0.0) steps(50) expect("Skyrmion number" , ext_topologicalchargelattice, 0.0, tol) b_ext = vector(-2.0, 0.0, 0.0) m = uniform(-1.0, 0.0, 0.0) steps(50) expect("Skyrmion number" , ext_topologicalchargelattice, 0.0, tol) b_ext = vector(0.0, 2.0, 0.0) m = uniform(0.0, 1.0, 0.0) steps(50) expect("Skyrmion number" , ext_topologicalchargelattice, 0.0, tol) b_ext = vector(0.0, -2.0, 0.0) m = uniform(0.0, -1.0, 0.0) steps(50) expect("Skyrmion number" , ext_topologicalchargelattice, 0.0, tol) 
mumax3-3.10/test/topologicalchargelattice-vortex.mx3000066400000000000000000000012001371432437400226410ustar00rootroot00000000000000/* Test topological charge calculation: for vortex S = -0.5, 0.5, 1.5... Based on topologicalcharge-vortex.mx3 */ tol := 0.001 setgridsize(64, 128, 8) setcellsize(3e-9, 1.5e-9, 4e-9) // aka Py Msat = 800e3 Aex = 13e-12 alpha = 1 m = vortex(1, 1) steps(500) expect("Skyrmion number" , ext_topologicalchargelattice, 0.5, tol) m = vortex(-1, 1) steps(500) expect("Skyrmion number" , ext_topologicalchargelattice, 0.5, tol) m = vortex(1, -1) steps(500) expect("Skyrmion number" , ext_topologicalchargelattice, -0.5, tol) m = vortex(-1, -1) steps(500) expect("Skyrmion number" , ext_topologicalchargelattice, -0.5, tol) mumax3-3.10/test/uniaxial_full.mx3000066400000000000000000000015241371432437400171250ustar00rootroot00000000000000/* Test uniaxial anistorpy energy based on: The design and verification of MuMax3 AIP Advances 4, 107133 (2014); http://dx.doi.org/10.1063/1.4899186 Test for one random-ish angle. */ setGridSize(1, 1, 1) setCellSize(1e-9, 1e-9, 1e-9) V := pow(1e-9, 3) // Msat = 100e3 AnisU = vector(1, 0, 0) theta := 17*pi/180 m = uniform(cos(theta), sin(theta), 0) TOL := 1 Msat = 100e3 Ku1 = 0; Ku2 = 1e6 // try to trigger bad ku_red update steps(1) m = uniform(cos(theta), sin(theta), 0) Msat = 1000e3 expect("easy2", E_anis.get()/V, -836344.9375, TOL) Ku1 = 0; Ku2 = -1e6 expect("hard2", E_anis.get()/V, 836344.9375, TOL) Msat = 100e3 Ku1 = 1e6; Ku2 = 0 Msat = 1000e3 expect("easy1", E_anis.get()/V, -914519, TOL) Msat = 1e3 Ku1 = -10e6; Ku2 = 0 E_total.get() Msat = 1000e3 Ku1 = -1e6; Ku2 = 0 expect("hard1", E_anis.get()/V, 914519, TOL) mumax3-3.10/test/uniaxialanisotropy-minimize.mx3000066400000000000000000000030341371432437400220500ustar00rootroot00000000000000/* Test uniaxial anisotropy. We let the anisotropy compete with an external field and verify the minimized my against OOMMF values. 
*/ setgridsize(64, 64, 1) setcellsize(4e-9, 4e-9, 2e-9) Aex = 13e-12 alpha = 1 M = uniform(1, 1, 0) // define some regions to make sure anisotropy is applied everywhere // (when using Ku1 = ... syntax) defregion(2, ellipse(100e-9, 100e-9)) defregion(3, rect(100e-9, 20e-9)) // Test output save(Ku1) save(AnisU) save(B_anis) // Easy, in-plane AnisU = vector(1, 0, 0) Ku1 = 0.5e6 Msat = 1100e3 B_ext = vector(0, 0.00, 0) minimize() expect("my", m.average()[1], 0.000, 1e-3) B_ext = vector(0, 0.01, 0) minimize() expect("my", m.average()[1], 0.011, 1e-3) B_ext = vector(0, 0.03, 0) minimize() expect("my", m.average()[1], 0.033, 1e-3) B_ext = vector(0, 0.10, 0) minimize() expect("my", m.average()[1], 0.110, 1e-3) B_ext = vector(0, 0.30, 0) minimize() expect("my", m.average()[1], 0.331, 1e-3) // Hard, in-plane Ku1 = -0.5e6 m = uniform(-1, -2, -3) B_ext = vector(0, 0.00, 0) minimize() expect("my", m.average()[1], 1.000, 1e-3) B_ext = vector(0.01, 0, 0) minimize() expect("mx", m.average()[0], 0.011, 1e-3) B_ext = vector(0.10, 0, 0) minimize() expect("mx", m.average()[0], 0.110, 1e-3) AnisU = vector(100, 0, 0) // Test unnormalized U vector minimize() expect("mx", m.average()[0], 0.110, 1e-3) // should not make a difference (normalized internally) AnisU = vector(0, 0, 1) B_ext = vector(0, 0, 0) // Hard, perpendicular Ku1 = -1e6 minimize() expect("mz", m.average()[2], 0, 1e-3) mumax3-3.10/test/uniaxialanisotropy.mif000066400000000000000000000012571371432437400203020ustar00rootroot00000000000000# MIF 1.1 Ms:1100E3 A:13E-12 K1:0.5E6 Damp Coef:0.25 Anisotropy Type:uniaxial Anisotropy Init:constant Anisotropy Dir1:1 0 0 Anisotropy Dir2:0 1 0 Demag Type:constmag Part Height:256E-9 Part Width:256E-9 Part Thickness:2e-9 Part Shape:ellipse Cell Size:4e-9 Init Mag:uniform 45 45 Base Output Filename:anis-test Magnetization Output Format:binary 4 Total Field Output Format:binary 4 Data Table Output Format:%.15g Converge |mxh| Value:1.0E-5 Randomizer Seed:0 Field Range: 0 0 0 0 0 0 0 -time 
1E-9 Field Range: 0 10e-3 0 0 10e-3 0 0 -time 1E-9 Field Range: 0 30e-3 0 0 30e-3 0 0 -time 1E-9 Field Range: 0 100e-3 0 0 100e-3 0 0 -time 1E-9 Field Range: 0 300e-3 0 0 300e-3 0 0 -time 1E-9 mumax3-3.10/test/uniaxialanisotropy.mx3000066400000000000000000000027451371432437400202410ustar00rootroot00000000000000/* Test uniaxial anisotropy. We let the anisotropy compete with an external field and verify the relaxed my against OOMMF values. */ setgridsize(64, 64, 1) setcellsize(4e-9, 4e-9, 2e-9) Aex = 13e-12 alpha = 1 M = uniform(1, 1, 0) // define some regions to make sure anisotropy is applied everywhere // (when using Ku1 = ... syntax) defregion(2, ellipse(100e-9, 100e-9)) defregion(3, rect(100e-9, 20e-9)) // Test output save(Ku1) save(AnisU) save(B_anis) // Easy, in-plane AnisU = vector(1, 0, 0) Ku1 = 0.5e6 Msat = 1100e3 B_ext = vector(0, 0.00, 0) relax() expect("my", m.average()[1], 0.000, 1e-3) B_ext = vector(0, 0.01, 0) relax() expect("my", m.average()[1], 0.011, 1e-3) B_ext = vector(0, 0.03, 0) relax() expect("my", m.average()[1], 0.033, 1e-3) B_ext = vector(0, 0.10, 0) relax() expect("my", m.average()[1], 0.110, 1e-3) B_ext = vector(0, 0.30, 0) relax() expect("my", m.average()[1], 0.331, 1e-3) // Hard, in-plane Ku1 = -0.5e6 B_ext = vector(0, 0.00, 0) relax() expect("my", m.average()[1], 1.000, 1e-3) B_ext = vector(0.01, 0, 0) relax() expect("mx", m.average()[0], 0.011, 1e-3) B_ext = vector(0.10, 0, 0) relax() expect("mx", m.average()[0], 0.110, 1e-3) AnisU = vector(100, 0, 0) // Test unnormalized U vector relax() expect("mx", m.average()[0], 0.110, 1e-3) // should not make a difference (normalized internally) AnisU = vector(0, 0, 1) B_ext = vector(0, 0, 0) // Hard, perpendicular Ku1 = -1e6 relax() expect("mz", m.average()[2], 0, 1e-3) mumax3-3.10/test/vector.mx3000066400000000000000000000005361371432437400155750ustar00rootroot00000000000000/* Test basic vector math. 
*/ setgridsize(1,1,1) setcellsize(1,1,1) x := vector(1, 0, 0) y := vector(0, 1, 0) z := vector(0, 0, 1) a := vector(1, 2, 3) b := vector(4, 5, 6) tol := 0 expect("dot", a.dot(b), 4 + 10 + 18, tol) expect("cross", x.cross(y).x(), 0, tol) expect("cross", x.cross(y).y(), 0, tol) expect("cross", x.cross(y).z(), 1, tol) mumax3-3.10/test/zeemanenergy.mx3000066400000000000000000000006371371432437400167660ustar00rootroot00000000000000Nx := 128 Ny := 32 Nz := 2 cx := 5e-9 cy := 4e-9 cz := 3e-9 V := Nx * Ny * Nz * cx * cy * cz SetGridSize(Nx, Ny, Nz) SetCellSize(cx, cy, cz) Ms := 100e3 Msat = Ms M = Uniform(1, 0, 0) print(E_zeeman) B := 1e-3 tol := B*Ms*V / 1e5 B_ext = vector(B, 0, 0) expect("E", E_zeeman, -B*Ms*V, tol) B_ext = vector(0, B, 0) expect("E", E_zeeman, 0, tol) B_ext = vector(-B, 0, 0) expect("E", E_zeeman, B*Ms*V, tol) mumax3-3.10/test/zhangliPBC.mx3000066400000000000000000000007221371432437400162510ustar00rootroot00000000000000/* Test Zhang-li torque with PBCs. */ setPBC(1, 0, 0) setGridSize(256, 32, 1) c := 5e-9 setCellSize(c, c, c) Msat = 800e3 Aex = 13e-12 alpha = 3 m = twodomain(1,0,0, 0,1,0, -1,0,0) m.setInShape(xrange(-inf, -120*c), uniform(0,-1,0)) run(1e-9) alpha = 0.01 xi = 0.1 J = vector(1e12, 0, 0) Pol = 1 run(1e-9) m1 := m.average() expect("mx", m1[0], -0.081425920, 1e-4) expect("my", m1[1], -0.003434650, 1e-4) expect("mz", m1[2], -0.015030215, 1e-4) mumax3-3.10/timer/000077500000000000000000000000001371432437400137775ustar00rootroot00000000000000mumax3-3.10/timer/Makefile000066400000000000000000000000241371432437400154330ustar00rootroot00000000000000all: go install -v mumax3-3.10/timer/timer.go000066400000000000000000000037241371432437400154540ustar00rootroot00000000000000package timer import ( "fmt" "io" "sort" "time" ) var ( clocks map[string]*clock firstStart time.Time ) func Start(key string) { if clocks == nil { clocks = make(map[string]*clock) firstStart = time.Now() } if c, ok := clocks[key]; ok { c.Start() } else { clocks[key] = new(clock) 
// do not start, first run = warmup time } } func Stop(key string) { clocks[key].Stop() } type clock struct { total time.Duration started time.Time invocations int } func (c *clock) Start() { c.started = time.Now() c.invocations++ } func (c *clock) Stop() { if (c.started == time.Time{}) { return // not started } d := time.Since(c.started) c.total += d c.started = time.Time{} } // entry for sorted output by Print() type entry struct { name string total time.Duration invocations int pct float32 } func (e *entry) String() string { perOp := time.Duration(int64(e.total) / int64(e.invocations)) return fmt.Sprint(pad(e.name), pad(fmt.Sprint(e.invocations, "x")), perOp, "/op\t", e.pct, " %\t", e.total, " total") } func pad(s string) string { if len(s) >= 20 { return s } return s + " "[:20-len(s)] } type entries []entry func (l entries) Len() int { return len(l) } func (l entries) Less(i, j int) bool { return l[i].total > l[j].total } func (l entries) Swap(i, j int) { l[i], l[j] = l[j], l[i] } func Print(out io.Writer) { if clocks == nil { return } wallTime := time.Since(firstStart) lines := make(entries, 0, len(clocks)) var accounted time.Duration for k, v := range clocks { pct := 100 * float32(int64(v.total)) / float32(int64(wallTime)) lines = append(lines, entry{k, v.total, v.invocations, pct}) accounted += v.total } unaccounted := wallTime - accounted pct := 100 * float32(int64(unaccounted)) / float32(int64(wallTime)) lines = append(lines, entry{"NOT TIMED", unaccounted, 1, pct}) sort.Sort(lines) for _, l := range lines { fmt.Fprintln(out, &l) } } mumax3-3.10/util/000077500000000000000000000000001371432437400136345ustar00rootroot00000000000000mumax3-3.10/util/Makefile000066400000000000000000000000241371432437400152700ustar00rootroot00000000000000all: go install -v mumax3-3.10/util/atom.go000066400000000000000000000003131371432437400151200ustar00rootroot00000000000000package util import "sync/atomic" // Atomic int type Atom int32 func (a *Atom) Add(v int32) { 
atomic.AddInt32((*int32)(a), v) } func (a *Atom) Load() int32 { return atomic.LoadInt32((*int32)(a)) } mumax3-3.10/util/format.go000066400000000000000000000040121371432437400154500ustar00rootroot00000000000000package util import ( "bytes" "fmt" "io" "os" ) // Produces nicely formatted output for multi-dimensional arrays. func Println(array ...interface{}) { Fprint(os.Stdout, array...) fmt.Fprintln(os.Stdout) } // Produces nicely formatted output for multi-dimensional arrays. func Print(array ...interface{}) { Fprint(os.Stdout, array...) } // Produces nicely formatted output for multi-dimensional arrays. func Printf(format string, array ...interface{}) { Fprintf(os.Stdout, format, array...) } // Produces nicely formatted output for multi-dimensional arrays. func Fprint(out io.Writer, array ...interface{}) { Fprintf(out, "%v", array...) } func Sprint(array ...interface{}) string { var buf bytes.Buffer Fprint(&buf, array...) return buf.String() } // Produces nicely formatted output for multi-dimensional arrays. func Fprintf(out io.Writer, format string, array ...interface{}) { for _, arr := range array { switch a := arr.(type) { case [][][]float32: FprintfFloats(out, format, a) case [][][][]float32: FprintfTensors(out, format, a) case [3][][][]float32: FprintfTensors(out, format, a[:]) case [3][3][][][]float32: Fprintf(out, format, a[0][:]) Fprintf(out, format, a[1][:]) Fprintf(out, format, a[2][:]) default: fmt.Fprintf(out, format, a) } } } // Produces nicely formatted output. func FprintfTensors(out io.Writer, format string, a [][][][]float32) { for i := range a { FprintfFloats(out, format, a[i]) fmt.Fprintln(out) } } // Produces nicely formatted output. func FprintfFloats(out io.Writer, format string, a [][][]float32) { format += " " for i := range a { for j := range a[i] { for _, v := range a[i][j] { fmt.Fprintf(out, format, v) } fmt.Fprintln(out) } fmt.Fprintln(out) } } //// Produces nicely formatted output. 
//func FprintComplexs(out io.Writer, a [][][]complex64) { // for i := range a { // for j := range a[i] { // for _, v := range a[i][j] { // fmt.Fprint(out, v, " ") // } // fmt.Fprintln(out) // } // fmt.Fprintln(out) // } //} mumax3-3.10/util/log.go000066400000000000000000000041331371432437400147450ustar00rootroot00000000000000package util // Logging and error reporting utility functions import ( "fmt" "log" "runtime" "sync" "time" ) func Fatal(msg ...interface{}) { log.Fatal(msg...) } func Fatalf(format string, msg ...interface{}) { log.Fatalf(format, msg...) } // If err != nil, trigger log.Fatal(msg, err) func FatalErr(err interface{}) { _, file, line, _ := runtime.Caller(1) if err != nil { log.Fatal(file, ":", line, err) } } // Panics if err is not nil. Signals a bug. func PanicErr(err error) { if err != nil { log.Panic(err) } } // Logs the error of non-nil, plus message func LogErr(err error, msg ...interface{}) { if err != nil { log.Println(append(msg, err)...) } } func Log(msg ...interface{}) { log.Println(msg...) } // Panics with "illegal argument" if test is false. func Argument(test bool) { if !test { log.Panic("illegal argument") } } // Panics with msg if test is false func AssertMsg(test bool, msg interface{}) { if !test { log.Panic(msg) } } // Panics with "assertion failed" if test is false. func Assert(test bool) { if !test { log.Panic("assertion failed") } } // Hack to avoid cyclic dependency on engine. var ( progress_ func(int, int, string) = PrintProgress progLock sync.Mutex ) // Set progress bar to progress/total and display msg // if GUI is up and running. 
func Progress(progress, total int, msg string) { progLock.Lock() defer progLock.Unlock() if progress_ != nil { progress_(progress, total, msg) } } var ( lastPct = -1 // last progress percentage shown lastProgT time.Time // last time we showed progress percentage ) func PrintProgress(prog, total int, msg string) { pct := (prog * 100) / total if pct != lastPct { // only print percentage if changed if (time.Since(lastProgT) > time.Second) || pct == 100 { // only print percentage once/second unless finished fmt.Println("//", msg, pct, "%") lastPct = pct lastProgT = time.Now() } } } // Sets the function to be used internally by Progress. // Avoids cyclic dependency on engine. func SetProgress(f func(int, int, string)) { progLock.Lock() defer progLock.Unlock() progress_ = f } mumax3-3.10/util/util.go000066400000000000000000000010321371432437400151340ustar00rootroot00000000000000// package util provides common utilities for all other packages. package util import ( "net" "path" "strings" ) // Remove extension from file name. func NoExt(file string) string { ext := path.Ext(file) return file[:len(file)-len(ext)] } // returns all network interface addresses, without CIDR mask func InterfaceAddrs() []string { addrs, _ := net.InterfaceAddrs() ips := make([]string, 0, len(addrs)) for _, addr := range addrs { IpCidr := strings.Split(addr.String(), "/") ips = append(ips, IpCidr[0]) } return ips }