/home/tgrogers/github/purdue-aalp/gpgpu-sim_simulations/util/correlation/../../benchmarks/bin/10.1/release/transpose Starting...

> Device 0: "Quadro GV100"
> SM Capability 7.0 detected:
> [Quadro GV100] has 80 MP(s) x -1 (Cores/MP) = -80 (Cores)
> Compute performance scaling factor = 1.00
> MatrixSize X = 512
> MatrixSize Y = 512

Matrix size: 512x512 (32x32 tiles), tile size: 16x16, block size: 16x16


transpose-Outer-simple copy       , Throughput = 0.0378 GB/s, Time = 51.65699 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-simple copy       , Throughput = 0.0343 GB/s, Time = 57.00006 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-shared memory copy, Throughput = 0.0380 GB/s, Time = 51.36454 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-shared memory copy, Throughput = 0.0409 GB/s, Time = 47.80765 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-naive             , Throughput = 0.0435 GB/s, Time = 44.91475 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-naive             , Throughput = 0.0411 GB/s, Time = 47.54051 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-coalesced         , Throughput = 0.0302 GB/s, Time = 64.77648 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-coalesced         , Throughput = 0.0304 GB/s, Time = 64.16253 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-optimized         , Throughput = 0.0354 GB/s, Time = 55.22713 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-optimized         , Throughput = 0.0315 GB/s, Time = 62.02605 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-coarse-grained    , Throughput = 0.0353 GB/s, Time = 55.36605 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-coarse-grained    , Throughput = 0.0299 GB/s, Time = 65.29514 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-fine-grained      , Throughput = 0.0336 GB/s, Time = 58.06659 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-fine-grained      , Throughput = 0.0348 GB/s, Time = 56.19321 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-diagonal          , Throughput = 0.0360 GB/s, Time = 54.18688 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-diagonal          , Throughput = 0.0373 GB/s, Time = 52.37827 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
