/home/tgrogers/github-purdue/gpgpu-sim_simulations/util/correlation/../../benchmarks/bin/9.1/release/transpose Starting...

> Device 0: "GeForce GTX 1080 Ti"
> SM Capability 6.1 detected:
> [GeForce GTX 1080 Ti] has 28 MP(s) x -1 (Cores/MP) = -28 (Cores)
> Compute performance scaling factor = 1.00

Matrix size: 512x512 (32x32 tiles), tile size: 16x16, block size: 16x16


transpose-Outer-simple copy       , Throughput = 111.5816 GB/s, Time = 0.01750 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-simple copy       , Throughput = 84.7710 GB/s, Time = 0.02304 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-shared memory copy, Throughput = 125.3289 GB/s, Time = 0.01558 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-shared memory copy, Throughput = 90.6912 GB/s, Time = 0.02154 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-naive             , Throughput = 82.2576 GB/s, Time = 0.02374 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-naive             , Throughput = 54.4471 GB/s, Time = 0.03587 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-coalesced         , Throughput = 132.6851 GB/s, Time = 0.01472 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-coalesced         , Throughput = 89.6258 GB/s, Time = 0.02179 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-optimized         , Throughput = 134.7354 GB/s, Time = 0.01450 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-optimized         , Throughput = 93.7560 GB/s, Time = 0.02083 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-coarse-grained    , Throughput = 132.9742 GB/s, Time = 0.01469 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-coarse-grained    , Throughput = 95.8166 GB/s, Time = 0.02038 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-fine-grained      , Throughput = 133.2645 GB/s, Time = 0.01466 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-fine-grained      , Throughput = 95.6664 GB/s, Time = 0.02042 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-diagonal          , Throughput = 137.7769 GB/s, Time = 0.01418 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-diagonal          , Throughput = 87.4429 GB/s, Time = 0.02234 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
