/home/tgrogers/github/purdue-aalp/gpgpu-sim_simulations/util/correlation/../../benchmarks/bin/9.1/release/transpose Starting...

> Device 0: "Quadro GV100"
> SM Capability 7.0 detected:
> [Quadro GV100] has 80 MP(s) x -1 (Cores/MP) = -80 (Cores)
> Compute performance scaling factor = 1.00
> MatrixSize X = 128
> MatrixSize Y = 128

Matrix size: 128x128 (8x8 tiles), tile size: 16x16, block size: 16x16


transpose-Outer-simple copy       , Throughput = 4.4254 GB/s, Time = 0.02758 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-simple copy       , Throughput = 4.0930 GB/s, Time = 0.02982 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-shared memory copy, Throughput = 5.0863 GB/s, Time = 0.02400 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-shared memory copy, Throughput = 4.6692 GB/s, Time = 0.02614 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-naive             , Throughput = 5.0795 GB/s, Time = 0.02403 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-naive             , Throughput = 4.7923 GB/s, Time = 0.02547 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-coalesced         , Throughput = 5.3956 GB/s, Time = 0.02262 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-coalesced         , Throughput = 4.3202 GB/s, Time = 0.02826 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-optimized         , Throughput = 5.2544 GB/s, Time = 0.02323 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-optimized         , Throughput = 4.6578 GB/s, Time = 0.02621 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-coarse-grained    , Throughput = 5.4263 GB/s, Time = 0.02250 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-coarse-grained    , Throughput = 4.8349 GB/s, Time = 0.02525 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-fine-grained      , Throughput = 5.4574 GB/s, Time = 0.02237 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-fine-grained      , Throughput = 4.5795 GB/s, Time = 0.02666 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-diagonal          , Throughput = 5.5285 GB/s, Time = 0.02208 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-diagonal          , Throughput = 4.7153 GB/s, Time = 0.02589 s, Size = 16384 fp32 elements, NumDevsUsed = 1, Workgroup = 256
