/home/tgrogers/github/purdue-aalp/gpgpu-sim_simulations/util/correlation/../../benchmarks/bin/10.1/release/transpose Starting...

> Device 0: "TITAN X (Pascal)"
> SM Capability 6.1 detected:
> [TITAN X (Pascal)] has 28 MP(s) x -1 (Cores/MP) = -28 (Cores)
> Compute performance scaling factor = 1.00
> MatrixSize X = 512
> MatrixSize Y = 512

Matrix size: 512x512 (32x32 tiles), tile size: 16x16, block size: 16x16


transpose-Outer-simple copy       , Throughput = 0.2395 GB/s, Time = 8.15533 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-simple copy       , Throughput = 0.3872 GB/s, Time = 5.04442 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-shared memory copy, Throughput = 0.4114 GB/s, Time = 4.74710 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-shared memory copy, Throughput = 0.3399 GB/s, Time = 5.74589 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-naive             , Throughput = 0.4520 GB/s, Time = 4.32074 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-naive             , Throughput = 0.4607 GB/s, Time = 4.23965 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-coalesced         , Throughput = 0.4766 GB/s, Time = 4.09811 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-coalesced         , Throughput = 0.3749 GB/s, Time = 5.20954 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-optimized         , Throughput = 0.3997 GB/s, Time = 4.88698 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-optimized         , Throughput = 0.3883 GB/s, Time = 5.03040 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-coarse-grained    , Throughput = 0.3867 GB/s, Time = 5.05133 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-coarse-grained    , Throughput = 0.4268 GB/s, Time = 4.57610 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-fine-grained      , Throughput = 0.3393 GB/s, Time = 5.75581 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-fine-grained      , Throughput = 0.4209 GB/s, Time = 4.64083 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256

transpose-Outer-diagonal          , Throughput = 0.4187 GB/s, Time = 4.66429 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
transpose-Inner-diagonal          , Throughput = 0.4250 GB/s, Time = 4.59542 s, Size = 262144 fp32 elements, NumDevsUsed = 1, Workgroup = 256
