FFT Speedtest comparing Tensorflow, PyTorch, CuPy, PyFFTW and NumPy
FFT Speedtest comparing Tensorflow, PyTorch, CuPy, PyFFTW and NumPy.¶
See bottom of page for graphs.
I test the performance of taking an inverse 2D FFT of the regular 2D FFT of arrays of size 512x512, 1024x1024, 2048x2048 and 4096x4096. The data type is set to complex 64-bit (the complex-number equivalent of float32) for compatibility.
GPUs are individual NVIDIA RTX 2080 Ti cards with 11 GB of RAM.
The system has 4 of them, each GPU fft implementation runs on its own GPU.
CPU is a 28-core Intel Xeon Gold 5120 CPU @ 2.20GHz
Test by @thomasaarholt
TLDR: PyTorch GPU is the fastest; it is 4.5 times faster than TensorFlow GPU and CuPy, and the PyTorch CPU version outperforms every other CPU implementation by at least 57 times (including PyFFTW).
My best guess as to why the PyTorch CPU solution is better is that it is possibly better at taking advantage of the multi-core CPU system the code ran on.
%matplotlib inline
import tensorflow as tf
import torch
import cupy as cp
import numpy as np
import matplotlib.pyplot as plt
import pyfftw
# Print numpy see whether mkl/blas is available
np.show_config()
from tensorflow.python.client import device_lib
def get_available_gpus():
    """Return the names of all local devices that TensorFlow reports as GPUs."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names
# List the GPU devices TensorFlow can see.
get_available_gpus()
# Record the version of every library taking part in the comparison.
print('TensorFlow: {}'.format(tf.__version__))
print('PyTorch: {}'.format(torch.__version__))
print('Numpy: {}'.format(np.__version__))
print('CuPy: {}'.format(cp.__version__))
print('pyFFTW: {}'.format(pyfftw.__version__))
# Helper functions for each package
def tf_ifft2_fft2(data):
    """Apply TensorFlow's 2D FFT to `data`, then the 2D inverse FFT to that result."""
    return tf.signal.ifft2d(tf.signal.fft2d(data))
def torch_ifft2_fft2(data):
    """Round-trip `data` through PyTorch's 2D FFT followed by its 2D inverse FFT.

    Args:
        data: Real tensor whose trailing dimension has size 2 and holds the
            (real, imaginary) parts of each complex value.

    Returns:
        The inverse FFT of the FFT of `data`, in the same real (..., 2) layout.
    """
    if hasattr(getattr(torch, "fft", None), "fft2"):
        # Newer PyTorch: `torch.fft` is a module and the old function-style
        # `torch.fft(...)` / `torch.ifft(...)` calls no longer exist.
        # view_as_complex / view_as_real reinterpret the (..., 2) layout
        # without copying, so the timed work is still just the two FFTs.
        spectrum = torch.fft.fft2(torch.view_as_complex(data))
        return torch.view_as_real(torch.fft.ifft2(spectrum))

    # Legacy PyTorch: function-style API operating on the (..., 2) layout.
    signal_ndim = 2
    data2 = torch.fft(data, signal_ndim=signal_ndim)
    return torch.ifft(data2, signal_ndim=signal_ndim)
def np_ifft2_fft2(data):
    """Apply NumPy's 2D FFT to `data`, then the 2D inverse FFT to that result."""
    return np.fft.ifft2(np.fft.fft2(data))
def cp_ifft2_fft2(data):
    """Apply CuPy's 2D FFT to `data`, then the 2D inverse FFT to that result."""
    return cp.fft.ifft2(cp.fft.fft2(data))
def pyfftw_ifft2_fft2(data):
    """Apply pyFFTW's NumPy-style 2D FFT to `data`, then the 2D inverse FFT."""
    fftw_numpy = pyfftw.interfaces.numpy_fft
    spectrum = fftw_numpy.fft2(data)
    return fftw_numpy.ifft2(spectrum)
Checking that the functions produce the same result¶
# Using some helper functions to turn all data into numpy arrays
from pyms.utils import cx_from_numpy, cx_to_numpy

data = np.random.random((20,20)).astype('complex64')

# Run the same round trip through every package and bring each result back
# to the host as a NumPy array so they can be compared directly.
dtf = tf_ifft2_fft2(data).numpy()
dtorch = cx_to_numpy(torch_ifft2_fft2(cx_from_numpy(data)))
dcp = cp.asnumpy(cp_ifft2_fft2(cp.array(data)))
dnp = np_ifft2_fft2(data)
dfftw = pyfftw_ifft2_fft2(data)

# Data is equal to six, but not to seven, decimals (good enough for our purposes).
# TensorFlow's result is the reference; each other package is checked once.
for other in (dtorch, dcp, dnp, dfftw):
    np.testing.assert_almost_equal(dtf, other, decimal=6)
Setting up measurement¶
sizes = [512, 1024, 2048, 4096] # X * X pixel arrays/tensors
# Plot labels, one appended by each benchmark section below.
names = []
# One list of timing results per benchmark section, appended in the same order as `names`.
totaltimes = []
# GPU index inserted into TensorFlow's "/device:GPU:{}" device string.
tfdevice = 1
# GPU index inserted into PyTorch's 'cuda:{}' device string.
torchdevice = 3
# GPU index passed to cp.cuda.Device for CuPy.
cpdevice = 2
TensorFlow CPU¶
name = 'TensorFlow\nCPU'
names.append(name)
with tf.device("/device:CPU:0"):
tensortimes = []
datas = [tf.dtypes.cast(tf.random.normal(shape=(x,x)), dtype = tf.complex128) for x in sizes]
print(f'Working on device: {datas[0].device}')
for x, data in zip(sizes, datas):
print('{} {}x{}'.format(" ".join(name.split('\n')), x, x))
t = %timeit -o tf_ifft2_fft2(data)
tensortimes.append(t)
totaltimes.append(tensortimes)
TensorFlow GPU¶
name = 'TensorFlow\nGPU'
names.append(name)
print('{} available: {}'.format(name, tf.test.is_gpu_available()))
with tf.device("/device:GPU:{}".format(tfdevice)):
tensortimes = []
datas = [tf.dtypes.cast(tf.random.normal(shape=(x,x)), dtype = tf.complex128) for x in sizes]
print(f'Working on device: {datas[0].device}')
for x, data in zip(sizes, datas):
print('{} {}x{}'.format(" ".join(name.split('\n')), x, x))
t = %timeit -o tf_ifft2_fft2(data)
tensortimes.append(t)
totaltimes.append(tensortimes)
PyTorch CPU¶
name = 'PyTorch\nCPU'
device_type = 'cpu'
device = torch.device(device_type)
names.append(name)
torchtimes = []
datas = [torch.randn((x,x,2), device=device) for x in sizes]
print(f'Working on device: {datas[0].device}')
for x, data in zip(sizes, datas):
print('{} {}x{}'.format(" ".join(name.split('\n')), x, x))
t = %timeit -o torch_ifft2_fft2(data)
torchtimes.append(t)
totaltimes.append(torchtimes)
PyTorch GPU¶
name = 'PyTorch\nGPU'
device_type = 'cuda'
device = torch.device('cuda:{}'.format(torchdevice))
names.append(name)
print('{} available: {}'.format(name, torch.cuda.is_available()))
torchtimes = []
datas = [torch.randn(size=(x,x,2), device=device) for x in sizes]
print(f'Working on device: {datas[0].device}')
for x, data in zip(sizes, datas):
print('{} {}x{}'.format(" ".join(name.split('\n')), x, x))
t = %timeit -o torch_ifft2_fft2(data)
torchtimes.append(t)
totaltimes.append(torchtimes)
# # Clear pytorch memory
# del datas
# torch.cuda.empty_cache()
Numpy¶
name = 'Numpy\nCPU'
names.append(name)
nptimes = []
datas = [np.random.normal(size=(x,x)).astype('complex128') for x in sizes]
for x, data in zip(sizes, datas):
print('{} {}x{}'.format(" ".join(name.split('\n')), x, x))
t = %timeit -o np_ifft2_fft2(data)
nptimes.append(t)
totaltimes.append(nptimes)
CuPy¶
name = 'CuPy\nGPU'
names.append(name)
cptimes = []
device = cp.cuda.Device(cpdevice)
with device:
datas = [cp.random.normal(size=(x,x)).astype('complex128') for x in sizes]
for x, data in zip(sizes, datas):
print('{} {}x{}'.format(" ".join(name.split('\n')), x, x))
t = %timeit -o cp_ifft2_fft2(data)
cptimes.append(t)
totaltimes.append(cptimes)
# # Clear cupy memory
# mempool = cp.get_default_memory_pool()
# del datas
# mempool.free_all_blocks()
PyFFTW¶
name = 'PyFFTW\nCPU'
names.append(name)
tensortimes = []
def pyfftw_array(shape):
arr = pyfftw.empty_aligned(shape, dtype='complex128')
arr[:] = np.random.normal(size=shape) + 1j*np.random.normal(size=shape)
return arr
datas = [pyfftw_array((x,x)) for x in sizes]
for x, data in zip(sizes, datas):
print('{} {}x{}'.format(" ".join(name.split('\n')), x, x))
t = %timeit -o pyfftw_ifft2_fft2(data)
tensortimes.append(t)
totaltimes.append(tensortimes)
# a = pyfftw.empty_aligned((512, 512), dtype='complex128', n=16)
# a[:] = np.random.random((512, 512)) + 1j*np.random.random((512, 512))
# %%timeit
# b = pyfftw.interfaces.numpy_fft.fft(a)
Plotting results¶
Plot as function of data shape¶
# One bar chart per array size, comparing every implementation.
fig, AX = plt.subplots(ncols=2, nrows=2, constrained_layout=True, figsize=(14,6), dpi=400)
# fig.canvas.layout.width = "1400px"
# fig.canvas.layout.height = "600px"
plt.suptitle('iFFT2(FFT2(array)) performance by array size\nLower is better')
for i, (size, ax) in enumerate(zip(sizes, AX.flatten())):
    # Entry i of each implementation's timing list belongs to sizes[i].
    t = np.array([timings[i].average for timings in totaltimes]) * 1000 # now in ms
    err = np.array([timings[i].stdev for timings in totaltimes]) * 1000
    ax.set_title("{}x{}".format(size, size))
    # Show the run-to-run standard deviation as error bars.
    ax.bar(names, t, yerr=err)
    ax.set_yscale('log')
    ax.set_ylabel('FFT Time (ms)')
plt.savefig('FFT_Speed_by_size.png')
Average times (in ms) for a 4k x 4k array¶
# Average time, in ms, of the largest (last) array size for every implementation.
times_4k = np.array([timings[-1].average for timings in totaltimes]) * 1000
for label, elapsed in zip(names, times_4k):
    print(" ".join(label.split('\n')))
    print(round(elapsed, 2))

# Look the reference implementation up by its label rather than by position,
# so the ratios stay correct if benchmark sections are added or reordered.
pytorch_time_4k = times_4k[names.index('PyTorch\nGPU')]
print()
print('4k x 4k iFFT2(FFT2())')
print('PyTorch faster than others by a factor of ')
for label, elapsed in zip(names, times_4k):
    print(" ".join(label.split('\n')))
    print(round(elapsed/pytorch_time_4k, 2))
Plot as function of software¶
# Log scale
# One bar chart per implementation, showing its time for every array size.
fig, AX = plt.subplots(ncols=2, nrows=4, constrained_layout=True, figsize=(9,12), dpi=400)
# fig.canvas.layout.width = "1300px"
# fig.canvas.layout.height = "1000px"

# Collect all CPU and all GPU timings (in ms) so each group can share y-limits.
# The group is taken from the benchmark label instead of its list position,
# so the grouping does not depend on CPU and GPU sections alternating.
cputimes, gputimes = [], []
for label, timings in zip(names, totaltimes):
    group = gputimes if label.endswith('GPU') else cputimes
    group.extend(1000 * timing.average for timing in timings)
gmx, gmn = max(gputimes), min(gputimes)
cmx, cmn = max(cputimes), min(cputimes)

plt.suptitle('Log-scale iFFT2(FFT2()) Performance for each program. Y-Axis limits fixed separately for CPU/GPU.\nSmaller is better')
sizes_title = ["{}x{}".format(size, size) for size in sizes]
axes = AX.flatten()
for ax, label, timings in zip(axes, names, totaltimes):
    t = np.array([timing.average for timing in timings]) * 1000 # now in ms
    ax.set_title(label)
    ax.bar(sizes_title, t)
    ax.set_yscale('log')
    # Pad the shared limits by a decade on each side.
    if label.endswith('GPU'):
        ax.set_ylim(gmn/10, gmx*10)
    else:
        ax.set_ylim(cmn/10, cmx*10)
    ax.set_ylabel('FFT Time (ms)')
# Drop any grid cells that have no implementation to show.
for ax in axes[len(names):]:
    ax.remove()