I have PyTorch 1.9.0 and TensorFlow 2.6.0 in the same environment, and both recognize all GPUs.
I was comparing the performance of both, so I ran this small, simple test: multiplying two large matrices (A and B, both 2000x2000) repeatedly (10000 times):
import numpy as np
import os
import time

def mul_torch(A, B):
    # PyTorch matrix multiplication
    os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
    import torch
    A, B = torch.Tensor(A.copy()), torch.Tensor(B.copy())
    A = A.cuda()
    B = B.cuda()
    start = time.time()
    for i in range(10000):
        C = torch.matmul(A, B)
        torch.cuda.empty_cache()
    print('PyTorch:', time.time() - start, 's')
    return C
def mul_tf(A, B):
    # TensorFlow matrix multiplication
    import tensorflow as tf
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    with tf.device('GPU:0'):
        A = tf.constant(A.copy())
        B = tf.constant(B.copy())
        start = time.time()
        for i in range(10000):
            C = tf.math.multiply(A, B)
        print('TensorFlow:', time.time() - start, 's')
        return C
if __name__ == '__main__':
    n = 2000
    A = np.random.rand(n, n)
    B = np.random.rand(n, n)

    PT = mul_torch(A, B)
    time.sleep(5)
    TF = mul_tf(A, B)
As a result:
PyTorch: 19.86856198310852 s
TensorFlow: 2.8338065147399902 s
I was not expecting these results; I thought they would be similar.
Investigating the GPU performance, I noticed that both use the GPU at full capacity, but PyTorch uses only a small fraction of the memory TensorFlow uses. It explains the processing time difference, but I cannot explain the difference in memory usage. Is it something intrinsic to the methods, or is it my computer configuration? Regardless of the matrix size (at least for matrices larger than 1000x1000), these plateaus are the same.
Thank you for your help.
It is because you are doing matrix multiplication in PyTorch but element-wise multiplication in TensorFlow. To do matrix multiplication in TF, use tf.matmul, or simply:

for i in range(10000):
    C = A @ B

That does the same thing for both TF and torch. For a fair comparison you also have to call torch.cuda.synchronize() inside the time measurement and move torch.cuda.empty_cache() outside of the measurement.
With that fixed, the expected result is that TensorFlow's eager execution is slower than PyTorch.
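A corrected timing loop might look like the sketch below (my own illustration, assuming a CUDA device is available; it keeps the question's variable names):

import time
import torch
import tensorflow as tf

def time_torch(A, B, iters=10000):
    A, B = torch.Tensor(A).cuda(), torch.Tensor(B).cuda()
    torch.cuda.synchronize()              # make sure the uploads are done
    start = time.time()
    for _ in range(iters):
        C = torch.matmul(A, B)
    torch.cuda.synchronize()              # wait for all queued kernels to finish
    print('PyTorch:', time.time() - start, 's')
    torch.cuda.empty_cache()              # outside the measurement
    return C

def time_tf(A, B, iters=10000):
    with tf.device('GPU:0'):
        A, B = tf.constant(A), tf.constant(B)
        start = time.time()
        for _ in range(iters):
            C = tf.matmul(A, B)           # matrix product, not tf.math.multiply
        _ = C.numpy()                     # forces execution to complete
        print('TensorFlow:', time.time() - start, 's')
    return C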
Regarding the memory usage: TF by default claims all available GPU memory, so nvidia-smi on Linux (or, similarly, Task Manager on Windows) does not reflect the actual memory usage of the operations.
I have a matrix A and want to calculate the distance matrix D from it, iteratively. The reason for wanting to calculate it step by step is so I can later include some if-statements in the iteration process.
My code right now looks like this:
import numpy as np
from scipy.spatial import distance

def create_data_matrix(n, m):
    mean = np.zeros(m)
    cov = np.eye(m, dtype=float)
    data_matrix = np.random.multivariate_normal(mean, cov, n)
    return data_matrix

def create_full_distance(A):
    distance_matrix = np.triu(distance.squareform(distance.pdist(A, "euclidean")), 0)
    return distance_matrix

matrix_a = create_data_matrix(1000, 2)
distance_from_numpy = create_full_distance(matrix_a)

matrix_b = np.empty((1000, 1000))
for idx, line in enumerate(matrix_a):
    for j, line2 in enumerate(matrix_a):
        matrix_b[idx][j] = distance.euclidean(matrix_a[idx], matrix_a[j])
Now the matrices distance_from_numpy and matrix_b are the same, though matrix_b takes far longer to calculate, although matrix_a is only a (1000x2) matrix. I know that the distance.pdist() method is very fast, but I am not sure whether I can use it in an iterative process.
My question is: why is the double for loop so slow, and how can I increase the speed while still preserving the iteration (since I want to include if-statements there)?
Edit, for context: I want to preserve the iteration because I'd like to stop it if one of the distances is smaller than a specific number.
Python is a high-level language, so its loops are inherently slow; the interpreter has to deal with a lot of overhead on every iteration, and this gets progressively worse as the number of nested loops increases. NumPy, on the other hand, hands the work to fast compiled C and Fortran code.
To speed up the Python implementation, you can, for example, implement the loop part with Cython, which translates your code to C and then compiles it for faster execution. Other options are Numba, or writing the loops in Fortran.
As Ehsan mentioned in a comment, I used Numba to increase computational speed.
import time

from numba import jit
import numpy as np
from scipy.spatial import distance

def create_data_matrix(n, m):
    mean = np.zeros(m)
    cov = np.eye(m, dtype=float)
    data_matrix = np.random.multivariate_normal(mean, cov, n)
    return data_matrix

def create_full_distance(A):
    distance_matrix = np.triu(distance.squareform(distance.pdist(A, "euclidean")), 0)
    return distance_matrix

@jit(nopython=True)  # "nopython" mode for best performance, equivalent to @njit
def slow_loop(matrix_a):
    matrix_b = np.empty((1000, 1000))
    for i in range(len(matrix_a)):
        for j in range(len(matrix_a)):
            # distance.euclidean is not supported in nopython mode,
            # so use np.linalg.norm instead
            matrix_b[i][j] = np.linalg.norm(matrix_a[i] - matrix_a[j])
    return matrix_b

def slow_loop_without_numba(matrix_a):
    matrix_b = np.empty((1000, 1000))
    for i in range(len(matrix_a)):
        for j in range(len(matrix_a)):
            matrix_b[i][j] = np.linalg.norm(matrix_a[i] - matrix_a[j])
    return matrix_b

matrix_a = create_data_matrix(1000, 2)

start = time.time()
ergebnis = create_full_distance(matrix_a)
end = time.time()
print("with scipy.distance.pdist = %s" % (end - start))

start2 = time.time()
slow_loop(matrix_a)
end2 = time.time()
print("with @jit onto np.linalg.norm = %s" % (end2 - start2))

start3 = time.time()
slow_loop_without_numba(matrix_a)
end3 = time.time()
print("slow_loop without numba = %s" % (end3 - start3))
I executed the code and it yielded these results:
with scipy.distance.pdist = 0.021986722946166992
with @jit onto np.linalg.norm = 0.8565070629119873
slow_loop without numba = 6.818004846572876
So Numba increased the computational speed by a lot, although SciPy is still much faster. The gap becomes more interesting the bigger the distance matrices get. I couldn't use Numba on a function that calls SciPy methods.
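Since the goal is to stop iterating once a distance gets small, here is a minimal sketch of how that early exit could look with Numba (the threshold parameter and the returned index pair are my own illustration, not from the original post):

import numpy as np
from numba import njit

@njit
def distances_until_small(matrix_a, threshold=1e-3):
    # fill the distance matrix row by row, but stop as soon as any
    # off-diagonal distance drops below the threshold
    n = len(matrix_a)
    matrix_b = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            d = np.linalg.norm(matrix_a[i] - matrix_a[j])
            matrix_b[i, j] = d
            if i != j and d < threshold:
                return matrix_b, i, j   # early exit with the offending pair
    return matrix_b, -1, -1             # no distance below the threshold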
I wish to speed up the sparse system solver part of my code using Numba. Here is what I have up till now:
# Both numba and numba-scipy packages are installed. I am using the PyCharm IDE.
import numba
import numba_scipy
import numpy as np
import scipy.sparse as sp
import scipy.sparse.linalg  # needed so that sp.linalg.gmres is available
# import other required stuff

@numba.jit(nopython=True)
def solve_using_numba(A, b):
    return sp.linalg.gmres(A, b)

# total = the number of points in the system
A = sp.lil_matrix((total, total), dtype=float)
# populate A with appropriate data
A = A.tocsc()

b = np.zeros((total, 1), dtype=float)
# populate b with appropriate data

y, exit_code = solve_using_numba(A, b)
# plot solution
This raises the error
argument 0: cannot determine Numba type of <class 'scipy.sparse.csc.csc_matrix'>
According to the official documentation, numba-scipy extends Numba to make it aware of SciPy. But it seems that here Numba cannot work with SciPy's sparse matrix classes. Where am I going wrong, and what can I do to fix this?
I only need to speed up the sparse system solution part of the code, because the other stuff is pretty lightweight: taking a couple of user inputs, constructing the A and b matrices, and plotting the end result.
I have a series of signals of length n = 36,000 on which I need to perform crosscorrelation. Currently, my CPU implementation in NumPy is a little slow. I've heard PyTorch can greatly speed up tensor operations and provides a way to perform computations in parallel on the GPU. I'd like to explore this option, but I'm not quite sure how to accomplish it using the framework.
Because of the length of these signals, I'd prefer to perform the crosscorrelation operation in the frequency domain.
Normally using numpy I'd perform the operation like so:
import numpy as np
from scipy.fftpack import next_fast_len

signal_length = 36000

# make the signals
signal_1 = np.random.uniform(-1, 1, signal_length)
signal_2 = np.random.uniform(-1, 1, signal_length)

# output target length of crosscorrelation
x_cor_sig_length = signal_length * 2 - 1

# get optimized array length for fft computation
fast_length = next_fast_len(x_cor_sig_length)

# move data into the frequency domain. axis=-1 to perform
# along last dimension
fft_1 = np.fft.rfft(signal_1, fast_length, axis=-1)
fft_2 = np.fft.rfft(signal_2, fast_length, axis=-1)

# take the complex conjugate of one of the spectrums. Which one you
# choose depends on domain-specific conventions
fft_1 = np.conj(fft_1)

fft_multiplied = fft_1 * fft_2

# back to time domain
prelim_correlation = np.fft.irfft(fft_multiplied, x_cor_sig_length, axis=-1)

# shift the signal to make it look like a proper crosscorrelation,
# and transform the output to be purely real
final_result = np.real(np.fft.fftshift(prelim_correlation, axes=-1)).astype(np.float64)
Looking at the PyTorch documentation, there doesn't seem to be an equivalent for numpy.conj(). I'm also not sure if/how I need to implement an fftshift after the irfft operation.
So how would you go about writing a 1D crosscorrelation in Pytorch using the fourier method?
A few things to consider.
The Python interpreter is very slow; what those vectorization libraries do is move the workload to a native implementation. To make any difference, you need to perform many operations in a single Python instruction. Evaluating things on the GPU follows the same principle: while the GPU has more compute power, it is slower to copy data to/from the GPU.
I adapted your example to process multiple signals simultaneously.
import numpy as np

def numpy_xcorr(BATCH=1, signal_length=36000, factors=[2, 3, 5], dtype=np.float64):
    # make the signals
    signal_1 = np.random.uniform(-1, 1, (BATCH, signal_length)).astype(dtype)
    signal_2 = np.random.uniform(-1, 1, (BATCH, signal_length)).astype(dtype)
    # just make the cross correlation more interesting
    signal_2 += 0.1 * signal_1

    # output target length of crosscorrelation
    x_cor_sig_length = signal_length * 2 - 1

    # get optimized array length for fft computation (next_fast_len is defined below)
    fast_length = next_fast_len(x_cor_sig_length, factors)

    # move data into the frequency domain. axis=-1 to perform
    # along last dimension
    fft_1 = np.fft.rfft(signal_1, fast_length, axis=-1)
    fft_2 = np.fft.rfft(signal_2, fast_length, axis=-1)

    # take the complex conjugate of one of the spectrums
    fft_1 = np.conj(fft_1)

    fft_multiplied = fft_1 * fft_2

    # back to time domain
    prelim_correlation = np.fft.irfft(fft_multiplied, fast_length, axis=-1)

    # shift the signal to make it look like a proper crosscorrelation,
    # and transform the output to be purely real
    final_result = np.fft.fftshift(np.real(prelim_correlation), axes=-1)
    return final_result, np.sum(final_result)
Since torch 1.7 we have the torch.fft module, which provides an interface similar to numpy.fft; fftshift is missing, but the same result can be obtained with torch.roll. Another point is that NumPy uses 64-bit precision by default, while torch defaults to 32-bit precision.
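For instance, fftshift amounts to rolling by half the length (a quick sanity check of my own, assuming an even-length array):

import numpy as np
import torch

a = np.arange(6)
print(np.fft.fftshift(a))                    # [3 4 5 0 1 2]
print(torch.roll(torch.as_tensor(a), 3, 0))  # tensor([3, 4, 5, 0, 1, 2])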
The fast length consists in choosing smooth numbers (numbers that factorize into small primes; I suppose you are familiar with this subject).
def next_fast_len(n, factors=[2, 3, 5, 7]):
    '''
    Returns the minimum integer not smaller than n that can
    be written as a product (possibly with repetitions) of
    the given factors.
    '''
    best = float('inf')
    stack = [1]
    while len(stack):
        a = stack.pop()
        if a >= n:
            if a < best:
                best = a
                if best == n:
                    break  # no reason to keep searching
        else:
            for p in factors:
                b = a * p
                if b < best:
                    stack.append(b)
    return best
The torch implementation then goes:
import torch
import torch.fft

def torch_xcorr(BATCH=1, signal_length=36000, device='cpu', factors=[2, 3, 5], dtype=torch.float):
    # torch.rand is random in the range (0, 1)
    signal_1 = 1 - 2 * torch.rand((BATCH, signal_length), device=device, dtype=dtype)
    signal_2 = 1 - 2 * torch.rand((BATCH, signal_length), device=device, dtype=dtype)
    # just make the cross correlation more interesting
    signal_2 += 0.1 * signal_1

    # output target length of crosscorrelation
    x_cor_sig_length = signal_length * 2 - 1

    # get optimized array length for fft computation
    fast_length = next_fast_len(x_cor_sig_length, factors)

    # the last dimension will be transformed
    fft_1 = torch.fft.rfft(signal_1, fast_length, dim=-1)
    fft_2 = torch.fft.rfft(signal_2, fast_length, dim=-1)

    # take the complex conjugate of one of the spectrums. Which one you
    # choose depends on domain-specific conventions
    fft_multiplied = torch.conj(fft_1) * fft_2

    # back to time domain
    prelim_correlation = torch.fft.irfft(fft_multiplied, fast_length, dim=-1)

    # shift the signal to make it look like a proper crosscorrelation,
    # and transform the output to be purely real
    final_result = torch.roll(prelim_correlation, (fast_length // 2,), dims=(-1,))
    return final_result, torch.sum(final_result)
And here is some code to test the results:
import time

funcs = {'numpy-f64': lambda b: numpy_xcorr(b, factors=[2, 3, 5], dtype=np.float64),
         'numpy-f32': lambda b: numpy_xcorr(b, factors=[2, 3, 5], dtype=np.float32),
         'torch-cpu-f64': lambda b: torch_xcorr(b, device='cpu', factors=[2, 3, 5], dtype=torch.float64),
         'torch-cpu': lambda b: torch_xcorr(b, device='cpu', factors=[2, 3, 5], dtype=torch.float32),
         'torch-gpu-f64': lambda b: torch_xcorr(b, device='cuda', factors=[2, 3, 5], dtype=torch.float64),
         'torch-gpu': lambda b: torch_xcorr(b, device='cuda', factors=[2, 3, 5], dtype=torch.float32),
        }

times = {}
for batch in [1, 10, 100]:
    times[batch] = {}
    for l, f in funcs.items():
        t0 = time.time()
        t1, t2 = f(batch)
        tf = time.time()
        del t1
        del t2
        times[batch][l] = 1000 * (tf - t0) / batch
I obtained the following results (timing plot omitted):
And what surprised me is the result when the numbers are not so smooth: e.g., using a 17-smooth length, the torch implementation is so much better that I used a logarithmic scale there (with batch size 100 the torch GPU was 10000 times faster than NumPy with batch size 1).
Remember that these functions generate the data on the GPU; in general we also want to copy the final results to the CPU. If we account for the time spent copying the final result to the CPU, I observed times up to 10x higher than the cross correlation computation itself (random data generation + three FFTs).
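As a sketch of how that copy overhead might be measured (my own illustration, assuming a CUDA device and the torch_xcorr function above):

import time
import torch

result, _ = torch_xcorr(BATCH=100, device='cuda')
torch.cuda.synchronize()      # make sure the compute kernels have finished
t0 = time.time()
result_cpu = result.cpu()     # device-to-host copy; blocks until done
print('copy to CPU:', time.time() - t0, 's')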
I tried to do a logistic regression with scikit-learn on a rather large dataset with ~600 dummy variables and only a few interval variables (and 300K rows in my dataset), and the resulting confusion matrix looks suspicious. I wanted to check the significance of the returned coefficients and run an ANOVA, but I cannot find how to access them. Is it possible at all? And what is the best strategy for data that contains lots of dummy variables? Thanks a lot!
Scikit-learn deliberately does not support statistical inference. If you want out-of-the-box coefficient significance tests (and much more), you can use the Logit estimator from Statsmodels. This package mimics the interface of glm models in R, so you may find it familiar.
If you still want to stick to scikit-learn's LogisticRegression, you can use an asymptotic approximation to the distribution of the maximum likelihood estimates. Precisely, for a vector of maximum likelihood estimates theta, its variance-covariance matrix can be estimated as inverse(H), where H is the Hessian matrix of the log-likelihood at theta. This is exactly what the function below does:
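Concretely, the quantity accumulated in the loop below is the observed information matrix (the negative Hessian of the log-likelihood), a standard result for the logistic model; here x_i includes a leading 1 for the intercept and p_i is the predicted probability of class 1:

H = \sum_{i=1}^{n} p_i (1 - p_i)\, x_i x_i^\top, \qquad \widehat{\operatorname{Cov}}(\hat{\theta}) = H^{-1}, \qquad z_j = \frac{\hat{\theta}_j}{\sqrt{(H^{-1})_{jj}}}

The two-sided p-values then follow from the standard normal distribution.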
import numpy as np
from scipy.stats import norm
from sklearn.linear_model import LogisticRegression

def logit_pvalue(model, x):
    """ Calculate z-scores for scikit-learn LogisticRegression.
    parameters:
        model: fitted sklearn.linear_model.LogisticRegression with intercept and large C
        x: matrix on which the model was fit
    This function uses asymptotics for maximum likelihood estimates.
    """
    p = model.predict_proba(x)
    n = len(p)
    m = len(model.coef_[0]) + 1
    coefs = np.concatenate([model.intercept_, model.coef_[0]])
    x_full = np.insert(np.array(x), 0, 1, axis=1)  # prepend the intercept column
    ans = np.zeros((m, m))
    for i in range(n):
        # accumulate p_i * (1 - p_i) * x_i x_i^T (the observed information)
        ans = ans + np.outer(x_full[i], x_full[i]) * p[i, 1] * p[i, 0]
    vcov = np.linalg.inv(ans)
    se = np.sqrt(np.diag(vcov))
    t = coefs / se
    p = (1 - norm.cdf(abs(t))) * 2
    return p

# test p-values
x = np.arange(10)[:, np.newaxis]
y = np.array([0, 0, 0, 1, 0, 0, 1, 1, 1, 1])
model = LogisticRegression(C=1e30).fit(x, y)
print(logit_pvalue(model, x))

# compare with statsmodels
import statsmodels.api as sm
sm_model = sm.Logit(y, sm.add_constant(x)).fit(disp=0)
print(sm_model.pvalues)
sm_model.summary()
The outputs of print() are identical, and they happen to be coefficient p-values.
[ 0.11413093 0.08779978]
[ 0.11413093 0.08779979]
sm_model.summary() also prints a nicely formatted HTML summary.
According to the MKL BLAS documentation:
"All matrix-matrix operations (level 3) are threaded for both dense and sparse BLAS."
http://software.intel.com/en-us/articles/parallelism-in-the-intel-math-kernel-library
I have built Scipy with MKL BLAS. Using the test code below, I see the expected multithreaded speedup for dense, but not sparse, matrix multiplication. Are there any changes to Scipy to enable multithreaded sparse operations?
import time
import numpy as np
from scipy import sparse

# test dense matrix multiplication
x = np.random.random((10000, 10000))
t1 = time.time()
foo = x.T.dot(x)
print(time.time() - t1)

# test sparse matrix multiplication
x = sparse.rand(10000, 10000)
t1 = time.time()
foo = x.T.dot(x)
print(time.time() - t1)
As far as I know, the answer is no. But you can build your own wrapper around the MKL sparse multiply routines. You asked about multiplying two sparse matrices; below is some wrapper code I used for multiplying a sparse matrix by a dense vector, so it shouldn't be hard to adapt (look at the Intel MKL reference for mkl_cspblas_dcsrgemm). Also, be aware of how your SciPy arrays are stored: the default is coo, but csr (or csc) may be a better choice. I chose csr, but MKL supports most types (just call the appropriate routine).
From what I could tell, both SciPy's default and MKL are multithreaded. By changing OMP_NUM_THREADS I could see a difference in performance.
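For example, the thread count can be pinned from Python before NumPy/SciPy (and hence MKL) is loaded (a small sketch; MKL_NUM_THREADS works similarly):

import os

# must happen before numpy/scipy are imported, because MKL reads
# these environment variables once when the library is loaded
os.environ['OMP_NUM_THREADS'] = '4'

import numpy as np
x = np.random.random((5000, 5000))
y = x.T.dot(x)   # should now use at most 4 threads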
To use the function below, if you have a recent version of MKL, just make sure LD_LIBRARY_PATH is set to include the relevant MKL directories. For older versions, you need to build some specific libraries. I got my information from IntelMKL in python.
import numpy as np
import scipy.sparse as sparse
from ctypes import POINTER, c_int, c_char, c_double, byref, cdll

def SpMV_viaMKL(A, x):
    """
    Wrapper to Intel's SpMV
    (Sparse Matrix-Vector multiply)
    For medium-sized matrices, this is 4x faster
    than scipy's default implementation
    Stephen Becker, April 24 2014
    stephen.beckr@gmail.com
    """
    mkl = cdll.LoadLibrary("libmkl_rt.so")
    SpMV = mkl.mkl_cspblas_dcsrgemv
    # Dissecting the "cspblas_dcsrgemv" name:
    # "c" - for "c-blas" like interface (as opposed to fortran)
    #       Also means it expects sparse arrays to use 0-based indexing, which python does
    # "sp" for sparse
    # "d" for double-precision
    # "csr" for compressed row format
    # "ge" for "general", e.g., the matrix has no special structure such as symmetry
    # "mv" for "matrix-vector" multiply

    if not sparse.isspmatrix_csr(A):
        raise Exception("Matrix must be in csr format")
    (m, n) = A.shape

    # The data of the matrix
    data = A.data.ctypes.data_as(POINTER(c_double))
    indptr = A.indptr.ctypes.data_as(POINTER(c_int))
    indices = A.indices.ctypes.data_as(POINTER(c_int))

    # Allocate output, using same conventions as input
    nVectors = 1
    if x.ndim == 1:
        y = np.empty(m, dtype=np.double, order='F')
        if x.size != n:
            raise Exception("x must have n entries. x.size is %d, n is %d" % (x.size, n))
    elif x.shape[1] == 1:
        y = np.empty((m, 1), dtype=np.double, order='F')
        if x.shape[0] != n:
            raise Exception("x must have n entries. x.size is %d, n is %d" % (x.size, n))
    else:
        nVectors = x.shape[1]
        y = np.empty((m, nVectors), dtype=np.double, order='F')
        if x.shape[0] != n:
            raise Exception("x must have n entries. x.size is %d, n is %d" % (x.size, n))

    # Check input
    if x.dtype.type is not np.double:
        x = x.astype(np.double, copy=True)
    # Put it in column-major order, otherwise for nVectors > 1 this FAILS completely
    if not x.flags['F_CONTIGUOUS']:
        x = x.copy(order='F')

    if nVectors == 1:
        np_x = x.ctypes.data_as(POINTER(c_double))
        np_y = y.ctypes.data_as(POINTER(c_double))
        # now call MKL. This returns the answer in np_y, which links to y
        SpMV(byref(c_char(b"N")), byref(c_int(m)), data, indptr, indices, np_x, np_y)
    else:
        for column in range(nVectors):
            xx = x[:, column]
            yy = y[:, column]
            np_x = xx.ctypes.data_as(POINTER(c_double))
            np_y = yy.ctypes.data_as(POINTER(c_double))
            SpMV(byref(c_char(b"N")), byref(c_int(m)), data, indptr, indices, np_x, np_y)
    return y
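A hypothetical usage sketch (the matrix size and density are arbitrary; this assumes libmkl_rt.so can be found via LD_LIBRARY_PATH):

import numpy as np
import scipy.sparse as sparse

A = sparse.rand(10000, 10000, density=1e-3, format='csr')
x = np.random.rand(10000)

y = SpMV_viaMKL(A, x)
# sanity check against scipy's own implementation
np.testing.assert_allclose(y, A.dot(x))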