tensorflow not using gpu - prime number program - python-3.x

I have installed tensorflow-gpu on my laptop (GTX 1060 6GB) and I am developing a prime number generator program using TensorFlow. Following is the sample code I am using to calculate prime numbers:
def prime_tf_graf(max_count):
    # matrix version
    with tf.device('/gpu:0'):
        data = tf.range(2, max_count, dtype=tf.int32)
        Y, X = tf.meshgrid(data, data); Z = X % Y
    with tf.device('/cpu:0'):
        loer = tf.matrix_band_part(Z, -1, 0) - tf.matrix_band_part(Z, 0, 0)
    with tf.device('/gpu:0'):
        loer = tf.cast(tf.not_equal(loer, 0), tf.int32)
        sumr = tf.reduce_sum(loer, axis=1)
        nums = data - 2
        rato = tf.cast(sumr / nums, tf.int32)
        indx = tf.cast(tf.not_equal(rato, 0), tf.int32)
        return tf.reshape(tf.where(indx), [-1]) + 2
def prime_tf(max_count):
    graf = prime_tf_graf(max_count)
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    prim = sess.run(graf)
    return prim
When I run this code, only my CPU and RAM get used; the GPU stays idle (only its dedicated memory is allocated). Test run: prime_tf(tf.constant(20000))
Here is my system usage:
It shows that TensorFlow is only using my CPU and RAM, not the GPU.
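One way to confirm where each op actually runs is to enable device placement logging in the session config (a minimal sketch, assuming the TF 1.x setup above; this would replace the session created in prime_tf):
import tensorflow as tf

# log_device_placement prints the device chosen for every op to stderr;
# allow_soft_placement lets ops without a GPU kernel fall back to the CPU.
config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)
sess = tf.Session(config=config)
If the log shows the integer ops pinned to /cpu:0, soft placement is quietly moving the work off the GPU even though tensorflow-gpu is installed.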

Related

Why do repeated calls to torch.cuda.is_available() all return True?

I'm reading code that makes multiple calls to torch.cuda.is_available(). Each time it prepares to query a net, it calls a function (shown below) that uses torch.cuda.is_available() to set the device (cuda or cpu) to be used. I don't understand why calls after the first one don't return False, thus pushing computation to the CPU.
Is the GPU released when the code leaves the method? Or, does each call take up only a relatively small part of the GPU, so that the code would need to make multiple calls to this method, before computation was pushed to the CPU?
Code in question:
def computeProposals(imageName):
    app.config['args'].img = imageName
    print('ARGs --img = ', app.config['args'].img)

    # Setup device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Setup Model
    Config = namedtuple('Config', ['iSz', 'oSz', 'gSz', 'batch'])
    config = Config(iSz=160, oSz=56, gSz=112, batch=1)  # default for training

    model = (models.__dict__[app.config['args'].arch](config))
    model = load_pretrain(model, app.config['args'].resume)
    model = model.eval().to(device)

    scales_range = np.arange(app.config['args'].si,
                             app.config['args'].sf + app.config['args'].ss,
                             app.config['args'].ss)
    scales = [2 ** i for i in scales_range]
    meanstd = {'mean': [0.485, 0.456, 0.406], 'std': [0.229, 0.224, 0.225]}

    infer = Infer(nps=app.config['args'].nps, scales=scales, meanstd=meanstd,
                  model=model, device=device)

    print('| start')
    tic = time.time()
    im = np.array(Image.open(app.config['args'].img).convert('RGB'),
                  dtype=np.float32)
    h, w = im.shape[:2]
    img = np.expand_dims(np.transpose(im, (2, 0, 1)), axis=0).astype(np.float32)
    img = torch.from_numpy(img / 255.).to(device)
    infer.forward(img)
    masks, scores = infer.getTopProps(.2, h, w)
    toc = time.time() - tic
    print('| done in %05.3f s' % toc)
    return masks, scores
Here is the code that makes the repeated calls to computeProposals:
@app.route('/')
def index():
    global cc_data
    base_dir = app.config['base_dir']
    img_name = app.config['img_name']
    img_dir = os.path.join(base_dir, 'images')
    img_path = os.path.join(img_dir, img_name)
    print(img_path)

    print('Loading image and proposals, please wait')
    img = skio.imread(img_path)
    img = img[:, :, :3]
    masks, scores = computeProposals(img_path)

    session['pos_wts'] = np.zeros(masks.shape[2], dtype=np.float64).tolist()
    session['neg_wts'] = np.zeros(masks.shape[2], dtype=np.float64).tolist()
    masks = np.transpose(masks, (2, 0, 1))
    dilated = dilate_proposals(masks)
    print('Loading done')

    img_h = img.shape[0]
    img_w = img.shape[1]
    print('Image height {} and width {}'.format(img_h, img_w))

    rendered_img = draw_buttons(np.copy(img), img_w)
    if app.config['DoneFlag'] == 1:
        rendered_img = draw_end(rendered_img, img_h)
    img_stream = embed_image_html(rendered_img)

    # Create dicts with session variables
    cc_data = {'img_h': img_h, 'img_w': img_w, 'masks': masks,
               'scores': scores, 'dilated': dilated,
               'orig': np.copy(img).tolist(), 'render': rendered_img.tolist(),
               'clicks': []}
    session['response'] = {'input_img': img_stream,
                           'im_width': img_w, 'im_height': img_h,
                           'show_error': False}
    return render_template('index.html', response=session['response'])
The function torch.cuda.is_available() does not report whether a CUDA device is currently in use or whether there is any memory left on the device(s). This means the returned value does not depend on the number of processes running or on the memory already allocated by those processes. It only reports whether one or more CUDA devices are accessible to PyTorch.
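For contrast, PyTorch's memory-query helpers do depend on what the current process has allocated (a small illustrative sketch):
import torch

if torch.cuda.is_available():  # only says "a CUDA device is visible to PyTorch"
    device = torch.device('cuda')
    print(torch.cuda.get_device_name(device))
    # These values do reflect this process's own allocations:
    print('allocated bytes:', torch.cuda.memory_allocated(device))
    print('reserved bytes: ', torch.cuda.memory_reserved(device))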

Why is PyTorch slower than PyOpenCL, which is slower than Numba on GPU?

I was working on an FDTD program that used the discrete Laplacian, which can be implemented as a convolution operation. From what I have read, the main component of PyTorch is a tensor library optimized to perform operations commonly used for machine learning (such as convolutions). I was interested in comparing it to other frameworks I have used, so I wrote a test program that applies the discrete Laplacian to a 1D array multiple times and compares execution times:
import torch as tr
import time
from numba import jit, cuda
import numpy as np
import pyopencl as cl
from pyopencl import array

# parameters
number_of_timesteps = 1000
number_of_elements = 10000000

# set up the initial conditions
torch_data = tr.rand((1, 1, number_of_elements), dtype=tr.double)  # torch convolution needs shape (minibatch, in_channels, iW)
numba_data = np.array([0] + list(torch_data[0][0].numpy()) + [0])  # add padding [0] for convolution; handled automatically in torch
opencl_data = np.array([0] + list(torch_data[0][0].numpy()) + [0])

# test Torch
device = "cuda"
torch_data_a = torch_data.to(device)
torch_data_b = torch_data.to(device)
kernel = tr.tensor([[[1, -2, 1]]], dtype=tr.double, device=device)
with tr.no_grad():
    start_time = time.time()
    for t in range(round(number_of_timesteps / 2)):  # /2 because each loop is two convolutions
        torch_data_b = torch_data_a + 0.1 * tr.nn.functional.conv1d(torch_data_a, kernel, padding=1)
        torch_data_a = torch_data_b + 0.1 * tr.nn.functional.conv1d(torch_data_b, kernel, padding=1)
    print("Torch GPU time:", time.time() - start_time)
    torch_data_numpy = np.array([0] + list(torch_data_a[0][0].cpu().numpy()) + [0])

# Numba GPU kernel
@cuda.jit
def numba_conv_cuda(x, x_new):
    gid = cuda.grid(1)
    if 0 < gid < x.size - 1:  # check array boundaries
        x_new[gid] = x[gid] + 0.1 * (x[gid + 1] + x[gid - 1] - 2 * x[gid])

threadsperblock = 100
blockspergrid = (numba_data.size + (threadsperblock - 1)) // threadsperblock
x_a = cuda.to_device(numba_data)
x_b = cuda.to_device(numba_data)
start_time = time.time()
# actually run the kernel
for t in range(round(number_of_timesteps / 2)):  # again /2 because each loop is two convolutions
    numba_conv_cuda[blockspergrid, threadsperblock](x_a, x_b)
    numba_conv_cuda[blockspergrid, threadsperblock](x_b, x_a)
print("Numba GPU time:", time.time() - start_time)
numba_data = x_a.copy_to_host()

# test OpenCL
context = cl.create_some_context(interactive=False, answers=[0])
queue = cl.CommandQueue(context)
mem_flags = cl.mem_flags
program = cl.Program(context, """
#pragma OPENCL EXTENSION cl_khr_fp64 : enable  // enable double precision calculations
__kernel void update_psi(__global const double *x, __global double *x_new)
{
    int gid = get_global_id(0);
    if(0 < gid && gid < x.size - 1){
        x_new[gid] = x[gid] + 0.1*(x[gid+1]+x[gid-1]-2*x[gid]);
    }
}
""".replace("x.size", str(opencl_data.size))).build()
x_a_buf = cl.Buffer(context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=opencl_data)
x_b_buf = cl.Buffer(context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=opencl_data)

# actually run the OpenCL kernel
start_time = time.time()
for t in range(round(number_of_timesteps / 2)):  # again /2 because each loop is two convolutions
    event = program.update_psi(queue, [threadsperblock * blockspergrid], [threadsperblock], x_a_buf, x_b_buf)
    event.wait()
    event = program.update_psi(queue, [threadsperblock * blockspergrid], [threadsperblock], x_b_buf, x_a_buf)
    event.wait()
print("OpenCL GPU time:", time.time() - start_time)
event = cl.enqueue_copy(queue, opencl_data, x_a_buf)
event.wait()

print("Results are same?", np.allclose(torch_data_numpy, numba_data) and np.allclose(numba_data, opencl_data))
And these are the results from testing on an Nvidia GPU:
Torch GPU time: 13.544365406036377
Numba GPU time: 0.2404193878173828
OpenCL GPU time: 0.9025869369506836
Results are same? True
I am surprised that the results show a library designed for applying operations such as convolutions to be so much slower than Numba or PyOpenCL (which is not even optimized, because I did not use any local memory on the GPU). Is this really the case, or did I do something wrong?
Additionally, why is the kernel written in C more than 3x slower than the kernel written in Python?
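One thing worth checking about the measurement itself (an assumption about the benchmark, not something established above): both PyTorch CUDA ops and Numba kernel launches are asynchronous, so reading time.time() right after the loop can miss GPU work that is still queued. A synchronized re-timing sketch, reusing the variables from the script above:
# Torch: synchronize before and after the timed region.
tr.cuda.synchronize()
start_time = time.time()
with tr.no_grad():
    for t in range(round(number_of_timesteps / 2)):
        torch_data_b = torch_data_a + 0.1 * tr.nn.functional.conv1d(torch_data_a, kernel, padding=1)
        torch_data_a = torch_data_b + 0.1 * tr.nn.functional.conv1d(torch_data_b, kernel, padding=1)
tr.cuda.synchronize()
print("Torch GPU time (synchronized):", time.time() - start_time)

# Numba: same idea with numba.cuda.synchronize().
cuda.synchronize()
start_time = time.time()
for t in range(round(number_of_timesteps / 2)):
    numba_conv_cuda[blockspergrid, threadsperblock](x_a, x_b)
    numba_conv_cuda[blockspergrid, threadsperblock](x_b, x_a)
cuda.synchronize()
print("Numba GPU time (synchronized):", time.time() - start_time)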

Out of memory when sampling images using TF2.0 autoregressive model on Google Colab

The sampling function looks like this:
def sample_image(batch_size, model):
    image = np.random.choice(4, size=(batch_size, 28, 28, 3))
    for i in range(28):
        for j in range(28):
            for k in range(3):
                _, prob_output = model(tf.Variable(image, dtype=tf.float32, trainable=False))
                prob_output = prob_output.numpy().reshape((batch_size, 28, 28, 3, -1))
                # prob_output = tf.nn.softmax(prob_output, axis=-1)
                # print(prob_output.shape)
                for b in range(batch_size):
                    if k == 0 and b == 0:
                        print(f'i:{i}, j:{j}, k:{k}')
                    # print(prob_output[b,i,j,k])
                    # prob = tf.nn.softmax(prob_output[b,i,j,k], axis=-1).numpy()
                    # print(prob)
                    # prob /= prob.sum()
                    image[b, i, j, k] = np.random.choice(4, p=prob_output[b, i, j, k])
                del prob_output
                del _
    return image
When I ran this code to sample images on Google Colab, RAM usage kept growing until it ran out of memory and the kernel shut down. How can I avoid OOM while running on Colab?
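One thing to try (an assumption about the cause, not a confirmed fix): the inner loop builds a fresh tf.Variable from the NumPy image before every model call, and a plain constant tensor is both cheaper to create and sufficient for inference:
# Hypothetical variant of the model call inside the sampling loops:
# feed a converted tensor instead of constructing a new tf.Variable each step.
inp = tf.convert_to_tensor(image, dtype=tf.float32)
_, prob_output = model(inp)
prob_output = prob_output.numpy().reshape((batch_size, 28, 28, 3, -1))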

How do I reduce memory usage for deep reinforcement learning algorithms?

I wrote a DQN script to play BreakoutDeterministic and ran it on my school's GPU server. However, the code seems to take up 97% of the total RAM (more than 100 GB)!
I would like to know which part of the script demands such high RAM usage. I used memory-profiler for 3 episodes, and it seems that the memory requirement increases linearly with each time step on my laptop.
I wrote the script in PyCharm with Python 3.6. My laptop has 12 GB of RAM and no GPU, but the school server runs Ubuntu with a P100 GPU.
import gym
import numpy as np
import random
from collections import deque
from keras.layers import Dense, Input, Lambda, convolutional, core
from keras.models import Model
from keras.optimizers import Adam
import matplotlib.pyplot as plt
import os
import time as dt

plt.switch_backend('agg')


def preprocess(state):
    process_state = np.mean(state, axis=2).astype(np.uint8)
    process_state = process_state[::2, ::2]
    process_state_size = list(process_state.shape)
    process_state_size.append(1)
    process_state = np.reshape(process_state, process_state_size)
    return process_state


class DQNAgent:
    def __init__(self, env):
        self.env = env
        self.action_size = env.action_space.n
        self.state_size = self.select_state_size()
        self.memory = deque(maxlen=1000000)  # specify memory size
        self.gamma = 0.99
        self.eps = 1.0
        self.eps_min = 0.01
        self.decay = 0.95
        self.lr = 0.00025
        self.start_life = 5  # get from environment
        self.tau = 0.125  # special since 2 models to be trained
        self.model = self.create_cnnmodel()
        self.target_model = self.create_cnnmodel()

    def select_state_size(self):
        process_state = preprocess(self.env.reset())
        state_size = process_state.shape
        return state_size

    def create_cnnmodel(self):
        data_input = Input(shape=self.state_size, name='data_input', dtype='int32')
        normalized = Lambda(lambda x: x / 255)(data_input)
        conv1 = convolutional.Convolution2D(32, 8, strides=(4, 4), activation='relu')(normalized)
        conv2 = convolutional.Convolution2D(64, 4, strides=(2, 2), activation='relu')(conv1)
        conv3 = convolutional.Convolution2D(64, 3, strides=(1, 1), activation='relu')(conv2)
        conv_flatten = core.Flatten()(conv3)  # flatten to feed cnn to fc
        h4 = Dense(512, activation='relu')(conv_flatten)
        prediction_output = Dense(self.action_size, name='prediction_output', activation='linear')(h4)
        model = Model(inputs=data_input, outputs=prediction_output)
        model.compile(optimizer=Adam(lr=self.lr),
                      loss='mean_squared_error')  # alternative: keras.losses.logcosh(y_true, y_pred)
        return model

    def remember(self, state, action, reward, new_state, done):  # store past experience as a pre-defined table
        self.memory.append([state, action, reward, new_state, done])

    def replay(self, batch_size):
        if batch_size > len(self.memory):
            return
        all_states = []
        all_targets = []
        samples = random.sample(self.memory, batch_size)
        for sample in samples:
            state, action, reward, new_state, done = sample
            target = self.target_model.predict(state)
            if done:
                target[0][action] = reward
            else:
                target[0][action] = reward + self.gamma * np.max(self.target_model.predict(new_state)[0])
            all_states.append(state)
            all_targets.append(target)
        history = self.model.fit(np.vstack(all_states), np.vstack(all_targets), epochs=1, verbose=0)
        return history

    def act(self, state):
        self.eps *= self.decay
        self.eps = max(self.eps_min, self.eps)
        if np.random.random() < self.eps:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state)[0])

    def train_target(self):
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = (1 - self.tau) * target_weights[i] + self.tau * weights[i]
        self.target_model.set_weights(target_weights)


def main(episodes):
    env = gym.make('BreakoutDeterministic-v4')
    agent = DQNAgent(env)
    time = env._max_episode_steps
    batch_size = 32
    save_model = 'y'
    rend_env = 'n'  # set to 'y' to render the environment while training
    filepath = os.getcwd()
    date = dt.strftime('%d%m%Y')
    clock = dt.strftime('%H.%M.%S')
    print('++ Training started on {} at {} ++'.format(date, clock))
    start_time = dt.time()
    tot_r = []
    tot_loss = []
    it_r = []
    it_loss = []
    tot_frames = 0
    for e in range(episodes):
        r = []
        loss = []
        state = env.reset()
        state = preprocess(state)
        state = state[None, :]
        current_life = agent.start_life
        for t in range(time):
            if rend_env == 'y':
                env.render()
            action = agent.act(state)
            new_state, reward, terminal_life, life = env.step(action)
            new_state = preprocess(new_state)
            new_state = new_state[None, :]
            if life['ale.lives'] < current_life:
                reward = -1
            current_life = life['ale.lives']
            agent.remember(state, action, reward, new_state, terminal_life)
            hist = agent.replay(batch_size)
            agent.train_target()
            state = new_state
            r.append(reward)
            tot_frames += 1
            if hist is None:
                loss.append(0.0)
            else:
                loss.append(hist.history['loss'][0])
            if t % 20 == 0:
                print('Frame : {}, Cum Reward = {}, Avg Loss = {}, Curr Life: {}'.format(t,
                      np.sum(r),
                      round(np.mean(loss[-20:-1]), 3),
                      current_life))
                agent.model.save('{}/Mod_Fig/DQN_BO_model_{}.h5'.format(filepath, date))
                agent.model.save_weights('{}/Mod_Fig/DQN_BO_weights_{}.h5'.format(filepath, date))
            if current_life == 0 or terminal_life:
                print('Episode {} of {}, Cum Reward = {}, Avg Loss = {}'.format(e, episodes, np.sum(r), np.mean(loss)))
                break
        tot_r.append(np.sum(r))
        tot_loss.append(np.mean(loss))
        it_r.append(r)
        it_loss.append(loss)
    print('Training ended on {} at {}'.format(date, clock))
    run_time = dt.time() - start_time
    print('Total Training time: %d Hrs %d Mins %d s' % (run_time // 3600,
                                                        (run_time % 3600) // 60,
                                                        (run_time % 3600) % 60 // 1))
    if save_model == 'y':
        agent.model.save('{}/Mod_Fig/DQN_BO_finalmodel_{}_{}.h5'.format(filepath, date, clock))
        agent.model.save_weights('{}/Mod_Fig/DQN_BO_finalweights_{}_{}.h5'.format(filepath, date, clock))
    agent.model.summary()
    return tot_r, tot_loss, it_r, it_loss, tot_frames


if __name__ == '__main__':
    episodes = 3
    total_reward, total_loss, rewards_iter, loss_iter, frames_epi = main(episodes=episodes)
I would really appreciate your comments and help on writing memory- and speed-efficient deep RL code! I hope to train my DQN on Breakout for 5000 episodes, but the remote server only allows a maximum of 48 hours of training. Thanks in advance!
It sounds like you have a memory leak.
This line
agent.remember(state, action, reward, new_state, terminal_life)
gets called 5000 * env._max_episode_steps times, and each state is a (210, 160, 3) array. The first thing to try would be to reduce the maxlen of self.memory = deque(maxlen=1000000) and verify that this is the sole cause.
If you really believe you need that much capacity, you should dump self.memory to disk and keep only a small subsample in memory.
Additionally: subsampling from a deque is very slow. A deque is implemented as a linked list, so drawing M random samples from N elements costs O(N*M). You should consider implementing your own ring buffer for self.memory.
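A minimal ring-buffer sketch (class and method names are illustrative, not from the question's code); it gives O(1) appends and O(1) random indexing for sampling:
import random

class RingBuffer:
    """Fixed-capacity experience buffer backed by a plain list."""
    def __init__(self, capacity):
        self.capacity = capacity
        self.data = []
        self.pos = 0  # index of the next slot to overwrite once full

    def append(self, item):
        if len(self.data) < self.capacity:
            self.data.append(item)
        else:
            self.data[self.pos] = item  # overwrite the oldest entry
            self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.data, batch_size)  # fast on a list

    def __len__(self):
        return len(self.data)
In the agent, self.memory = RingBuffer(100000) would take the place of the deque, and self.memory.sample(batch_size) would replace random.sample(self.memory, batch_size) in replay().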
Alternatively: you might consider a probabilistic buffer (I don't know the proper name), where, each time you would append to a full buffer, you remove an element at random and append the new one. This means any (state, action, reward, ...) tuple that is encountered has a nonzero probability of being contained in the buffer, with recent tuples being more likely than older ones.
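A sketch of that probabilistic buffer (again with illustrative names): once full, it overwrites a uniformly random slot, so older tuples survive with steadily decreasing probability:
import random

class RandomReplacementBuffer:
    """Evict a uniformly random element once the buffer is full."""
    def __init__(self, capacity):
        self.capacity = capacity
        self.data = []

    def append(self, item):
        if len(self.data) < self.capacity:
            self.data.append(item)
        else:
            self.data[random.randrange(len(self.data))] = item  # random eviction

    def sample(self, batch_size):
        return random.sample(self.data, batch_size)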
I had similar problems with memory and I still do.
The main cause of the large memory consumption are the states. But here's what I did to make it better:
Step 1: Resize the frames to 84 x 84 using OpenCV (some people downsample instead of resizing). This results in each state having the shape (84, 84, 3).
Step 2: Convert these frames to grayscale (basically, black and white). This changes the shape to (84, 84, 1).
Step 3: Use dtype=np.uint8 for storing states. It consumes minimal memory and is a perfect fit for pixel intensity values in the range 0-255 (see the sketch after these steps).
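A sketch of those three steps with OpenCV, assuming frames arrive as RGB uint8 arrays of shape (210, 160, 3):
import cv2
import numpy as np

def preprocess_frame(frame):
    small = cv2.resize(frame, (84, 84), interpolation=cv2.INTER_AREA)  # step 1: (84, 84, 3)
    gray = cv2.cvtColor(small, cv2.COLOR_RGB2GRAY)                     # step 2: (84, 84)
    return gray.reshape(84, 84, 1).astype(np.uint8)                    # step 3: uint8, (84, 84, 1)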
Additional Info
I run my code on free Google Colab notebooks (K80 Tesla GPU and 13 GB of RAM), periodically saving the replay buffer to my drive.
For steps 1 and 2, consider using the OpenAI baseline Atari wrappers, as there is no point in reinventing the wheel.
You could also use this snippet to check the amount of RAM used by your own program at each step, like I did:
import os
import psutil

def show_RAM_usage(self):
    py = psutil.Process(os.getpid())
    print('RAM usage: {} GB'.format(py.memory_info()[0] / 2. ** 30))
This snippet is adapted from the original answer for use in my own program.

Tensorflow: using a FIFO queue for code running on GPUs

The code below shows my attempt to run an algorithm on a single GPU and feed data to it using a FIFO queue. The data lives in a CSV file; I use a separate Python thread to read the file one line at a time and enqueue each line into the FIFO.
N = 16
num_ckfs = 80000

q = [0.01 for i in range(N)]
q_ckfs = np.array([q for i in range(num_ckfs)])

r = [5]
r_ckfs = np.array([r for i in range(num_ckfs)])

init_var = [10.0 for i in range(N)]
init_var_ckfs = np.array([init_var for i in range(num_ckfs)])

init_state = [0.0 for i in range(N)]
init_state_ckfs = np.array([init_state for i in range(num_ckfs)])


class CKF(object):
    def __init__(self, num_ckfs, N):
        self.init_variances = tf.Variable(init_var_ckfs, name='init_variances', dtype=tf.float64)
        self.init_states = tf.Variable(init_state_ckfs, name='init_states', dtype=tf.float64)
        init_states_expanded = tf.expand_dims(self.init_states, 2)  # num_ckfs X N X 1
        self.q_values = tf.constant(q_ckfs, name='q_values', dtype=tf.float64)
        self.r_values = tf.constant(r_ckfs, name='r_values', dtype=tf.float64)
        self.input_vectors = tf.placeholder(tf.float64, shape=[num_ckfs, N], name='input_vectors')
        self.z_k = tf.placeholder(tf.float64, shape=[num_ckfs, 1], name='z_k')
        q = tf.FIFOQueue(200, [tf.float64, tf.float64], shapes=[[num_ckfs, 1], [num_ckfs, N]])
        self.enqueue_op = q.enqueue([self.z_k, self.input_vectors])
        observations, inputs = q.dequeue()
        # further processing using the input data


with tf.device('/gpu:0'):
    ckf_gpu0 = CKF(num_ckfs, N)


def load_and_enqueue():
    # read one line at a time
    # obvs_list corresponds to the first column
    # data_list corresponds to the rest of the columns
    session.run(ckf_gpu0.enqueue_op, feed_dict={
        ckf_gpu0.input_vectors: data_list[0], ckf_gpu0.z_k: obvs_list[0]})
    count += 1


t = threading.Thread(target=load_and_enqueue)
t.start()

for i in range(num_rows):
    out = session.run([ckf_gpu0.projected_output])
The first problem that I have run into is:
InvalidArgumentError (see above for traceback): Cannot assign a device to node 'fifo_queue': Could not satisfy explicit device specification '/device:GPU:0' because no supported kernel for GPU devices is available.
Is there an alternate way to do such a thing, i.e. hide the I/O latency while the computation is being done on a GPU?
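The error itself says the queue op has no GPU kernel, so the usual workaround is to pin the queue to the CPU while the rest of the graph stays on the GPU. A sketch of what the queue construction inside CKF.__init__ could look like (untested against this exact model):
# Build the queue and its enqueue/dequeue ops on the CPU explicitly;
# the dequeued tensors can still feed ops placed on /gpu:0.
with tf.device('/cpu:0'):
    q = tf.FIFOQueue(200, [tf.float64, tf.float64], shapes=[[num_ckfs, 1], [num_ckfs, N]])
    self.enqueue_op = q.enqueue([self.z_k, self.input_vectors])
    observations, inputs = q.dequeue()
# further processing (placed on /gpu:0) using observations and inputs
Alternatively, creating the session with tf.ConfigProto(allow_soft_placement=True) lets TensorFlow move the unsupported queue ops to the CPU automatically.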
