Running transformers predictions on CPU with multiprocessing - pytorch

I am trying to use pre-trained model from transformers to predict on CPU with multiprocessing. Below code code gets stuck at step-4. Any thoughts on how to resolve this?
I referred to below but no luck:
https://pytorch.org/docs/stable/notes/multiprocessing.html
import os
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
from torch.multiprocessing import Pool, Process, set_start_method
def answer_question(question, answer_text, model, tokenizer):
input_ids = tokenizer.encode(question, answer_text)
print('Query has {:,} tokens.\n'.format(len(input_ids)))
sep_index = input_ids.index(tokenizer.sep_token_id)
print(".. step -- 1")
num_seg_a = sep_index + 1
print(".. step -- 2")
num_seg_b = len(input_ids) - num_seg_a
print(".. step -- 3")
segment_ids = [0]*num_seg_a + [1]*num_seg_b
print(".. step -- 4")
assert len(segment_ids) == len(input_ids)
# ======== Evaluate ========
# Run our example question through the model.
start_scores, end_scores = model(torch.tensor([input_ids]), # The tokens representing our input text.
token_type_ids=torch.tensor([segment_ids])) # The segment IDs to differentiate question from answer_text
print(".. step -- 5")
# ======== Reconstruct Answer ========
# Find the tokens with the highest `start` and `end` scores.
answer_start = torch.argmax(start_scores)
answer_end = torch.argmax(end_scores)
print(".. step -- 6")
tokens = tokenizer.convert_ids_to_tokens(input_ids)
print(".. step -- 7")
answer = tokens[answer_start]
print(".. step -- 8")
for i in range(answer_start + 1, answer_end + 1):
if tokens[i][0:2] == '##':
answer += tokens[i][2:]
else:
answer += ' ' + tokens[i]
print('Answer: "' + answer + '"')
if __name__ == "__main__":
size = 2
processes = []
print("reading bert model")
bert_model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
print("reading bert tokenizer")
bert_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
q_list = ["How many parameters does BERT-large have?","How many parameters does BERT-large have?"]
a_list = ["BERT-large is really big... it has 24-layers and an embedding size of 1,024, for a total of 340M parameters! Altogether it is 1.34GB, so expect it to take a couple minutes to download to your Colab instance.", "BERT-large is really big... it has 24-layers and an embedding size of 1,024, for a total of 340M parameters! Altogether it is 1.34GB, so expect it to take a couple minutes to download to your Colab instance."]
for a in zip(q_list, a_list):
print(a)
q = Process(target = answer_question, args= (a[0], a[1], bert_model, bert_tokenizer))
q.start()
processes.append(q)
for q in processes:
q.join()

Related

How to run one (pre-trained lstm) model on CPU and second (pose estimation) model on GPU simultaneously using multithreading?

!!! My problem definition might be long and makes you boring, however, I am trying to make my case and error clear to you.
**
The definition what I am doing:**
I am implementing socket transmission in python. In client side, object detection is performed, and detected number of people and detected frames are sent to server side. The server side consists of multiple threaded classes to handle data from client and perform pose estimation to monitor and log GPU utilization. When I run code, the whole code is running while logging GPU memory usage about 46% as expected. Here, I provide below DataManager.py (to handle data from client) and ChildProcess.py (to get frames data from queue and perform pose estimation) which are part of the whole code.
DataManager.py
class DataManagerThread(Thread):
def __init__(self, queue, sock, index):
super().__init__()
self.image_queue = queue
self.server_socket = sock
self.index = index
def run(self):
data = b""
payload_size = struct.calcsize("Q")
while True:
while len(data) < payload_size:
packet = self.server_socket.recv(4*1024) # The server_socket attribute is no longer None, so this should work
if not packet:
break
data += packet
packed_msg_size = data[:payload_size]
data = data[payload_size:]
msg_size = struct.unpack("Q", packed_msg_size)[0]
while len(data) < msg_size:
data += self.server_socket.recv(4*1024)
frame_data = data[:msg_size]
data = data[msg_size:]
data_dict = pickle.loads(frame_data)
# extract frame and detection information from data dictionary
img = data_dict['frame']
people = data_dict['people']
print(f'Detected number of people: {people}')
# TODO: Passing data to the process manager thread as a queue
self.put_data_to_queue(img)
else:
print("[System] end socket")
self.put_data_to_queue("End")
def put_data_to_queue(self, image):
self.image_queue.put(image)
ChildProcess.py
import warnings
warnings.filterwarnings(action="ignore")
from multiprocessing import Process
from tf_pose.estimator import TfPoseEstimator
from tf_pose.networks import get_graph_path, model_wh
import argparse
import cv2
import logging
import time
def str2bool(v):
return v.lower() in ("yes", "true", "t", "1")
def init_logger():
logger = logging.getLogger('TfPoseEstimator-WebCam')
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter('[%(asctime)s] [%(name)s] [%(levelname)s] %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)
class ChildProcess(Process):
def __init__(self, queue):
self.start_time = time.time()
super().__init__()
self.image_queue = queue
self.start_time = time.time()
def __del__(self):
pass
def run(self):
args, w, h, e = self.init_model()
print("[Time]", time.time() - self.start_time)
while True:
print("[System] Run motion")
image = self.image_queue.get()
if type(image) is str:
print("[System end process]")
break
else:
self.motionTracking(args, e, w, h, image)
def motionTracking(self, args, e, w, h, decimg):
humans = e.inference(decimg, resize_to_default=(w > 0 and h > 0),
upsample_size=args.resize_out_ratio)
y1 = [0.0]
y = 0
image = TfPoseEstimator.draw_humans(decimg, humans, imgcopy=False)
for human in humans:
for i in range(len(humans)):
try:
a = human.body_parts[0]
x = a.x * image.shape[1]
y = a.y * image.shape[0]
y1.append(y)
except:
pass
if ((y - y1[len(y1) - 2]) > 30):
pass
cv2.imshow('tf-pose-estimation result', image)
_ = 0xFF & cv2.waitKey(1)
def init_model(self):
print("[System] model init")
parser = argparse.ArgumentParser(description='tf-pose-estimation realtime webcam')
parser.add_argument('--camera', type=int, default=0)
parser.add_argument('--resize', type=str, default='0x0',
help='if provided, resize images before they are processed. default=0x0, Recommends : 432x368 or 656x368 or 1312x736 ')
parser.add_argument('--resize-out-ratio', type=float, default=4.0,
help='if provided, resize heatmaps before they are post-processed. default=1.0')
parser.add_argument('--model', type=str, default='mobilenet_thin',
help='cmu / mobilenet_thin / mobilenet_v2_large / mobilenet_v2_small')
parser.add_argument('--show-process', type=bool, default=False,
help='for debug purpose, if enabled, speed for inference is dropped.')
parser.add_argument('--tensorrt', type=str, default="False",
help='for tensorrt process.')
args = parser.parse_args()
print('[System] initialization %s : %s' % (args.model, get_graph_path(args.model)))
w, h = model_wh(args.resize)
if w > 0 and h > 0:
e = TfPoseEstimator(get_graph_path(args.model), target_size=(w, h), trt_bool=str2bool(args.tensorrt))
else:
e = TfPoseEstimator(get_graph_path(args.model), target_size=(432, 368), trt_bool=str2bool(args.tensorrt))
print("[System] End model")
return args, w, h, e
The definition what I want to do and what error I am getting:
Here, I want to add my LSTM code to predict (people = data_dict['p#ople']) in DataManager.py file.
DataManager(added_lstm).py
# import some necessary libraries
config = tf.compat.v1.ConfigProto()
graph = tf.compat.v1.get_default_graph()
first_session = tf.compat.v1.Session(config=config)
with graph.as_default(), first_session.as_default():
with graph.as_default():
with tf.device('CPU:0'):
model = tf.keras.models.load_model('/home/tf-pose-estimation/modules/lstm_model/model1.h5', compile=False)
print(model.summary())
def make_prediction(m):
WINDOW_SIZE, alpha, theta = 5, 0.9, 3
forecast_ewma, forecast_values, theta_values, arr_of_num = [0], [], [], [1,1,1,1,1]
arr_of_num.append(m)
if len(arr_of_num)>WINDOW_SIZE:
arr_of_num = arr_of_num[1:]
if len(arr_of_num)==WINDOW_SIZE:
actual = arr_of_num[-1]
with graph.as_default(), first_session.as_default():
forecast = model.predict(np.array(arr_of_num[-WINDOW_SIZE:]).reshape(1, WINDOW_SIZE, 1))[0][0]
forecast_values.append(forecast)
a = alpha * forecast + (1 - alpha) * forecast_ewma[-1]
theta += 1 if a > 0.5 else -1
theta = min(max(theta, 0), 2)
theta_values.append(theta)
forecast_ewma.append(a)
return actual, forecast
class DataManagerThread(Thread):
def __init__(self, queue,sock, index):
super().__init__()
self.image_queue = queue
self.server_socket = sock
self.index = index
def run(self):
data = b""
payload_size = struct.calcsize("Q")
while True:
while len(data) < payload_size:
packet = self.server_socket.recv(4*1024) # The server_socket attribute is no longer None, so this should work
if not packet:
break
data += packet
packed_msg_size = data[:payload_size]
data = data[payload_size:]
msg_size = struct.unpack("Q", packed_msg_size)[0]
while len(data) < msg_size:
data += self.server_socket.recv(4*1024)
frame_data = data[:msg_size]
data = data[msg_size:]
data_dict = pickle.loads(frame_data)
# extract frame and detection information from data dictionary
img = data_dict['frame']
people = data_dict['people']
print(f'Detected number of people: {people}')
self.put_data_to_queue(img)
pred = make_prediction(people) # Added for lstm prediction
print(f"Predictions: {pred}")
def put_data_to_queue(self, image):
self.image_queue.put(image)
Here, I am adding only to DataManager.py file that loading lstm model, defining make_prediction function and use pred = make_prediction(people) inside of DataManagerThread(Thread) class to make prediction. Other code remained unchanged in this file.
When I run both models simultaneously, only lstm is predicting and pose estimation is just frozen. Also, even lstm model is forced to utilize CPU, about 84 % of GPU memory is occupied. Why? I do not know. However, my expectation is that lstm model should use cpu and pose estimation model should use gpu, and both models should bre run simultaneously.
When I run every model separately (i.e., lstm on CPU and pose estimation on GPU), they are working pretty well. Specifically, I tested LSTM model seoerately by generating random number, it worked as expected. LSTM model is trained in Tensorflow 2.5 and both models are running in Tensorflow 2.5.
Below is my PC and Env specifications:
GPU: NVIDIA GeForce RTX 2070 SUPER
Driver Version: 525
CUDA Version: 11.6
Python: 3.9.12
Tensorflow-gpu: 2.5.0
Is there possible or relevant solution for running the lstm model on CPU and the pose estimation model on GPU simultaneously using multithreading?
Any help appreciated!!!

TypeError: __init__() got an unexpected keyword argument 'has_model_config' while training Token classification model

System Info
transformers version: 4.18.1
Platform: Linux Jupyter Notebook, TF2.3 Python 3.6, 2 GPU
Python version: '1.7.1+cu101'
Using GPU in script?: yes
Using distributed or parallel set-up in script?: no
Information
[ ] The official example scripts
[X] My own modified scripts
Tasks
[ ] An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
[X] My own task or dataset (give details below)
Reproduction
model_checkpoint = "xlm-roberta-large-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint,add_prefix_space=True)
train_examples ={'texts':[x[0] for x in train_set],'tag_names':[x[1] for x in train_set]}
def isin(a, b):
return a[1] > b[0] and a[0] < b[1]
def tokenize_and_align_labels(examples, label2id, max_length=256):
tokenized_inputs = tokenizer(examples["texts"], truncation=True, padding='max_length', max_length=max_length,
return_offsets_mapping=True)
labels = []
for i, label_idx_for_single_input in enumerate(tqdm.tqdm(examples["tag_names"])):
labels_for_single_input = ['O' for _ in range(max_length)]
text_offsets = tokenized_inputs['offset_mapping'][i]
for entity in label_idx_for_single_input:
tag = entity['tag']
tag_offset = [entity['start'], entity['end']]
affected_token_ids = [j for j in range(max_length) if isin(tag_offset, text_offsets[j])]
if len(affected_token_ids) < 1:
continue
if any(labels_for_single_input[j] != 'O' for j in affected_token_ids):
continue
for j in affected_token_ids:
labels_for_single_input[j] = 'I_' + tag
labels_for_single_input[affected_token_ids[-1]] = 'L_' + tag
labels_for_single_input[affected_token_ids[0]] = 'B_' + tag
label_ids = [label2id[x] for x in labels_for_single_input]
labels.append(label_ids)
tokenized_inputs["labels"] = labels
print(tokenized_inputs.keys())
return tokenized_inputs
class MyDataset(torch.utils.data.Dataset):
def __init__(self, examples):
self.encodings = examples
self.labels = examples['labels']
def __getitem__(self, idx):
item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
item["labels"] = torch.tensor([self.labels[idx]])
return item
def __len__(self):
return len(self.labels)
train_data=MyDataset(train_data)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint,id2label=id2label,label2id=label2id,ignore_mismatched_sizes=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
args = TrainingArguments(
"xlmroberta-finetuned-ner",
# evaluation_strategy="epoch",
save_strategy="epoch",
learning_rate=2e-5,
num_train_epochs=2,
weight_decay=0.01,
per_device_train_batch_size=4,
# per_device_eval_batch_size=32
fp16=True
# bf16=True #Ampere GPU
)
trainer = Trainer(
model=model,
args=args,
train_dataset=train_data,
# eval_dataset=train_data,
# data_collator=data_collator,
# compute_metrics=compute_metrics,
tokenizer=tokenizer)
trainer.train()
Using amp half precision backend
FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
FutureWarning,
***** Running training *****
Num examples = 141648
Num Epochs = 2
Instantaneous batch size per device = 4
Total train batch size (w. parallel, distributed & accumulation) = 8
Gradient Accumulation steps = 1
Total optimization steps = 35412
MLflow's log_param() only accepts values no longer than 250 characters so we dropped this attribute.
TypeError: __init__() got an unexpected keyword argument 'has_model_config'
Expected behavior
To train NER model

RuntimeError: Error(s) in loading state_dict for Actor - torch.load()

I have created a custom environment in open ai gym and i am facing error while loading the weights Could some one help me to resolve the issue . I am training a TD3 network in a custom environment and i have trained successfully but while inferencing i am facing this issue
class Actor(nn.Module):
def __init__(self, state_dim, action_dim, max_action):
super(Actor, self).__init__()
self.layer_1 = nn.Linear(state_dim, 400)
self.layer_2 = nn.Linear(400, 300)
self.layer_3 = nn.Linear(300, action_dim)
self.max_action = max_action
def forward(self, x):
x = F.relu(self.layer_1(x))
x = F.relu(self.layer_2(x))
x = self.max_action * torch.tanh(self.layer_3(x))
return x
class Critic(nn.Module):
def __init__(self, state_dim, action_dim):
super(Critic, self).__init__()
# Defining the first Critic neural network
self.layer_1 = nn.Linear(state_dim + action_dim, 400)
self.layer_2 = nn.Linear(400, 300)
self.layer_3 = nn.Linear(300, 1)
# Defining the second Critic neural network
self.layer_4 = nn.Linear(state_dim + action_dim, 400)
self.layer_5 = nn.Linear(400, 300)
self.layer_6 = nn.Linear(300, 1)
def forward(self, x, u):
xu = torch.cat([x, u], 1)
# Forward-Propagation on the first Critic Neural Network
x1 = F.relu(self.layer_1(xu))
x1 = F.relu(self.layer_2(x1))
x1 = self.layer_3(x1)
# Forward-Propagation on the second Critic Neural Network
x2 = F.relu(self.layer_4(xu))
x2 = F.relu(self.layer_5(x2))
x2 = self.layer_6(x2)
return x1, x2
def Q1(self, x, u):
xu = torch.cat([x, u], 1)
x1 = F.relu(self.layer_1(xu))
x1 = F.relu(self.layer_2(x1))
x1 = self.layer_3(x1)
return x1
# Selecting the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Building the whole Training Process into a class
class TD3(object):
def __init__(self, state_dim, action_dim, max_action):
self.actor = Actor(state_dim, action_dim, max_action).to(device)
self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
self.actor_target.load_state_dict(self.actor.state_dict())
self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
self.critic = Critic(state_dim, action_dim).to(device)
self.critic_target = Critic(state_dim, action_dim).to(device)
self.critic_target.load_state_dict(self.critic.state_dict())
self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
self.max_action = max_action
def select_action(self, state):
state = torch.Tensor(state.reshape(1, -1)).to(device)
return self.actor(state).cpu().data.numpy().flatten()
def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
for it in range(iterations):
# Step 4: We sample a batch of transitions (s, s’, a, r) from the memory
batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
state = torch.Tensor(batch_states).to(device)
next_state = torch.Tensor(batch_next_states).to(device)
action = torch.Tensor(batch_actions).to(device)
reward = torch.Tensor(batch_rewards).to(device)
done = torch.Tensor(batch_dones).to(device)
# Step 5: From the next state s’, the Actor target plays the next action a’
next_action = self.actor_target(next_state)
# Step 6: We add Gaussian noise to this next action a’ and we clamp it in a range of values supported by the environment
noise = torch.Tensor(batch_actions).data.normal_(0, policy_noise).to(device)
noise = noise.clamp(-noise_clip, noise_clip)
next_action = (next_action + noise).clamp(-self.max_action, self.max_action)
# Step 7: The two Critic targets take each the couple (s’, a’) as input and return two Q-values Qt1(s’,a’) and Qt2(s’,a’) as outputs
target_Q1, target_Q2 = self.critic_target(next_state, next_action)
# Step 8: We keep the minimum of these two Q-values: min(Qt1, Qt2)
target_Q = torch.min(target_Q1, target_Q2)
# Step 9: We get the final target of the two Critic models, which is: Qt = r + γ * min(Qt1, Qt2), where γ is the discount factor
target_Q = reward + ((1 - done) * discount * target_Q).detach()
# Step 10: The two Critic models take each the couple (s, a) as input and return two Q-values Q1(s,a) and Q2(s,a) as outputs
current_Q1, current_Q2 = self.critic(state, action)
# Step 11: We compute the loss coming from the two Critic models: Critic Loss = MSE_Loss(Q1(s,a), Qt) + MSE_Loss(Q2(s,a), Qt)
critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
# Step 12: We backpropagate this Critic loss and update the parameters of the two Critic models with a SGD optimizer
self.critic_optimizer.zero_grad()
critic_loss.backward()
self.critic_optimizer.step()
# Step 13: Once every two iterations, we update our Actor model by performing gradient ascent on the output of the first Critic model
if it % policy_freq == 0:
actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
self.actor_optimizer.zero_grad()
actor_loss.backward()
self.actor_optimizer.step()
# Step 14: Still once every two iterations, we update the weights of the Actor target by polyak averaging
for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
# Step 15: Still once every two iterations, we update the weights of the Critic target by polyak averaging
for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
# Making a save method to save a trained model
def save(self, filename, directory):
torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))
# Making a load method to load a pre-trained model
def load(self, filename, directory):
self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename)))
def evaluate_policy(policy, eval_episodes=10):
avg_reward = 0.
for _ in range(eval_episodes):
obs = env.reset()
done = False
while not done:
action = policy.select_action(np.array(obs))
obs, reward, done, _ = env.step(action)
avg_reward += reward
avg_reward /= eval_episodes
print ("---------------------------------------")
print ("Average Reward over the Evaluation Step: %f" % (avg_reward))
print ("---------------------------------------")
return avg_reward
env_name = "Pygame-v0"
seed = 0
file_name = "%s_%s_%s" % ("TD3", env_name, str(seed))
print ("---------------------------------------")
print ("Settings: %s" % (file_name))
print ("---------------------------------------")
eval_episodes = 10
save_env_vid = True
env = gym.make(env_name)
max_episode_steps = env._max_episode_steps
if save_env_vid:
env = wrappers.Monitor(env, monitor_dir, force = True)
env.reset()
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
policy = TD3(state_dim, action_dim, max_action)
#policy.load(file_name, './pytorch_models/')
policy.load(file_name,"/content/gdrive/My Drive/reinforce/gym_game/pytorch_models")
_ = evaluate_policy(policy, eval_episodes=eval_episodes)
Traceback:
I am facing a runtime error while loading the state_dict for actor model .I searched google but couldnt find similar issues .
RuntimeError: Error(s) in loading state_dict for Actor:
Missing key(s) in state_dict: "layer_1.weight", "layer_1.bias", "layer_2.weight", "layer_2.bias", "layer_3.weight", "layer_3.bias".
Unexpected key(s) in state_dict: "encoder.0.weight", "encoder.0.bias", "encoder.2.weight", "encoder.2.bias", "encoder.2.running_mean", "encoder.2.running_var", "encoder.2.num_batches_tracked", "encoder.3.weight", "encoder.3.bias", "encoder.5.weight", "encoder.5.bias", "encoder.5.running_mean", "encoder.5.running_var", "encoder.5.num_batches_tracked", "encoder.6.weight", "encoder.6.bias", "encoder.8.weight", "encoder.8.bias", "encoder.8.running_mean", "encoder.8.running_var", "encoder.8.num_batches_tracked", "encoder.10.weight", "encoder.10.bias", "encoder.12.weight", "encoder.12.bias", "encoder.12.running_mean", "encoder.12.running_var", "encoder.12.num_batches_tracked", "encoder.13.weight", "encoder.13.bias", "encoder.15.weight", "encoder.15.bias", "encoder.15.running_mean", "encoder.15.running_var", "encoder.15.num_batches_tracked", "encoder.16.weight", "encoder.16.bias", "linear.0.weight", "linear.0.bias", "linear.2.weight", "linear.2.bias".
it was answered by #MicaelJungo
The weights you saved were not from the model you are using here. Make sure to load the correct checkpoint, which was created when training this particular model.

How do I reduce memory usage for deep reinforcement learning algorithms?

I wrote a script of DQN to play BreakoutDeterministic and ran it on my school GPU server. However, it seems that the code is taking up 97% of the total RAM memory (more than 100GB)!
I would like to know which part of the script is demanding this high usage of RAM? I used memory-profiler for 3 episodes and it seems that the memory requirement increases linearly with each time step on my laptop.
I wrote the script in PyCharm, python 3.6. My laptop 12GB RAM with no GPU but the school server is using Ubuntu, p100 GPU.
import gym
import numpy as np
import random
from collections import deque
from keras.layers import Dense, Input, Lambda, convolutional, core
from keras.models import Model
from keras.optimizers import Adam
import matplotlib.pyplot as plt
import os
import time as dt
plt.switch_backend('agg')
def preprocess(state):
process_state = np.mean(state, axis=2).astype(np.uint8)
process_state = process_state[::2, ::2]
process_state_size = list(process_state.shape)
process_state_size.append(1)
process_state = np.reshape(process_state, process_state_size)
return process_state
class DQNAgent:
def __init__(self, env):
self.env = env
self.action_size = env.action_space.n
self.state_size = self.select_state_size()
self.memory = deque(maxlen=1000000) # specify memory size
self.gamma = 0.99
self.eps = 1.0
self.eps_min = 0.01
self.decay = 0.95
self.lr = 0.00025
self.start_life = 5 # get from environment
self.tau = 0.125 # special since 2 models to be trained
self.model = self.create_cnnmodel()
self.target_model = self.create_cnnmodel()
def select_state_size(self):
process_state = preprocess(self.env.reset())
state_size = process_state.shape
return state_size
def create_cnnmodel(self):
data_input = Input(shape=self.state_size, name='data_input', dtype='int32')
normalized = Lambda(lambda x: x/255)(data_input)
conv1 = convolutional.Convolution2D(32, 8, strides=(4, 4), activation='relu')(normalized)
conv2 = convolutional.Convolution2D(64, 4, strides=(2,2), activation='relu')(conv1)
conv3 = convolutional.Convolution2D(64, 3, strides=(1,1), activation='relu')(conv2)
conv_flatten = core.Flatten()(conv3) # flatten to feed cnn to fc
h4 = Dense(512, activation='relu')(conv_flatten)
prediction_output = Dense(self.action_size, name='prediction_output', activation='linear')(h4)
model = Model(inputs=data_input, outputs=prediction_output)
model.compile(optimizer=Adam(lr=self.lr),
loss='mean_squared_error') # 'mean_squared_error') keras.losses.logcosh(y_true, y_pred)
return model
def remember(self, state, action, reward, new_state, done): # store past experience as a pre-defined table
self.memory.append([state, action, reward, new_state, done])
def replay(self, batch_size):
if batch_size > len(self.memory):
return
all_states = []
all_targets = []
samples = random.sample(self.memory, batch_size)
for sample in samples:
state, action, reward, new_state, done = sample
target = self.target_model.predict(state)
if done:
target[0][action] = reward
else:
target[0][action] = reward + self.gamma*np.max(self.target_model.predict(new_state)[0])
all_states.append(state)
all_targets.append(target)
history = self.model.fit(np.vstack(all_states), np.vstack(all_targets), epochs=1, verbose=0)
return history
def act(self, state):
self.eps *= self.decay
self.eps = max(self.eps_min, self.eps)
if np.random.random() < self.eps:
return self.env.action_space.sample()
return np.argmax(self.model.predict(state)[0])
def train_target(self):
weights = self.model.get_weights()
target_weights = self.target_model.get_weights()
for i in range(len(target_weights)):
target_weights[i] = (1-self.tau)*target_weights[i] + self.tau*weights[i]
self.target_model.set_weights(target_weights) #
def main(episodes):
env = gym.make('BreakoutDeterministic-v4')
agent = DQNAgent(env, cnn)
time = env._max_episode_steps
batch_size = 32
save_model = 'y'
filepath = os.getcwd()
date = dt.strftime('%d%m%Y')
clock = dt.strftime('%H.%M.%S')
print('++ Training started on {} at {} ++'.format(date, clock))
start_time = dt.time()
tot_r = []
tot_loss = []
it_r = []
it_loss = []
tot_frames = 0
for e in range(episodes):
r = []
loss = []
state = env.reset()
state = preprocess(state)
state = state[None,:]
current_life = agent.start_life
for t in range(time):
if rend_env == 'y':
action = agent.act(state)
new_state, reward, terminal_life, life = env.step(action)
new_state = preprocess(new_state)
new_state = new_state[None,:]
if life['ale.lives'] < current_life:
reward = -1
current_life = life['ale.lives']
agent.remember(state, action, reward, new_state, terminal_life)
hist = agent.replay(batch_size)
agent.train_target()
state = new_state
r.append(reward)
tot_frames += 1
if hist is None:
loss.append(0.0)
else:
loss.append(hist.history['loss'][0])
if t%20 == 0:
print('Frame : {}, Cum Reward = {}, Avg Loss = {}, Curr Life: {}'.format(t,
np.sum(r),
round(np.mean(loss[-20:-1]),3),
current_life))
agent.model.save('{}/Mod_Fig/DQN_BO_model_{}.h5'.format(filepath, date))
agent.model.save_weights('{}/Mod_Fig/DQN_BO_weights_{}.h5'.format(filepath, date))
if current_life == 0 or terminal_life:
print('Episode {} of {}, Cum Reward = {}, Avg Loss = {}'.format(e, episodes, np.sum(r), np.mean(loss)))
break
tot_r.append(np.sum(r))
tot_loss.append(np.mean(loss))
it_r.append(r)
it_loss.append(loss)
print('Training ended on {} at {}'.format(date, clock))
run_time = dt.time() - start_time
print('Total Training time: %d Hrs %d Mins $d s' % (run_time // 3600, (run_time % 3600) // 60),
(run_time % 3600) % 60 // 1)
if save_model == 'y':
agent.model.save('{}/Mod_Fig/DQN_BO_finalmodel_{}_{}.h5'.format(filepath, date, clock))
agent.model.save_weights('{}/Mod_Fig/DQN_BO_finalweights_{}_{}.h5'.format(filepath, date, clock))
agent.model.summary()
return tot_r, tot_loss, it_r, it_loss, tot_frames
if __name__ == '__main__':
episodes = 3
total_reward, total_loss, rewards_iter, loss_iter, frames_epi = main(episodes=episodes)
Would really appreciate your comments and help on writing memory and speed efficient deep RL codes! I hope to train my DQN on breakout for 5000 episodes but the remote server only allows maximum of 48 hours of training. Thanks in advance!
It sounds like you have a memory leak.
This line
agent.remember(state, action, reward, new_state, terminal_life)
gets called 5000 * env._max_episode_steps times, and each state is a (210, 160, 3) array. The first thing to try would be to reduce the size of self.memory = deque(maxlen=1000000) # specify memory size to verify that this is the sole cause.
If you really believe you need that much capacity, you should dump self.memory to disk and keep a only a small subsample in memory.
Additionally: subsampling from deque is very slow, deque is implemented as a linked list so each subsample is O(N*M). You should consider implementing your own ring buffer for self.memory.
Alternatively: you might consider a probabilistic buffer (I don't know the proper name), where each time you would append to a full buffer, remove an element at random and append the new element. This means any (state, action, reward, ...) tuple that is encountered has a nonzero probability of being contained in the buffer, with recent tuples being more likely than older ones.
I had similar problems with memory and I still do.
The main cause of the large memory consumption are the states. But here's what I did to make it better:
Step 1: Resize them to a 84 x 84 sample using openCV. Some people instead downsample the images to 84 x 84. This results in each state having the shape (84,84,3).
Step 2: Convert these frames to grayscale (basically, black and white). This should change the shape to (84,84,1).
Step 3: Use dtype=np.uint8 for storing states. They consume minimal memory and are perfect for the pixel intensity values ranged 0-255.
Additional Info
I run my code on free Google Collab notebooks (K80 Tesla GPU and 13GB RAM), periodically saving the replay buffer to my drive.
For steps 1 and 2, consider using the OpenAI baseline Atari wrappers, as there is no point in reinventing the wheel.
You could also this snippet to check the amount of RAM used by your own program at each step, like I did:
import os
import psutil
def show_RAM_usage(self):
py = psutil.Process(os.getpid())
print('RAM usage: {} GB'.format(py.memory_info()[0]/2. ** 30))
This snippet is modified to use in my own program from the original answer

KNN doesn't classify correctly

I'm building my own 1-NN clasifier in python cause I need max speed in certain operations for testing it, because I want to use it in genetic algorithm and every milisecond its important for speed.
I'm trying to implement a leave one out test inside of my KNN class with numpy, but I obtain about 50% of success with this test. I try the scikit learn knn with the same leave one out and returns about 97% of success.
This is my KNN class:
class KNN(object):
"""Documentation for KNK-clasifier"""
def __init__(self):
super(KNN, self).__init__()
# self.args = args
def fit(self, entrenamiento, clases):
self.entrenamiento = np.asarray(entrenamiento)
self.n_examples = len(self.entrenamiento)
self.n_features = len(self.entrenamiento[1])
self.clases = np.asarray(clases)
self.createDistenceMatrix()
def createDistenceMatrix(self):
self.distances = np.zeros([len(self.entrenamiento),
len(self.entrenamiento),
len(self.entrenamiento[1])])
for i in range(self.n_examples):
for j in range(self.n_examples):
if i is not j:
self.distances[i][j] = self.distance(self.entrenamiento[i],
self.entrenamiento[j])
else:
self.distances[i][j] = np.full(len(self.entrenamiento[1]),
10000.0)
def distance(self, x, y):
return (x-y)*(x-y)
def predict(self, test, pesos=None):
dist = 100000
class_index = 0
for i in range(self.n_examples):
aux = self.distance(self.entrenamiento[i], test)
if pesos is not None:
aux = pesos*aux
if aux < dist:
dist = aux
class_index = i
return self.clases[class_index]
def leave_one_out(self, pesos=None):
# DONE: solo tengo que buscar el minimo de cada columna
dist = np.zeros(self.n_examples)
aciertos = 0
for i in range(self.n_examples):
for j in range(self.n_examples):
if pesos is not None:
dist[i] = np.linalg.norm(
np.multiply(self.distances[i][j], pesos))
else:
dist[i] = np.linalg.norm(self.distances[i][j])
if self.clases[i] == self.clases[np.argmin(dist)]:
aciertos = aciertos + 1
return 100*(aciertos/self.n_examples)
where create createDistanceMatrix precalculate all the possible x,y distances for all the features and save it to a vector. This vector will be multiplied for a weitght vector. This vector represent a feature weights learning problem that I'm trying to solve. I passed two days trying to find where the mistake is but I cand find, but my clasifier doesn't give me a decent percent of good classification in leave one out.
For sklearn knn this is the leave one out that I'm testing:
aciertos = 0
knn = neighbors.KNeighborsClassifier(n_neighbors=1)
start = time.clock()
for i in range(len(train)):
knn.fit(train[1:], cls[1:])
if knn.predict(train[0])[0] == cls[0]:
aciertos = aciertos + 1
train[0], train[-1] = train[-1], train[0]
cls[0], cls[-1] = cls[-1], cls[0]
end = time.clock()
print(str(end - start) + " segundos")
print(str(100*(aciertos/len(train))))
this same code with my own clasifier returns similar percent of succes.
I don't know if you fixed your problem but your distance looks wrong?
Here is the algorithm from Stanford cs231n:

Resources