LibTorch: using DeepLab model gives segfault on forward() - pytorch

I am trying to run images through the DeepLab model in LibTorch to segment them. Using PyTorch, I am converting the DeepLabv3 model like this:
import torch
import torchvision
from torchvision import models

deeplap_model = models.segmentation.deeplabv3_resnet101(pretrained=True)
deeplap_model.eval()

class wrapper(torch.nn.Module):
    def __init__(self, model):
        super(wrapper, self).__init__()
        self.model = model

    def forward(self, input):
        results = []
        output = self.model(input)
        for k, v in output.items():
            results.append(v)
        return tuple(results)

model = wrapper(deeplap_model)
example = torch.rand(1, 3, 224, 224)
# Use torch.jit.trace to generate a torch.jit.ScriptModule via tracing.
traced_script_module = torch.jit.trace(model, example)
traced_script_module.save("model.pt")
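A quick sanity check (a sketch, not part of the original script): running the traced wrapper once in Python confirms that it returns a tuple of tensors, since deeplabv3_resnet101 outputs a dict (typically 'out' and 'aux') that the wrapper above converts to a tuple.
# Sketch: inspect the traced module's output structure before moving to C++.
with torch.no_grad():
    outputs = traced_script_module(example)
print(type(outputs))               # expected: a tuple of tensors
print([o.shape for o in outputs])  # e.g. (1, 21, 224, 224) for the 'out' head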
Now, in C++ with LibTorch, I am trying to load the model and run data through it. This, however, fails:
std::shared_ptr<torch::jit::script::Module> module = torch::jit::load("model.pt");
module->to(torch::kCUDA);
assert(module != nullptr);
std::cout << "ok\n";
std::vector<torch::jit::IValue> inputs;
cv::Mat image;
image = cv::imread("pic.jpeg", 1);
cv::Mat image_resized;
cv::resize(image, image_resized, cv::Size(224, 224));
cv::cvtColor(image_resized, image_resized, cv::COLOR_BGR2RGB);
cv::Mat image_resized_float;
image_resized.convertTo(image_resized_float, CV_32F, 1.0 / 255);
auto img_tensor = torch::from_blob(image_resized_float.data, {1, 224, 224, 3}, torch::kFloat32);
cout << "img tensor loaded..\n";
img_tensor = img_tensor.permute({0, 3, 1, 2});
img_tensor[0][0] = img_tensor[0][0].sub(0.485).div(0.229);
img_tensor[0][1] = img_tensor[0][1].sub(0.456).div(0.224);
img_tensor[0][2] = img_tensor[0][2].sub(0.406).div(0.225);
// to GPU
img_tensor = img_tensor.to(at::kCUDA);
torch::Tensor out_tensor2 = module->forward({img_tensor}).toTensor(); // SEGFAULT
Where am I going wrong here?
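One thing worth checking (a sketch, not a confirmed fix): because the wrapper traced above returns a tuple, forward() in C++ yields an IValue holding a tuple, so calling toTensor() on it directly is invalid; unpacking the tuple first would look roughly like this:
// Sketch, assuming the first tuple element corresponds to the 'out' tensor.
auto output = module->forward({img_tensor});
if (output.isTuple()) {
    torch::Tensor out_tensor = output.toTuple()->elements()[0].toTensor();
    std::cout << out_tensor.sizes() << std::endl;
}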

Related

Use bounding box model with a webcam

I have a model trained using the following code:
https://jovian.ml/aakanksha-ns/road-signs-bounding-box-prediction/v/2?utm_source=embed
Then I saved the model using the following code:
torch.save(model.state_dict(), 'checkpoint.pth')
How can I use the saved model with a webcam? I created the code below, but it only works for a plain classification model, not for the combined bounding-box and classification model.
from PIL import Image
import time
#Load the saved model
# Create a neural net class
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch.nn
import torchvision
import PIL
import cv2
from torchvision import models

class BB_model(nn.Module):
    def __init__(self):
        super(BB_model, self).__init__()
        resnet = models.resnet34(pretrained=True)
        layers = list(resnet.children())[:8]
        self.features1 = nn.Sequential(*layers[:6])
        self.features2 = nn.Sequential(*layers[6:])
        self.classifier = nn.Sequential(nn.BatchNorm1d(512), nn.Linear(512, 4))
        self.bb = nn.Sequential(nn.BatchNorm1d(512), nn.Linear(512, 4))

    def forward(self, x):
        x = self.features1(x)
        x = self.features2(x)
        x = F.relu(x)
        x = nn.AdaptiveAvgPool2d((1, 1))(x)
        x = x.view(x.shape[0], -1)
        return self.classifier(x), self.bb(x)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")  ##Assigning the Device which will do the calculation
model = BB_model()
model.load_state_dict(torch.load("checkpoint_bb.pth"))
model = model.to(device)
model.eval()

video = cv2.VideoCapture(0)
# used to record the time when we processed last frame
prev_frame_time = 0
# used to record the time at which we processed current frame
new_frame_time = 0
encoder = {0: "Standing"}

# Let's preprocess the inputted frame
data_transforms = torchvision.transforms.Compose([
    torchvision.transforms.Resize(size=(128, 128)),
    torchvision.transforms.RandomHorizontalFlip(),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

def argmax(prediction):
    prediction = prediction.cpu()
    prediction = prediction.detach().numpy()
    top_1 = np.argmax(prediction, axis=1)
    score = np.amax(prediction)
    score = '{:6f}'.format(score)
    prediction = top_1[0]
    result = encoder[prediction]
    return result, score

def preprocess(image):
    image = PIL.Image.fromarray(image)  # Webcam frames are numpy array format,
                                        # therefore transform back to PIL image
    print(image)
    image = data_transforms(image)
    image = image.float()
    #image = Variable(image, requires_autograd=True)
    image = image.unsqueeze(0)  # I don't know for sure, but the Resnet model seems to only
                                # accept 4-D tensors, so we need to add a batch dimension
    return image

while True:
    _, frame = video.read()
    image = frame[100:450, 150:570]
    image_data = preprocess(image)
    print(image_data)
    prediction = model(image_data)
    result, score = argmax(prediction)
    font = cv2.FONT_HERSHEY_SIMPLEX
    cv2.putText(frame,
                f"{result}",
                (50, 50),
                font, 1,
                (0, 0, 255),
                2,
                cv2.LINE_4)
    # time when we finish processing for this frame
    new_frame_time = time.time()
    fps = 1 / (new_frame_time - prev_frame_time)
    prev_frame_time = new_frame_time
    cv2.putText(frame,
                f"FPS: {round(fps, 1)}",
                (50, 80),
                font, 1,
                (255, 255, 0),
                2,
                cv2.LINE_4)
    cv2.imshow("Capturing", frame)
    key = cv2.waitKey(1)
    if key == ord('q'):
        break

video.release()
cv2.destroyAllWindows()
When I run the above code, I receive the following error:
prediction = prediction.cpu()
AttributeError: 'tuple' object has no attribute 'cpu'
Since my code was built for standard classification and not for bounding boxes, the argmax function is not working, and I need help on how to change it. Thank you.
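A sketch of what the change could look like (assuming, as in BB_model.forward above, that the first element of the returned tuple is the classification output and the second the bounding-box output):
# Sketch: unpack the (classification, bounding box) tuple before calling argmax.
class_pred, bb_pred = model(image_data)
result, score = argmax(class_pred)            # classification head only
bb = bb_pred.detach().cpu().numpy()[0]        # four bounding-box values, e.g. for cv2.rectangle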

Onnx RuntimeError NOT_IMPLEMENTED Trilu

This model works in PyTorch; however, after exporting it to ONNX format, ONNX Runtime crashes with a 'Trilu NOT_IMPLEMENTED' error when loading it. (I do not have this issue with my other models that use torch.tril().)
How do I make this model run in ONNX Runtime?
This is a visualisation of the ONNX graph of the model.
The model in PyTorch:
import torch
import torch.nn as nn
import onnx
import onnxruntime as ort

class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()

    def forward(self, item_seq):
        attention_mask = item_seq < 100
        tril_mask = torch.tril(attention_mask)
        query_layer = torch.rand((1, 2, 2, 32))
        key_layer = torch.rand((1, 2, 32, 2))
        attention_scores = torch.matmul(query_layer, key_layer)
        return attention_scores + tril_mask

model = MyModel()
model.eval()
x_train = torch.ones([1, 2], dtype=torch.long)

# demonstrate that eager works
print(model.forward(x_train))

bigmodel_onnx_filename = 'mymodel.onnx'
torch.onnx.export(
    model,
    x_train,
    bigmodel_onnx_filename,
    input_names=['x'],
    output_names=['output'],
)

onnx.load(bigmodel_onnx_filename)

# Onnxruntime crashes when loading in the model
ort_sess = ort.InferenceSession(bigmodel_onnx_filename, providers=['CPUExecutionProvider'])
key = {'x': x_train.numpy()}
print(ort_sess.run(None, key))
This results in the following error for ort.InferenceSession():
NotImplemented: [ONNXRuntimeError] : 9 : NOT_IMPLEMENTED : Could not find an implementation for Trilu(14) node with name '/net/Trilu'
How can I make this model run in ONNX Runtime?
GitHub: code to reproduce the error and the model.onnx file: https://github.com/bkersbergen/pytorch_onnx_runtime_error/blob/main/main.py
I'm using Python 3.9; these are the project requirements:
torch==1.13.1
jupyter==1.0.0
onnxruntime==1.13.1
onnx==1.13.0
Torch nightly version 2.0.0.dev20230205 gave the same error.
I then decided to implement my own tril function:
class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()

    def forward(self, item_seq):
        attention_mask = item_seq < 100
        tril_mask = self.my_tril(attention_mask)
        query_layer = torch.rand((1, 2, 2, 32))
        key_layer = torch.rand((1, 2, 32, 2))
        attention_scores = torch.matmul(query_layer, key_layer)
        return attention_scores + tril_mask

    def my_tril(self, x):
        l = x.size(-1)
        arange = torch.arange(l)
        mask = arange.expand(l, l)
        arange = arange.unsqueeze(-1)
        mask = torch.le(mask, arange)
        return x.masked_fill(mask == 0, 0)
but then I get a Where(9) node with name '/Where_1' NOT_IMPLEMENTED error instead. (?!)
The boolean output of torch.lt() as input for torch.tril() works in PyTorch's eager and JIT modes; however, it breaks ONNX Runtime with the 'Trilu NOT_IMPLEMENTED' error.
I was able to work around it by casting the torch.tril() input to float():
class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()

    def forward(self, item_seq):
        attention_mask = torch.lt(item_seq, 100).float()
        tril_mask = torch.tril(attention_mask)
        query_layer = torch.rand((1, 2, 2, 32))
        key_layer = torch.rand((1, 2, 32, 2))
        attention_scores = torch.matmul(query_layer, key_layer)
        return attention_scores + tril_mask
Based on this experience, my hypothesis is that the Trilu NOT_IMPLEMENTED error only applies when the input is a boolean tensor. ONNX Runtime throws this generic Trilu NOT_IMPLEMENTED error, which made me believe that it has no Trilu support at all, which is clearly not the case.
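For completeness, a minimal sketch of re-exporting and running the float-cast variant with the same export and session calls as earlier in this question (same filename and input assumed):
# Sketch: export the float-cast model and run it in ONNX Runtime as before.
import torch
import onnxruntime as ort

model = MyModel()  # the float-cast version above
model.eval()
x_train = torch.ones([1, 2], dtype=torch.long)
torch.onnx.export(model, x_train, 'mymodel.onnx',
                  input_names=['x'], output_names=['output'])
ort_sess = ort.InferenceSession('mymodel.onnx', providers=['CPUExecutionProvider'])
print(ort_sess.run(None, {'x': x_train.numpy()}))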

Make multiple parallel predictions on a TensorFlow model

I want to make multiple predictions.
I have trained a segmentation model (images and masks). You can find the model here.
The images have dimensions (32, 32, 3), the masks (32, 32).
What I do when I want to run inference is:
Load the images array (tiles) with shape (62500, 32, 32, 3). You can find it here.
Create a TensorFlow dataset from this array.
Then predict on each image, like:
masks = []
for k, element in enumerate(the_image_array):
    the_img = np.asarray(np.expand_dims(element, 0))[-1, -1, :, :]
    pred = model.predict(the_img[np.newaxis, :, :, :])[0]
    mask = tf.where(pred > 0.5, 255, 0)
    masks.append(mask)
Now, I want to do these predictions in parallel.
So, I tried:
import tensorflow as tf
import numpy as np
import os
from tensorflow.keras.models import load_model
from itertools import chain
from tensorflow.keras import backend as K
import multiprocessing
from multiprocessing import Pool

os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
multiprocessing.set_start_method('spawn', force=True)

model = load_model('./model.h5',
                   custom_objects={"K": K})

def resize_and_rescale(image):
    image = tf.image.resize(image,
                            (32, 32),
                            preserve_aspect_ratio=True)
    image /= 255.0
    return image

def prepare(ds):
    ds = ds.map(resize_and_rescale)
    return ds

def _apply_df(data):
    img = np.asarray(np.expand_dims(data, 0))[-1, -1, :, :]
    print(img.shape)
    pred = model.predict(img[np.newaxis, :, :, :], verbose=2)[0]
    #pred = model.predict(data)[0]
    mask = tf.where(pred[:, :, -1] > 0.5, 255, 0)
    return mask

def apply_by_multiprocessing(data, workers):
    pool = Pool(processes=workers)
    #result = pool.map(_apply_df, np.array_split(list(data.as_numpy_iterator()), workers))
    result = pool.map(_apply_df, data.batch(np.ceil(len(data) / workers)))
    pool.close()
    return list(result)

def after_prepare(data):
    tens_data = tf.data.Dataset.from_tensor_slices(data)
    tens_data = prepare(tens_data)
    return tens_data

def main():
    tiles = np.load('tiles.npy')
    print(len(tiles))
    print(tiles[0].shape)
    prep = after_prepare(tiles)
    print(len(prep))
    masks = apply_by_multiprocessing(prep, workers=4)
    masks_flatten = list(chain.from_iterable(masks))
    print(len(masks_flatten), masks_flatten[0].shape)
    return masks_flatten

if __name__ == "__main__":
    masks_flatten = main()
len(masks_flatten) is 128 and the shape of each element is (32,).
I would expect len=62500 and every element (mask) to have shape (32, 32).
--- UPDATE ---
So, I want something like this:
def _apply_df(data):
    results = []
    for el in data:
        pred = model.predict(el[np.newaxis, :, :, :], verbose=2)[0]
        mask = tf.where(pred[:, :, -1] > 0.5, 255, 0)
        results.append(mask)
    return results
but without using the loop, doing it in parallel.
Your approach is not incorrect, but even inside a single worker, it's better to let the TensorFlow/NumPy vectorization do its job instead of writing an explicit for loop:
def _apply_df(data):
    pred = model.predict(data)
    mask = tf.where(pred.squeeze(axis=-1) > 0.5, 255, 0)
    return mask
This is the complete code:
import tensorflow as tf
import numpy as np
import os
from tensorflow.keras.models import load_model
from itertools import chain
from tensorflow.keras import backend as K
import multiprocessing
from multiprocessing import Pool

os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
multiprocessing.set_start_method("spawn", force=True)

model = load_model("./model.h5", custom_objects={"K": K})

def resize_and_rescale(image):
    image = tf.image.resize(image, (32, 32), preserve_aspect_ratio=True)
    image /= 255.0
    return image

def prepare(ds):
    ds = ds.map(resize_and_rescale)
    return ds

def _apply_df(data):
    pred = model.predict(data)
    mask = tf.where(pred.squeeze(axis=-1) > 0.5, 255, 0)
    return mask

def apply_by_multiprocessing(data, workers):
    pool = Pool(processes=workers)
    # result = pool.map(_apply_df, np.array_split(list(data.as_numpy_iterator()), workers))
    # the batch size must be an integer, so round up and cast
    result = pool.map(_apply_df, data.batch(int(np.ceil(len(data) / workers))))
    pool.close()
    return list(result)

def after_prepare(data):
    tens_data = tf.data.Dataset.from_tensor_slices(data)
    tens_data = prepare(tens_data)
    return tens_data

def main():
    tiles = np.load("tiles.npy")
    prep = after_prepare(tiles)
    masks = apply_by_multiprocessing(prep, workers=4)
    masks_flatten = list(chain.from_iterable(masks))
    print(len(masks_flatten), masks_flatten[0].shape)  # 62500 (32, 32)
    return masks_flatten

if __name__ == "__main__":
    masks_flatten = main()

Torch to ONNX, but missing input?

I'm trying to convert a PyTorch model to ONNX. First I have a demo, but different numeric values in the dummy input result in different ONNX models.
code:
import torch
from torch import nn
from openvino.runtime import Core

class TestNet(nn.Module):
    def __init__(self, x):
        super(TestNet, self).__init__()
        self.x = x

    def forward(self, x: torch.Tensor):
        y, _ = x.min(dim=1, keepdim=True)
        if y < self.x:
            return torch.zeros((1, 1))
        return y

dummy_input = torch.zeros((1, 5))
model = TestNet(1)
onnx_path = "./test_net.onnx"
torch.onnx.export(
    model,
    (dummy_input,),
    onnx_path,
    opset_version=11,
    do_constant_folding=False,
    input_names=["x"],
    output_names=["y"],
)

ie = Core()
model_onnx = ie.read_model(model=onnx_path)
compiled_model_onnx = ie.compile_model(model=model_onnx, device_name="CPU")
output_layer_onnx = compiled_model_onnx.output(0)
res_onnx1 = compiled_model_onnx([torch.zeros((1, 5)) + 10])[output_layer_onnx]
res_onnx2 = compiled_model_onnx([torch.zeros((1, 5))])[output_layer_onnx]
print(res_onnx1, res_onnx2)
It could have worked, but the exported ONNX model has no input node; it looks as follows:
<Model: 'torch-jit-export'
inputs[
]
outputs[
<ConstOutput: names[y] shape{1,1} type: f32>
]>
But if I use dummy_input = torch.zeros((1, 5)) + 1, the ONNX model does have an input node, and the output is as follows:
[[10.]] [[0.]]
The code is the same, but dummy_input is different. I don't know why. By the way, torch.where is not what I need.
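A sketch that may illustrate what is happening (assumption: torch.onnx.export traces the model, so the Python if is evaluated once with the given dummy_input and only the branch that was taken ends up in the graph):
# Sketch: compare the traced graphs for the two dummy inputs.
traced_zero = torch.jit.trace(TestNet(1), torch.zeros((1, 5)))
print(traced_zero.graph)   # y < self.x branch taken: output is a constant, the input is unused
traced_one = torch.jit.trace(TestNet(1), torch.zeros((1, 5)) + 1)
print(traced_one.graph)    # the min() path is recorded, so the graph actually uses the input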

Get InvalidArgumentError when using tf.image.resize_bilinear in Keras in a multi-GPU environment

I use tf.image.resize_bilinear in a segmentation network. It seems this function is not supported by the multi-GPU model. The following code shows a simplified situation (which can be run directly):
import os
os.environ["CUDA_VISIBLE_DEVICES"] = '0, 1'
from keras.backend.tensorflow_backend import set_session
from keras import backend as K
from keras.utils import multi_gpu_model
from keras.applications.mobilenet_v2 import preprocess_input
import tensorflow as tf
import numpy as np

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.allow_soft_placement = True
sess = tf.Session(config=config)
set_session(sess)

batch = 4
num_classes = 2
size = 128

K.clear_session()

def _GetRandomImg():
    shape = (batch, size, size, 3)
    img = np.random.randint(low=0, high=256, size=shape)
    return preprocess_input(img)

def _GetRandomLabel():
    shape = (batch, size, size, num_classes)
    label = np.random.randint(low=0, high=num_classes, size=shape)
    label = np.exp(label)
    label = label / np.sum(label, axis=-1, keepdims=True)
    return label

def DataGen():
    while True:
        x = _GetRandomImg()
        y = _GetRandomLabel()
        yield x, y

from keras.layers import Input, Conv2D, Lambda
from keras import Model

def GetModel():
    inputs = Input(shape=(size, size, 3))
    f = lambda x: tf.image.resize_bilinear(inputs, (size, size), align_corners=True)
    x = Lambda(f, output_shape=(size, size, 3))(inputs)
    outputs = Conv2D(num_classes, kernel_size=3, padding='same')(x)
    model = Model(inputs=[inputs], outputs=[outputs])
    return model

gen = DataGen()
with tf.device('/cpu:0'):
    model = GetModel()
model = multi_gpu_model(model, gpus=2)
model.compile(loss='categorical_crossentropy', optimizer='sgd')
result = model.fit_generator(gen, epochs=2, verbose=1, steps_per_epoch=100)
It works fine in a single-GPU environment, but in the multi-GPU environment I get the following error:
InvalidArgumentError: Incompatible shapes: [3,128,128,2] vs. [6,128,128,2]
[[{{node loss/conv2d_1_loss/categorical_crossentropy/mul}}]]
[[{{node training/SGD/gradients/conv2d_1_1/concat_grad/Slice_1}}]]
The problem is solved. If a TensorFlow function is used in a customized Lambda layer, you need to explicitly call set_shape() on the result:
def MyResizeBilinear(x, height, width):
    rows, cols = 1, 2
    original_shape = K.int_shape(x)
    new_shape = tf.constant(np.array([height, width], dtype='int32'))
    x = tf.image.resize_bilinear(x, new_shape, align_corners=True)
    new_height = None if original_shape[rows] is None else height
    new_width = None if original_shape[cols] is None else width
    output_shape = (None, new_height, new_width, None)
    x.set_shape(output_shape)
    return x
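A sketch of how this helper could replace the original Lambda in GetModel (same size and num_classes as in the question; note it resizes its argument instead of closing over inputs):
def GetModel():
    inputs = Input(shape=(size, size, 3))
    x = Lambda(lambda t: MyResizeBilinear(t, size, size),
               output_shape=(size, size, 3))(inputs)
    outputs = Conv2D(num_classes, kernel_size=3, padding='same')(x)
    return Model(inputs=[inputs], outputs=[outputs])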
