LibTorch, using deeplab model gives segfault on forwarding - pytorch

I am trying to run images through the DeepLab model in LibTorch to segment them. Using PyTorch, I am converting the DeepLabv3 model like this:
import torch
import torchvision
from torchvision import models
deeplap_model = models.segmentation.deeplabv3_resnet101(pretrained=True)
class wrapper(torch.nn.Module):
    """Adapt a mapping-returning model so that its forward returns a tuple.

    The wrapped model's forward result is treated as a mapping (it is read
    through its key/value pairs); this wrapper returns the values, in the
    mapping's iteration order, as a plain tuple.

    Args:
        model: callable whose result is a mapping of name -> value.
    """

    def __init__(self, model):
        super(wrapper, self).__init__()
        self.model = model

    def forward(self, input):
        """Run the wrapped model and return its output values as a tuple."""
        output = self.model(input)
        # The loop over output.items() had no body, so nothing was ever
        # collected; take the values directly instead.
        return tuple(output.values())
model = wrapper(deeplap_model)
example = torch.rand(1, 3, 224, 224)
# Use torch.jit.trace to generate a torch.jit.ScriptModule via tracing.
traced_script_module = torch.jit.trace(model, example)"")
Now, in C++ with LibTorch, I am trying to load the model and run data through it. This, however, fails:
std::shared_ptr<torch::jit::script::Module> module = torch::jit::load("");
assert(module != nullptr);
std::cout << "ok\n";
std::vector<torch::jit::IValue> inputs;
cv::Mat image;
image = cv::imread("pic.jpeg", 1);
cv::Mat image_resized;
cv::resize(image, image_resized, cv::Size(224, 224));
cv::cvtColor(image_resized, image_resized, cv::COLOR_BGR2RGB);
cv::Mat image_resized_float;
image_resized.convertTo(image_resized_float, CV_32F, 1.0 / 255);
auto img_tensor = torch::from_blob(, {1, 224, 224, 3}, torch::kFloat32);
cout << "img tensor loaded..\n";
img_tensor = img_tensor.permute({0, 3, 1, 2});
img_tensor[0][0] = img_tensor[0][0].sub(0.485).div(0.229);
img_tensor[0][1] = img_tensor[0][1].sub(0.456).div(0.224);
img_tensor[0][2] = img_tensor[0][2].sub(0.406).div(0.225);
// to GPU
img_tensor =;
torch::Tensor out_tensor2 = module->forward({img_tensor}).toTensor(); // SEGFAULT
Where am I going wrong here?


Use bounding box model in webcam

I have a model trained using the following code:
Then I saved the model using the following code:, 'checkpoint.pth')
How can I use the saved model with a webcam? I created the code below, but it only works for a classification model and not for a model that predicts a bounding box together with the class.
from PIL import Image
import time
#Load the saved model
# Create a neural net class
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch.nn
import torchvision
import PIL
import cv2
from torchvision import models
class BB_model(nn.Module):
    """ResNet-34 feature extractor with two 4-output heads.

    forward returns a tuple ``(classifier_output, second_head_output)``,
    which is why callers cannot treat the result as a single tensor.
    """

    def __init__(self):
        super(BB_model, self).__init__()
        resnet = models.resnet34(pretrained=True)
        # Keep the first eight child modules of the ResNet, split in two stages.
        layers = list(resnet.children())[:8]
        self.features1 = nn.Sequential(*layers[:6])
        self.features2 = nn.Sequential(*layers[6:])
        self.classifier = nn.Sequential(nn.BatchNorm1d(512), nn.Linear(512, 4))
        # NOTE(review): the attribute name of this second head was lost in the
        # original text; `bb` is an assumption and must match the key names
        # stored in the checkpoint - confirm before loading weights.
        self.bb = nn.Sequential(nn.BatchNorm1d(512), nn.Linear(512, 4))

    def forward(self, x):
        """Return the outputs of both heads for the batch ``x`` as a tuple."""
        x = self.features1(x)
        x = self.features2(x)
        x = F.relu(x)
        x = nn.AdaptiveAvgPool2d((1, 1))(x)
        # Flatten everything except the batch dimension.
        x = x.view(x.shape[0], -1)
        return self.classifier(x), self.bb(x)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") ##Assigning the Device which will do the calculation
model = BB_model()
model =
video = cv2.VideoCapture(0)
# used to record the time when we processed last frame
prev_frame_time = 0
# used to record the time at which we processed current frame
new_frame_time = 0
encoder = {0:"Standing"}
# Let's preprocess the inputted frame
data_transforms = torchvision.transforms.Compose([
    torchvision.transforms.Resize(size=(128, 128)),
    # Normalize works on tensors, so the image has to be converted after
    # resizing and before normalising.
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])
def argmax(prediction):
    """Turn a model prediction into a ``(label, score)`` pair.

    Args:
        prediction: either a tensor of class scores, or a tuple/list whose
            first element is that tensor (as returned by a model with more
            than one output head).

    Returns:
        A tuple ``(result, score)``: the ``encoder`` entry for the top class
        of the first sample, and the largest value in the score array
        formatted as a string.
    """
    # A multi-head model returns a tuple; only the class scores are needed
    # here, and a tuple has no .cpu() method.
    if isinstance(prediction, (tuple, list)):
        prediction = prediction[0]
    scores = prediction.cpu().detach().numpy()
    top_1 = np.argmax(scores, axis=1)
    score = '{:6f}'.format(np.amax(scores))
    result = encoder[top_1[0]]
    return result, score
def preprocess(image):
    """Convert a webcam frame into a batched float tensor.

    The frame arrives as a numpy array, is turned into a PIL image so that
    ``data_transforms`` can be applied, cast to float and given a leading
    batch dimension.
    """
    pil_frame = PIL.Image.fromarray(image)
    tensor = data_transforms(pil_frame).float()
    # Add the batch dimension at position 0.
    return tensor.unsqueeze(0)
while True:
_, frame =
image = frame[100:450, 150:570]
image_data = preprocess(image)
prediction = model(image_data)
result, score = argmax(prediction)
(50, 50),
font, 1,
(0, 0, 255),
# time when we finish processing for this frame
new_frame_time = time.time()
fps = 1 / (new_frame_time - prev_frame_time)
prev_frame_time = new_frame_time
f"FPS: {round(fps,1)}",
(50, 80),
font, 1,
(255, 255, 0),
cv2.imshow("Capturing", frame)
if key == ord('q'):
When I run the above code, I received the error stating that
prediction = prediction.cpu()
AttributeError: 'tuple' object has no attribute 'cpu'
Since my code was built for standard classification and not for bounding boxes, the argmax function does not work, and I need help with how to change it. Thank you.

Onnx RuntimeError NOT_IMPLEMENTED Trilu

This model works in PyTorch however, after exporting it with PyTorch to Onnx format, the onnx runtime crashes with a 'Trilu NOT_IMPLEMENTED error' when loading it in. (I do not have this issue for my other models that use torch.tril() )
How do I make this model run in the Onnxruntime?
This is a visualisation of the Onnx graph of the Model.
The Model in PyTorch
class MyModel(nn.Module):
    """Minimal reproducer: torch.tril applied directly to a boolean mask."""

    def __init__(self):
        super().__init__()

    def forward(self, item_seq):
        """Add the lower-triangular boolean mask to random attention scores."""
        # Boolean comparison result, passed to torch.tril without any cast.
        bool_mask = item_seq < 100
        lower = torch.tril(bool_mask)
        q = torch.rand((1, 2, 2, 32))
        k = torch.rand((1, 2, 32, 2))
        scores = torch.matmul(q, k)
        return scores + lower
model = MyModel()
x_train = torch.ones([1, 2], dtype=torch.long)
# demonstrate that eager works
bigmodel_onnx_filename = 'mymodel.onnx'
# Onnxruntime crashes when loading in the model
ort_sess = ort.InferenceSession(bigmodel_onnx_filename, providers=['CPUExecutionProvider'])
key = {'x': x_train.numpy()}
print(, key))
This results in the following error for ort.InferenceSession():
NotImplemented: [ONNXRuntimeError] : 9 : NOT_IMPLEMENTED : Could not find an implementation for Trilu(14) node with name '/net/Trilu'
How can I make this model run in the Onnxruntime?
[github: code to reproduce the error and the model.onnx file]
I'm using python 3.9, these are the project requirements
Torch nightly version 2.0.0.dev20230205 gave the same error
I then decided to implement my own tril function.
class MyModel(nn.Module):
    """Reproducer variant that replaces torch.tril with a hand-written mask."""

    def __init__(self):
        super().__init__()

    def forward(self, item_seq):
        """Add the hand-made lower-triangular mask to random attention scores."""
        bool_mask = item_seq < 100
        lower = self.my_tril(bool_mask)
        q = torch.rand((1, 2, 2, 32))
        k = torch.rand((1, 2, 32, 2))
        scores = torch.matmul(q, k)
        return scores + lower

    def my_tril(self, x):
        """Zero every entry of ``x`` whose column index exceeds its row index."""
        n = x.size(-1)
        idx = torch.arange(n)
        cols = idx.expand(n, n)      # cols[i, j] == j
        rows = idx.unsqueeze(-1)     # rows[i, 0] == i
        keep = torch.le(cols, rows)  # True where j <= i
        return x.masked_fill(keep == 0, 0)
but then I get a Where(9) node with name '/Where_1' NOT_IMPLEMENTED error. (?!)
Using the boolean output of the comparison as input for torch.tril() works in PyTorch's eager and JIT modes. However, it breaks the ONNX Runtime with the "Trilu NOT_IMPLEMENTED" error.
I was able to work around it by casting the torch.tril() input to float():
class MyModel(nn.Module):
    """Workaround variant: the mask is cast to float before torch.tril."""

    def __init__(self):
        super(MyModel, self).__init__()

    def forward(self, item_seq):
        """Add the lower-triangular float mask to random attention scores."""
        # Cast the boolean comparison to float so that tril never receives a
        # boolean tensor.
        attention_mask = torch.lt(item_seq, 100).float()
        tril_mask = torch.tril(attention_mask)
        query_layer = torch.rand((1, 2, 2, 32))
        key_layer = torch.rand((1, 2, 32, 2))
        attention_scores = torch.matmul(query_layer, key_layer)
        return attention_scores + tril_mask
Based on this experience, my hypothesis is that the TRILU NOT_IMPLEMENTED error is only applicable when having BOOLEAN Tensors as input. The Onnxruntime then throws this generic TRILU NOT_IMPLEMENTED error making me believe that Onnx has no TRILU support at all, which is clearly not the case.

make multiple parallel predictions on tensorflow model

I want to make multiple predictions.
I have trained a segmentation model (images and masks) . You can find the model here.
The images have dimensions (32,32,3). The masks (32, 32).
What I do when I want to run inference is:
Load the images array (tiles) with dim (62500, 32, 32, 3). You can find it here
Create tensorflow dataset from this array.
and then predict on each image, like:
masks = []
for k, element in enumerate(the_image_array):
the_img = np.asarray(np.expand_dims(element, 0))[-1, -1, :, :]
pred = model.predict(the_img[np.newaxis, :, :, :])[0]
mask = tf.where(pred > 0.5, 255, 0)
Now, I want to do these predictions in parallel.
So, I tried:
import tensorflow as tf
import numpy as np
import os
from tensorflow.keras.models import load_model
from itertools import chain
from tensorflow.keras import backend as K
import multiprocessing
from multiprocessing import Pool
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
multiprocessing.set_start_method('spawn', force=True)
model = load_model('./model.h5',
custom_objects={"K": K})
def resize_and_rescale(image):
    """Resize ``image`` to 32x32 (keeping the aspect ratio) and divide by 255."""
    image = tf.image.resize(image, (32, 32), preserve_aspect_ratio=True)
    image /= 255.0
    return image
def prepare(ds):
ds =
return ds
def _apply_df(data):
img = np.asarray(np.expand_dims(data, 0))[-1,-1, :, :]
pred = model.predict(img[np.newaxis, :, :, :], verbose=2)[0]
#pred = model.predict(data)[0]
mask = tf.where(pred[:, :, -1] > 0.5, 255, 0)
return mask
def apply_by_multiprocessing(data, workers):
pool = Pool(processes=workers)
#result =, np.array_split(list(data.as_numpy_iterator()), workers))
result =, data.batch(np.ceil(len(data) / workers)))
return list(result)
def after_prepare(data):
tens_data =
tens_data = prepare(tens_data)
return tens_data
def main():
tiles = np.load('tiles.npy')
prep = after_prepare(tiles)
masks = apply_by_multiprocessing(prep, workers=4)
masks_flatten = list(chain.from_iterable(masks))
print(len(masks_flatten), masks_flatten[0].shape) #
return masks_flatten
if __name__=="__main__":
masks_flatten = main()
The len(masks_flatten) is 128 and the shape of an element is (32,).
I would expect it to be len=62500 and every element (mask) (32, 32).
--- UPDATE ---
So, I want something like this:
def _apply_df(data):
    """Predict a 0/255 mask for every element of ``data``, one at a time.

    Returns:
        A list with one mask per element, in input order.
    """
    results = []
    for el in data:
        # Add a leading batch axis for predict, then drop it again with [0].
        pred = model.predict(el[np.newaxis, :, :, :], verbose=2)[0]
        mask = tf.where(pred[:, :, -1] > 0.5, 255, 0)
        # Without this the function always returned an empty list.
        results.append(mask)
    return results
but without using the loop. Doing it in parallel.
Your approach is not incorrect, but even inside a single worker, it's better to let the TensorFlow/NumPy vectorization do its job instead of writing an explicit for loop:
def _apply_df(data):
pred = model.predict(data)
mask = tf.where(pred.squeeze(axis=-1) > 0.5, 255, 0)
return mask
This is the complete code:
import tensorflow as tf
import numpy as np
import os
from tensorflow.keras.models import load_model
from itertools import chain
from tensorflow.keras import backend as K
import multiprocessing
from multiprocessing import Pool
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
multiprocessing.set_start_method("spawn", force=True)
model = load_model("./model.h5", custom_objects={"K": K})
def resize_and_rescale(image):
image = tf.image.resize(image, (32, 32), preserve_aspect_ratio=True)
image /= 255.0
return image
def prepare(ds):
ds =
return ds
def _apply_df(data):
pred = model.predict(data)
mask = tf.where(pred.squeeze(axis=-1) > 0.5, 255, 0)
return mask
def apply_by_multiprocessing(data, workers):
pool = Pool(processes=workers)
# result =, np.array_split(list(data.as_numpy_iterator()), workers))
result =, data.batch(np.ceil(len(data) / workers)))
return list(result)
def after_prepare(data):
tens_data =
tens_data = prepare(tens_data)
return tens_data
def main():
tiles = np.load("tiles.npy")
prep = after_prepare(tiles)
masks = apply_by_multiprocessing(prep, workers=4)
masks_flatten = list(chain.from_iterable(masks))
print(len(masks_flatten), masks_flatten[0].shape) # 62500 (32, 32)
return masks_flatten
if __name__ == "__main__":
masks_flatten = main()

torch to onnx, but missing input?

I'm trying to convert the pytorch model to onnx. First I have a demo, but different numeric inputs result in different onnx models.
import torch
from torch import nn
from openvino.runtime import Core
class TestNet(nn.Module):
    """Return the row minimum of the input, or zeros when it is below a threshold."""

    def __init__(self, x):
        super().__init__()
        # Threshold that the row minimum is compared against in forward.
        self.x = x

    def forward(self, x: torch.Tensor):
        """Return zeros of shape (1, 1) if min(x, dim=1) < self.x, else that minimum."""
        row_min = x.min(dim=1, keepdim=True)[0]
        # Python-level branch on a tensor value.
        if row_min < self.x:
            return torch.zeros((1, 1))
        return row_min
dummy_input = torch.zeros((1, 5))
model = TestNet(1)
onnx_path = "./test_net.onnx"
ie = Core()
model_onnx = ie.read_model(model=onnx_path)
compiled_model_onnx = ie.compile_model(model=model_onnx, device_name="CPU")
output_layer_onnx = compiled_model_onnx.output(0)
res_onnx1 = compiled_model_onnx([torch.zeros((1, 5)) + 10])[output_layer_onnx]
res_onnx2 = compiled_model_onnx([torch.zeros((1, 5))])[output_layer_onnx]
print(res_onnx1, res_onnx2)
It could have worked, but the exported ONNX model has no input node; the model is as follows:
<Model: 'torch-jit-export'
<ConstOutput: names[y] shape{1,1} type: f32>
But if I use dummy_input = torch.zeros((1, 5)) + 1, the ONNX model has an input node, and the output is as follows:
[[10.]] [[0.]]
The code is the same, but dummy_input is different. I don't know why. By the way, torch.where is not what I need.

get InvalidArgumentError when using tf.image.resize_bilinear in Keras with multi-gpu environment

I use tf.image.resize_bilinear in a segmentation network. It seems this function is not supported by the multi-GPU model. The following code shows the simplified situation (it can be run directly):
import os
os.environ["CUDA_VISIBLE_DEVICES"] = '0, 1'
from keras.backend.tensorflow_backend import set_session
from keras import backend as K
from keras.utils import multi_gpu_model
from keras.applications.mobilenet_v2 import preprocess_input
import tensorflow as tf
import numpy as np
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.allow_soft_placement = True
sess = tf.Session(config=config)
batch = 4
num_classes = 2
size = 128
def _GetRandomImg():
    """Return one batch of random integer images passed through preprocess_input."""
    # Values are drawn from [0, 256) with shape (batch, size, size, 3).
    pixels = np.random.randint(low=0, high=256, size=(batch, size, size, 3))
    return preprocess_input(pixels)
def _GetRandomLabel():
    """Return one batch of random labels normalised to sum to 1 on the last axis."""
    dims = (batch, size, size, num_classes)
    raw = np.random.randint(low=0, high=num_classes, size=dims)
    # Exponentiate, then normalise over the class axis.
    weights = np.exp(raw)
    return weights / np.sum(weights, axis=-1, keepdims=True)
def DataGen():
    """Yield an endless stream of random ``(images, labels)`` batches."""
    while True:
        # The image batch is generated before the label batch.
        images = _GetRandomImg()
        labels = _GetRandomLabel()
        yield images, labels
from keras.layers import Input, Conv2D, Lambda
from keras import Model
def GetModel():
    """Build the test network: a bilinear-resize Lambda followed by one Conv2D.

    Returns:
        A Keras ``Model`` mapping ``(size, size, 3)`` images to
        ``num_classes`` output channels.
    """
    inputs = Input(shape=(size, size, 3))
    # The function must resize its own argument. The previous version resized
    # the captured `inputs` tensor, so the layer ignored whatever tensor it was
    # called on (presumably what breaks the per-GPU batch slices - TODO confirm).
    f = lambda x: tf.image.resize_bilinear(x, (size, size), align_corners=True)
    x = Lambda(f, output_shape=(size, size, 3))(inputs)
    outputs = Conv2D(num_classes, kernel_size=3, padding='same')(x)
    model = Model(inputs=[inputs], outputs=[outputs])
    return model
gen = DataGen()
with tf.device('/cpu:0'):
model = GetModel()
model = multi_gpu_model(model, gpus=2)
model.compile(loss='categorical_crossentropy', optimizer='sgd')
result = model.fit_generator(gen, epochs=2, verbose = 1, steps_per_epoch = 100)
It works fine in a single-GPU environment, but in a multi-GPU environment I got the following error:
InvalidArgumentError: Incompatible shapes: [3,128,128,2] vs. [6,128,128,2]
[[{{node loss/conv2d_1_loss/categorical_crossentropy/mul}}]]
[[{{node training/SGD/gradients/conv2d_1_1/concat_grad/Slice_1}}]]
The problem is solved. If a TensorFlow function is used in a custom Lambda layer, you need to call set_shape() explicitly:
def MyResizeBilinear(x, height, width):
    """Bilinearly resize ``x`` to ``(height, width)`` and set its static shape.

    Args:
        x: tensor to resize; axes 1 and 2 are treated as rows and columns.
        height: target number of rows.
        width: target number of columns.

    Returns:
        The resized tensor with its static shape set to
        ``(None, new_height, new_width, None)``.
    """
    rows, cols = 1, 2
    original_shape = K.int_shape(x)
    new_shape = tf.constant(np.array([height, width], dtype='int32'))
    x = tf.image.resize_bilinear(x, new_shape, align_corners=True)
    # A spatial axis stays unknown if it was unknown in the input.
    new_height = None if original_shape[rows] is None else height
    new_width = None if original_shape[cols] is None else width
    output_shape = (None, new_height, new_width, None)
    # output_shape was computed but never applied; set it on the tensor.
    x.set_shape(output_shape)
    return x
