torchvision MNIST HTTPError: HTTP Error 403: Forbidden - pytorch

I am trying to replicate this experiment presented in this webpage https://adversarial-ml-tutorial.org/adversarial_examples/
I got the jupyter notebook and loaded in my localhost and open it using Jupiter notebook. When I run the following code to get the dataset using the following code:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
mnist_train = datasets.MNIST("../data", train=True, download=True, transform=transforms.ToTensor())
mnist_test = datasets.MNIST("../data", train=False, download=True, transform=transforms.ToTensor())
train_loader = DataLoader(mnist_train, batch_size = 100, shuffle=True)
test_loader = DataLoader(mnist_test, batch_size = 100, shuffle=False)
and I get the following error:
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ../data\MNIST\raw\train-images-idx3-ubyte.gz
0/? [00:00<?, ?it/s]
---------------------------------------------------------------------------
HTTPError Traceback (most recent call last)
<ipython-input-15-e6f62798f426> in <module>
2 from torch.utils.data import DataLoader
3
----> 4 mnist_train = datasets.MNIST("../data", train=True, download=True, transform=transforms.ToTensor())
5 mnist_test = datasets.MNIST("../data", train=False, download=True, transform=transforms.ToTensor())
6 train_loader = DataLoader(mnist_train, batch_size = 100, shuffle=True)
~\Anaconda3\lib\site-packages\torchvision\datasets\mnist.py in __init__(self, root, train, transform, target_transform, download)
77
78 if download:
---> 79 self.download()
80
81 if not self._check_exists():
~\Anaconda3\lib\site-packages\torchvision\datasets\mnist.py in download(self)
144 for url, md5 in self.resources:
145 filename = url.rpartition('/')[2]
--> 146 download_and_extract_archive(url, download_root=self.raw_folder, filename=filename, md5=md5)
147
148 # process and save as torch files
~\Anaconda3\lib\site-packages\torchvision\datasets\utils.py in download_and_extract_archive(url, download_root, extract_root, filename, md5, remove_finished)
254 filename = os.path.basename(url)
255
--> 256 download_url(url, download_root, filename, md5)
257
258 archive = os.path.join(download_root, filename)
~\Anaconda3\lib\site-packages\torchvision\datasets\utils.py in download_url(url, root, filename, md5)
82 )
83 else:
---> 84 raise e
85 # check integrity of downloaded file
86 if not check_integrity(fpath, md5):
~\Anaconda3\lib\site-packages\torchvision\datasets\utils.py in download_url(url, root, filename, md5)
70 urllib.request.urlretrieve(
71 url, fpath,
---> 72 reporthook=gen_bar_updater()
73 )
74 except (urllib.error.URLError, IOError) as e: # type: ignore[attr-defined]
~\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
245 url_type, path = splittype(url)
246
--> 247 with contextlib.closing(urlopen(url, data)) as fp:
248 headers = fp.info()
249
~\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
~\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
529 for processor in self.process_response.get(protocol, []):
530 meth = getattr(processor, meth_name)
--> 531 response = meth(req, response)
532
533 return response
~\Anaconda3\lib\urllib\request.py in http_response(self, request, response)
639 if not (200 <= code < 300):
640 response = self.parent.error(
--> 641 'http', request, response, code, msg, hdrs)
642
643 return response
~\Anaconda3\lib\urllib\request.py in error(self, proto, *args)
567 if http_err:
568 args = (dict, 'default', 'http_error_default') + orig_args
--> 569 return self._call_chain(*args)
570
571 # XXX probably also want an abstract factory that knows when it makes
~\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
501 for handler in handlers:
502 func = getattr(handler, meth_name)
--> 503 result = func(*args)
504 if result is not None:
505 return result
~\Anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
647 class HTTPDefaultErrorHandler(BaseHandler):
648 def http_error_default(self, req, fp, code, msg, hdrs):
--> 649 raise HTTPError(req.full_url, code, msg, hdrs, fp)
650
651 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 403: Forbidden
Any help solving this issue is much appreciated.
I also can download the dataset directly from the link but then I don't know how to use that!

Yes it's a known bug: https://github.com/pytorch/vision/issues/3500
The possible solution can be to patch MNIST download method.
But it requires wget to be installed.
For Linux:
sudo apt install wget
For Windows:
choco install wget
import os
import subprocess as sp
from torchvision.datasets.mnist import MNIST, read_image_file, read_label_file
from torchvision.datasets.utils import extract_archive
def patched_download(self):
"""wget patched download method.
"""
if self._check_exists():
return
os.makedirs(self.raw_folder, exist_ok=True)
os.makedirs(self.processed_folder, exist_ok=True)
# download files
for url, md5 in self.resources:
filename = url.rpartition('/')[2]
download_root = os.path.expanduser(self.raw_folder)
extract_root = None
remove_finished = False
if extract_root is None:
extract_root = download_root
if not filename:
filename = os.path.basename(url)
# Use wget to download archives
sp.run(["wget", url, "-P", download_root])
archive = os.path.join(download_root, filename)
print("Extracting {} to {}".format(archive, extract_root))
extract_archive(archive, extract_root, remove_finished)
# process and save as torch files
print('Processing...')
training_set = (
read_image_file(os.path.join(self.raw_folder, 'train-images-idx3-ubyte')),
read_label_file(os.path.join(self.raw_folder, 'train-labels-idx1-ubyte'))
)
test_set = (
read_image_file(os.path.join(self.raw_folder, 't10k-images-idx3-ubyte')),
read_label_file(os.path.join(self.raw_folder, 't10k-labels-idx1-ubyte'))
)
with open(os.path.join(self.processed_folder, self.training_file), 'wb') as f:
torch.save(training_set, f)
with open(os.path.join(self.processed_folder, self.test_file), 'wb') as f:
torch.save(test_set, f)
print('Done!')
MNIST.download = patched_download
mnist_train = MNIST("../data", train=True, download=True, transform=transforms.ToTensor())
mnist_test = MNIST("../data", train=False, download=True, transform=transforms.ToTensor())
train_loader = DataLoader(mnist_train, batch_size=1, shuffle=True)
test_loader = DataLoader(mnist_test, batch_size=1, shuffle=False)

Related

Colab IndexError: Target 255 is out of bounds

I'm trying to perform an image semantic segmentation (segment mining fields) using lightning-flash. My images are all RGB/uint8/512x512 and the masks are L/uint8/512x512.
When I run the code, I get an error when fitting.
My code is this one:
import torch
import flash
from flash.image import SemanticSegmentation, SemanticSegmentationData
import os
from google.colab import drive
import ssl
drive.mount("/content/drive")
DATA_DIR = '/content/drive/MyDrive/data/'
x_train_dir = os.path.join(DATA_DIR, 'train_images')
y_train_dir = os.path.join(DATA_DIR, 'train_masks')
x_valid_dir = os.path.join(DATA_DIR, 'val_images')
y_valid_dir = os.path.join(DATA_DIR, 'val_masks')
x_test_dir = os.path.join(DATA_DIR, 'test_images')
y_test_dir = os.path.join(DATA_DIR, 'test_masks')
datamodule = SemanticSegmentationData.from_folders(
train_folder=x_train_dir,
train_target_folder=y_train_dir,
val_folder=x_valid_dir,
val_target_folder=y_valid_dir,
test_folder=x_test_dir,
test_target_folder=y_test_dir,
transform_kwargs=dict(image_size=(256, 256)),
num_classes=1,
batch_size=16,
)
#avoid ssl error
ssl._create_default_https_context = ssl._create_unverified_context
model = SemanticSegmentation(
head="unetplusplus",
backbone="densenet169",
pretrained="imagenet",
num_classes=datamodule.num_classes
)
GPUS = torch.cuda.device_count()
if GPUS > 0:
trainer = flash.Trainer(max_epochs=2, gpus=torch.cuda.device_count())
else:
trainer = flash.Trainer(max_epochs=2)
trainer.finetune(model, datamodule=datamodule, strategy="freeze")
trainer.save_checkpoint("semantic_segmentation_model.pt")
When I run the code, I get this error:
IndexError Traceback (most recent call last)
<ipython-input-7-11e2ce087ca0> in <module>
6
7 #trainer.fit(model, datamodule=datamodule)
----> 8 trainer.finetune(model, datamodule=datamodule, strategy="freeze")
9 trainer.save_checkpoint("semantic_segmentation_model.pt")
19 frames
/usr/local/lib/python3.7/dist-packages/flash/core/trainer.py in finetune(self, model, train_dataloader, val_dataloaders, datamodule, strategy, train_bn)
162 """
163 self._resolve_callbacks(model, strategy, train_bn=train_bn)
--> 164 return super().fit(model, train_dataloader, val_dataloaders, datamodule)
165
166 def predict(
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
695 self.strategy.model = model
696 self._call_and_handle_interrupt(
--> 697 self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
698 )
699
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _call_and_handle_interrupt(self, trainer_fn, *args, **kwargs)
648 return self.strategy.launcher.launch(trainer_fn, *args, trainer=self, **kwargs)
649 else:
--> 650 return trainer_fn(*args, **kwargs)
651 # TODO(awaelchli): Unify both exceptions below, where `KeyboardError` doesn't re-raise
652 except KeyboardInterrupt as exception:
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
735 ckpt_path, model_provided=True, model_connected=self.lightning_module is not None
736 )
--> 737 results = self._run(model, ckpt_path=self.ckpt_path)
738
739 assert self.state.stopped
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _run(self, model, ckpt_path)
1166 self._checkpoint_connector.resume_end()
1167
-> 1168 results = self._run_stage()
1169
1170 log.detail(f"{self.__class__.__name__}: trainer tearing down")
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _run_stage(self)
1252 if self.predicting:
1253 return self._run_predict()
-> 1254 return self._run_train()
1255
1256 def _pre_training_routine(self):
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _run_train(self)
1274
1275 with isolate_rng():
-> 1276 self._run_sanity_check()
1277
1278 # enable train mode
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _run_sanity_check(self)
1343 # run eval step
1344 with torch.no_grad():
-> 1345 val_loop.run()
1346
1347 self._call_callback_hooks("on_sanity_check_end")
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/loop.py in run(self, *args, **kwargs)
198 try:
199 self.on_advance_start(*args, **kwargs)
--> 200 self.advance(*args, **kwargs)
201 self.on_advance_end()
202 self._restarting = False
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py in advance(self, *args, **kwargs)
153 if self.num_dataloaders > 1:
154 kwargs["dataloader_idx"] = dataloader_idx
--> 155 dl_outputs = self.epoch_loop.run(self._data_fetcher, dl_max_batches, kwargs)
156
157 # store batch level output per dataloader
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/loop.py in run(self, *args, **kwargs)
198 try:
199 self.on_advance_start(*args, **kwargs)
--> 200 self.advance(*args, **kwargs)
201 self.on_advance_end()
202 self._restarting = False
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py in advance(self, data_fetcher, dl_max_batches, kwargs)
141
142 # lightning module methods
--> 143 output = self._evaluation_step(**kwargs)
144 output = self._evaluation_step_end(output)
145
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py in _evaluation_step(self, **kwargs)
238 """
239 hook_name = "test_step" if self.trainer.testing else "validation_step"
--> 240 output = self.trainer._call_strategy_hook(hook_name, *kwargs.values())
241
242 return output
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _call_strategy_hook(self, hook_name, *args, **kwargs)
1704
1705 with self.profiler.profile(f"[Strategy]{self.strategy.__class__.__name__}.{hook_name}"):
-> 1706 output = fn(*args, **kwargs)
1707
1708 # restore current_fx when nested context
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/strategies/strategy.py in validation_step(self, *args, **kwargs)
368 with self.precision_plugin.val_step_context():
369 assert isinstance(self.model, ValidationStep)
--> 370 return self.model.validation_step(*args, **kwargs)
371
372 def test_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]:
/usr/local/lib/python3.7/dist-packages/flash/image/segmentation/model.py in validation_step(self, batch, batch_idx)
151 def validation_step(self, batch: Any, batch_idx: int) -> Any:
152 batch = (batch[DataKeys.INPUT], batch[DataKeys.TARGET])
--> 153 return super().validation_step(batch, batch_idx)
154
155 def test_step(self, batch: Any, batch_idx: int) -> Any:
/usr/local/lib/python3.7/dist-packages/flash/core/model.py in validation_step(self, batch, batch_idx)
423
424 def validation_step(self, batch: Any, batch_idx: int) -> None:
--> 425 output = self.step(batch, batch_idx, self.val_metrics)
426 log_kwargs = {"batch_size": output.get(OutputKeys.BATCH_SIZE, None)} if _PL_GREATER_EQUAL_1_5_0 else {}
427 self.log_dict(
/usr/local/lib/python3.7/dist-packages/flash/core/model.py in step(self, batch, batch_idx, metrics)
360 output = {OutputKeys.OUTPUT: y_hat}
361 y_hat = self.to_loss_format(output[OutputKeys.OUTPUT])
--> 362 losses = {name: l_fn(y_hat, y) for name, l_fn in self.loss_fn.items()}
363
364 y_hat = self.to_metrics_format(output[OutputKeys.OUTPUT])
/usr/local/lib/python3.7/dist-packages/flash/core/model.py in <dictcomp>(.0)
360 output = {OutputKeys.OUTPUT: y_hat}
361 y_hat = self.to_loss_format(output[OutputKeys.OUTPUT])
--> 362 losses = {name: l_fn(y_hat, y) for name, l_fn in self.loss_fn.items()}
363
364 y_hat = self.to_metrics_format(output[OutputKeys.OUTPUT])
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)
3012 if size_average is not None or reduce is not None:
3013 reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 3014 return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
3015
3016
IndexError: Target 255 is out of bounds.
How can I solve this problem? I researched others issues on stackoverflow and they were all related to the number of classes. But in my case, I only want to segment mining fields.

How to load a torchvision model from disk?

I'm trying to follow the solution from the top answer here to load an object detection model from the .pth file.
os.environ['TORCH_HOME'] = '../input/torchvision-fasterrcnn-resnet-50/' #setting the environment variable
model = detection.fasterrcnn_resnet50_fpn(pretrained=False).to(DEVICE)
I get the following error
NotADirectoryError: [Errno 20] Not a directory: '../input/torchvision-fasterrcnn-resnet-50/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth/hub'
google did not reveal an answer to the error and I don't exactly know what it means except for the obvious (that folder 'hub' is missing).
Do I have to unpack or create a folder?
I have tried loading the weights but I get the same error message.
this is how I load the model
model = detection.fasterrcnn_resnet50_fpn(pretrained=True)
checkpoint = torch.load('../input/torchvision-fasterrcnn-resnet-50/model.pth.tar')
model.load_state_dict(checkpoint['state_dict'])
thank you for your help!
Full Error Trace:
gaierror: [Errno -3] Temporary failure in name resolution
During handling of the above exception, another exception occurred:
URLError Traceback (most recent call last)
/tmp/ipykernel_42/1218627017.py in <module>
1 # to load
----> 2 model = detection.fasterrcnn_resnet50_fpn(pretrained=True)
3 checkpoint = torch.load('../input/torchvision-fasterrcnn-resnet-50/model.pth.tar')
4 model.load_state_dict(checkpoint['state_dict'])
/opt/conda/lib/python3.7/site-packages/torchvision/models/detection/faster_rcnn.py in fasterrcnn_resnet50_fpn(pretrained, progress, num_classes, pretrained_backbone, trainable_backbone_layers, **kwargs)
360 if pretrained:
361 state_dict = load_state_dict_from_url(model_urls['fasterrcnn_resnet50_fpn_coco'],
--> 362 progress=progress)
363 model.load_state_dict(state_dict)
364 return model
/opt/conda/lib/python3.7/site-packages/torch/hub.py in load_state_dict_from_url(url, model_dir, map_location, progress, check_hash, file_name)
553 r = HASH_REGEX.search(filename) # r is Optional[Match[str]]
554 hash_prefix = r.group(1) if r else None
--> 555 download_url_to_file(url, cached_file, hash_prefix, progress=progress)
556
557 if _is_legacy_zip_format(cached_file):
/opt/conda/lib/python3.7/site-packages/torch/hub.py in download_url_to_file(url, dst, hash_prefix, progress)
423 # certificates in older Python
424 req = Request(url, headers={"User-Agent": "torch.hub"})
--> 425 u = urlopen(req)
426 meta = u.info()
427 if hasattr(meta, 'getheaders'):
/opt/conda/lib/python3.7/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
220 else:
221 opener = _opener
--> 222 return opener.open(url, data, timeout)
223
224 def install_opener(opener):
/opt/conda/lib/python3.7/urllib/request.py in open(self, fullurl, data, timeout)
523 req = meth(req)
524
--> 525 response = self._open(req, data)
526
527 # post-process response
/opt/conda/lib/python3.7/urllib/request.py in _open(self, req, data)
541 protocol = req.type
542 result = self._call_chain(self.handle_open, protocol, protocol +
--> 543 '_open', req)
544 if result:
545 return result
/opt/conda/lib/python3.7/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
501 for handler in handlers:
502 func = getattr(handler, meth_name)
--> 503 result = func(*args)
504 if result is not None:
505 return result
/opt/conda/lib/python3.7/urllib/request.py in https_open(self, req)
1391 def https_open(self, req):
1392 return self.do_open(http.client.HTTPSConnection, req,
-> 1393 context=self._context, check_hostname=self._check_hostname)
1394
1395 https_request = AbstractHTTPHandler.do_request_
/opt/conda/lib/python3.7/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
1350 encode_chunked=req.has_header('Transfer-encoding'))
1351 except OSError as err: # timeout error
-> 1352 raise URLError(err)
1353 r = h.getresponse()
1354 except:
URLError: <urlopen error [Errno -3] Temporary failure in name resolution>
If you are loading a pretrained network, you don't need to load the model from torchvision pretrained (as in pretrained by torchvision on ImageNet using pretrained=True). You have two options:
Either set pretrained=False and load you weights using:
checkpoint = torch.load('../input/torchvision-fasterrcnn-resnet-50/model.pth.tar')
model.load_state_dict(checkpoint['state_dict'])
Or if you decide to change TORCH_HOME (which is not ideal) you need to keep the same directory structure Torchvision has which would be:
inputs/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
In practice, you wouldn't change TORCH_HOME just to load one model.
I found the solution digging deep into github, to the problem, which is a little hidden.
detection.()
has a default argument besides pretrained, it's called pretrained_backbone which by default is set to true, which if True sets the models to download from a dictionary path of urls.
this will work:
detection.fasterrcnn_resnet50_fpn(pretrained=False, pretrained_backbone = False, num_classes = 91).
then load the model as usual.
num_classes is expected, in the docs it's a default = 91 but in github i saw it as None, which is why I added it here for saftey.

Bert forward reports error on GPU; but runs fine on CPU

I would like to encode articles using Bert. My code runs fine on CPU, but failed on GPU. The GPU is a on a remote server.
My strategy is to tokenize the articles using CPU first. Then move both model and tokens to GPU. I am not sure if this is a good approach. Any suggestion of improvements are welcomed. Thank you in advance.
def assign_gpu(token):
token_tensor = token['input_ids'].to('cuda')
token_typeid = token['token_type_ids'].to('cuda')
attention_mask = token['attention_mask'].to('cuda')
output = {'input_ids': token_tensor,
'token_type_ids': token_typeid,
'attention_mask': attention_mask}
return output
bs = 16
data_dl = DataLoader(PatentDataset(df), batch_size=bs, shuffle=False)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased/')
model = BertModel.from_pretrained('bert-base-uncased/')
inputs_list = []
for a in data_dl:
inputs_list.append(tokenizer.batch_encode_plus(a, pad_to_max_length=True, return_tensors='pt'))
# GPU part
model.cuda()
model.eval()
out_list = []
with torch.no_grad():
for i, inputs in enumerate(inputs_list):
inputs = assign_gpu(inputs)
output = model(**inputs)[0][:, 0, :]
out_list.append(output)
The error message is
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-3-f6979d077f60> in <module>
6 for i, inputs in enumerate(inputs_list):
7 inputs = assign_gpu(inputs)
----> 8 output = model(**inputs)[0][:, 0, :]
9 out_list.append(output)
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
530 result = self._slow_forward(*input, **kwargs)
531 else:
--> 532 result = self.forward(*input, **kwargs)
533 for hook in self._forward_hooks.values():
534 hook_result = hook(self, input, result)
~/volta_pypkg/lib/python3.6/site-packages/transformers/modeling_bert.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, output_attentions, output_hidden_states)
751
752 embedding_output = self.embeddings(
--> 753 input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
754 )
755 encoder_outputs = self.encoder(
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
530 result = self._slow_forward(*input, **kwargs)
531 else:
--> 532 result = self.forward(*input, **kwargs)
533 for hook in self._forward_hooks.values():
534 hook_result = hook(self, input, result)
~/volta_pypkg/lib/python3.6/site-packages/transformers/modeling_bert.py in forward(self, input_ids, token_type_ids, position_ids, inputs_embeds)
180 token_type_embeddings = self.token_type_embeddings(token_type_ids)
181
--> 182 embeddings = inputs_embeds + position_embeddings + token_type_embeddings
183 embeddings = self.LayerNorm(embeddings)
184 embeddings = self.dropout(embeddings)
RuntimeError: CUDA error: device-side assert triggered
Another issue is that if I rerun the GPU for loop on a new cell in the interactive Jupyter notebook, it reports error on the assign_gpu part. I am not sure if it's caused by my remote server or some error in my code.
inputs = assign_gpu(inputs_list[0])
output = model(**inputs)
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-5-ec07b46d1a63> in <module>
----> 1 inputs = assign_gpu(inputs_list[0])
2 output = model(**inputs)
<ipython-input-1-cb4bdd793d7f> in assign_gpu(token)
29
30 def assign_gpu(token):
---> 31 token_tensor = token['input_ids'].to('cuda')
32 token_typeid = token['token_type_ids'].to('cuda')
33 attention_mask = token['attention_mask'].to('cuda')
RuntimeError: CUDA error: device-side assert triggered

Error using tfds.load on Tensorflow Dataset

I was wondering if tensorflow 2.2 dataset has an issue on Windows release.
Here is my diagnostic code
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.config.experimental.list_physical_devices("GPU") else "NOT AVAILABLE")
Version: 2.2.0
Eager mode: True
Hub version: 0.8.0
GPU is available
I can load the list of datasets
tfds.list_builders()
['abstract_reasoning',
'aeslc',
'aflw2k3d',
'amazon_us_reviews',
'anli',
.
.
.
'xnli',
'xsum',
'yelp_polarity_reviews']
However, I am unable to load any dataset
imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)
I receive the following errors
---------------------------------------------------------------------------
UnimplementedError Traceback (most recent call last)
c:\python37\lib\site-packages\tensorflow_datasets\core\utils\py_utils.py in try_reraise(*args, **kwargs)
398 try:
--> 399 yield
400 except Exception: # pylint: disable=broad-except
c:\python37\lib\site-packages\tensorflow_datasets\core\registered.py in builder(name, **builder_init_kwargs)
243 prefix="Failed to construct dataset {}".format(name)):
--> 244 return builder_cls(name)(**builder_kwargs)
245
c:\python37\lib\site-packages\wrapt\wrappers.py in __call__(self, *args, **kwargs)
602 return self._self_wrapper(self.__wrapped__, self._self_instance,
--> 603 args, kwargs)
604
c:\python37\lib\site-packages\tensorflow_datasets\core\api_utils.py in disallow_positional_args_dec(fn, instance, args, kwargs)
68 _check_required(fn, kwargs)
---> 69 return fn(*args, **kwargs)
70
c:\python37\lib\site-packages\tensorflow_datasets\core\dataset_builder.py in __init__(self, data_dir, config, version)
205 else: # Use the code version (do not restore data)
--> 206 self.info.initialize_from_bucket()
207
c:\python37\lib\site-packages\tensorflow_datasets\core\dataset_info.py in initialize_from_bucket(self)
422 tmp_dir = tempfile.mkdtemp("tfds")
--> 423 data_files = gcs_utils.gcs_dataset_info_files(self.full_name)
424 if not data_files:
c:\python37\lib\site-packages\tensorflow_datasets\core\utils\gcs_utils.py in gcs_dataset_info_files(dataset_dir)
69 """Return paths to GCS files in the given dataset directory."""
---> 70 return gcs_listdir(posixpath.join(GCS_DATASET_INFO_DIR, dataset_dir))
71
c:\python37\lib\site-packages\tensorflow_datasets\core\utils\gcs_utils.py in gcs_listdir(dir_name)
62 root_dir = gcs_path(dir_name)
---> 63 if _is_gcs_disabled or not tf.io.gfile.exists(root_dir):
64 return None
c:\python37\lib\site-packages\tensorflow\python\lib\io\file_io.py in file_exists_v2(path)
266 try:
--> 267 _pywrap_file_io.FileExists(compat.as_bytes(path))
268 except errors.NotFoundError:
UnimplementedError: File system scheme 'gs' not implemented (file: 'gs://tfds-data/dataset_info/imdb_reviews/plain_text/1.0.0')
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-36-06930b64f980> in <module>
1 #tfds.list_builders()
----> 2 imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)
c:\python37\lib\site-packages\wrapt\wrappers.py in __call__(self, *args, **kwargs)
562
563 return self._self_wrapper(self.__wrapped__, self._self_instance,
--> 564 args, kwargs)
565
566 class BoundFunctionWrapper(_FunctionWrapperBase):
c:\python37\lib\site-packages\tensorflow_datasets\core\api_utils.py in disallow_positional_args_dec(fn, instance, args, kwargs)
67 _check_no_positional(fn, args, ismethod, allowed=allowed)
68 _check_required(fn, kwargs)
---> 69 return fn(*args, **kwargs)
70
71 return disallow_positional_args_dec(wrapped) # pylint: disable=no-value-for-parameter
c:\python37\lib\site-packages\tensorflow_datasets\core\registered.py in load(name, split, data_dir, batch_size, shuffle_files, download, as_supervised, decoders, read_config, with_info, builder_kwargs, download_and_prepare_kwargs, as_dataset_kwargs, try_gcs)
366 data_dir = constants.DATA_DIR
367
--> 368 dbuilder = builder(name, data_dir=data_dir, **builder_kwargs)
369 if download:
370 download_and_prepare_kwargs = download_and_prepare_kwargs or {}
c:\python37\lib\site-packages\tensorflow_datasets\core\registered.py in builder(name, **builder_init_kwargs)
242 with py_utils.try_reraise(
243 prefix="Failed to construct dataset {}".format(name)):
--> 244 return builder_cls(name)(**builder_kwargs)
245
246
c:\python37\lib\contextlib.py in __exit__(self, type, value, traceback)
128 value = type()
129 try:
--> 130 self.gen.throw(type, value, traceback)
131 except StopIteration as exc:
132 # Suppress StopIteration *unless* it's the same exception that
c:\python37\lib\site-packages\tensorflow_datasets\core\utils\py_utils.py in try_reraise(*args, **kwargs)
399 yield
400 except Exception: # pylint: disable=broad-except
--> 401 reraise(*args, **kwargs)
402
403
c:\python37\lib\site-packages\tensorflow_datasets\core\utils\py_utils.py in reraise(prefix, suffix)
390 suffix = '\n' + suffix if suffix else ''
391 msg = prefix + str(exc_value) + suffix
--> 392 six.reraise(exc_type, exc_type(msg), exc_traceback)
393
394
TypeError: __init__() missing 2 required positional arguments: 'op' and 'message'
Is the library broken? As mentioned, I am on Windows 10 machine and using Jupyter Lab.
After I reported the issue on GitHub, the problem was fixed in version 3.2.1.

keras model.save() issues RuntimeError: Unable to flush file's cached information

I'm facing some issues with my databricks cluster configuration, and issue is that i'm not able to put a finger on where and why.
I was trying to save a keras model, and it seems to be not going well
dataset = pd.DataFrame([item.split(',') for item in '''6,148,72,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
8,183,64,0,0,23.3,0.672,32,1
1,89,66,23,94,28.1,0.167,21,0
0,137,40,35,168,43.1,2.288,33,1'''.split('\n')])
X = dataset.iloc[:,0:8]
y = dataset.iloc[:,8]
model = Sequential()
model.add(Dense(12, input_dim=8, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=3, batch_size=10)
accuracy = model.evaluate(X, y, verbose=0)
print(accuracy)
The issue is with saving the model, can anyone help me understand what the error is all about
I'm using Python 3.7.3, DBRuntime 6.2 (includes Apache Spark 2.4.4, Scala 2.11)
model.save('/dbfs/FileStore/tables/temp/new_model.h5')
KeyError Traceback (most recent call
last)
/databricks/python/lib/python3.7/site-packages/keras/engine/saving.py
in save_model(model, filepath, overwrite, include_optimizer)
540 with H5Dict(filepath, mode='w') as h5dict:
--> 541 _serialize_model(model, h5dict, include_optimizer)
542 elif hasattr(filepath, 'write') and callable(filepath.write):
/databricks/python/lib/python3.7/site-packages/keras/engine/saving.py
in _serialize_model(model, h5dict, include_optimizer)
160 for name, val in zip(weight_names, weight_values):
--> 161 layer_group[name] = val
162 if include_optimizer and model.optimizer:
/databricks/python/lib/python3.7/site-packages/keras/utils/io_utils.py
in setitem(self, attr, val)
230 raise KeyError('Cannot set attribute. '
--> 231 'Group with name "{}" exists.'.format(attr))
232 if is_np:
KeyError: 'Cannot set attribute. Group with name
"b\'dense_1/kernel:0\'" exists.'
During handling of the above exception, another exception occurred:
RuntimeError Traceback (most recent call
last) in
----> 1 model.save('/dbfs/FileStore/tables/temp/new_model.h5')
/databricks/python/lib/python3.7/site-packages/keras/engine/network.py
in save(self, filepath, overwrite, include_optimizer) 1150
raise NotImplementedError 1151 from ..models import
save_model
-> 1152 save_model(self, filepath, overwrite, include_optimizer) 1153 1154 #saving.allow_write_to_gcs
/databricks/python/lib/python3.7/site-packages/keras/engine/saving.py
in save_wrapper(obj, filepath, overwrite, *args, **kwargs)
447 os.remove(tmp_filepath)
448 else:
--> 449 save_function(obj, filepath, overwrite, *args, **kwargs)
450
451 return save_wrapper
/databricks/python/lib/python3.7/site-packages/keras/engine/saving.py
in save_model(model, filepath, overwrite, include_optimizer)
539 return
540 with H5Dict(filepath, mode='w') as h5dict:
--> 541 _serialize_model(model, h5dict, include_optimizer)
542 elif hasattr(filepath, 'write') and callable(filepath.write):
543 # write as binary stream
/databricks/python/lib/python3.7/site-packages/keras/utils/io_utils.py
in exit(self, exc_type, exc_val, exc_tb)
368
369 def exit(self, exc_type, exc_val, exc_tb):
--> 370 self.close()
371
372
/databricks/python/lib/python3.7/site-packages/keras/utils/io_utils.py
in close(self)
344 def close(self):
345 if isinstance(self.data, h5py.Group):
--> 346 self.data.file.flush()
347 if self._is_file:
348 self.data.close()
/databricks/python/lib/python3.7/site-packages/h5py/_hl/files.py in
flush(self)
450 """
451 with phil:
--> 452 h5f.flush(self.id)
453
454 #with_phil
h5py/_objects.pyx in h5py._objects.with_phil.wrapper()
h5py/_objects.pyx in h5py._objects.with_phil.wrapper()
h5py/h5f.pyx in h5py.h5f.flush()
RuntimeError: Unable to flush file's cached information (file write
failed: time = Fri Jan 31 08:19:53 2020 , filename =
'/dbfs/FileStore/tables/temp/new_model.h5', file descriptor = 9, errno
= 95, error message = 'Operation not supported', buf = 0x6993c98, total write size = 320, bytes this sub-write = 320, bytes actually
written = 18446744073709551615, offset = 800)
I was finally able to save the model, by saving it on driver only and copying it on s3...
import os
import shutil
classification_model.save('news_dedup_model.h5')
shutil.copyfile('/databricks/driver/news_dedup_model.h5', '/dbfs/FileStore/tables/temp/nemish/news_dedup_model.h5')
classification_model = load_model('/dbfs/FileStore/tables/temp/nemish/news_dedup_model.h5', custom_objects={'tf': tf})
Still unable to figure out, why wouldn't it save normally
Because keras model.save() doesn't support writing to a FUSE mount. Doing so you'll get 'Operation Not Supported' error.
You need to first write it to the driver node's local disk (where python's working directory is), then move it to DFBS FUSE mount using '/dbfs/your/path/on/DBFS'.

Resources