RuntimeError Traceback (most recent call last)
/tmp/ipykernel_33/2056227650.py in <module>
----> 1 learn.fit_one_cycle(n, max_learning_rate)
2 learn.recorder.plot_losses()
/opt/conda/lib/python3.7/site-packages/fastai/train.py in fit_one_cycle(learn, cyc_len, max_lr, moms, div_factor, pct_start, final_div, wd, callbacks, tot_epochs, start_epoch)
21 callbacks.append(OneCycleScheduler(learn, max_lr, moms=moms, div_factor=div_factor, pct_start=pct_start,
22 final_div=final_div, tot_epochs=tot_epochs, start_epoch=start_epoch))
---> 23 learn.fit(cyc_len, max_lr, wd=wd, callbacks=callbacks)
24
25 def fit_fc(learn:Learner, tot_epochs:int=1, lr:float=defaults.lr, moms:Tuple[float,float]=(0.95,0.85), start_pct:float=0.72,
/opt/conda/lib/python3.7/site-packages/fastai/basic_train.py in fit(self, epochs, lr, wd, callbacks)
198 else: self.opt.lr,self.opt.wd = lr,wd
199 callbacks = [cb(self) for cb in self.callback_fns + listify(defaults.extra_callback_fns)] + listify(callbacks)
--> 200 fit(epochs, self, metrics=self.metrics, callbacks=self.callbacks+callbacks)
201
202 def create_opt(self, lr:Floats, wd:Floats=0.)->None:
/opt/conda/lib/python3.7/site-packages/fastai/basic_train.py in fit(epochs, learn, callbacks, metrics)
104 if not cb_handler.skip_validate and not learn.data.empty_val:
105 val_loss = validate(learn.model, learn.data.valid_dl, loss_func=learn.loss_func,
--> 106 cb_handler=cb_handler, pbar=pbar)
107 else: val_loss=None
108 if cb_handler.on_epoch_end(val_loss): break
/opt/conda/lib/python3.7/site-packages/fastai/basic_train.py in validate(model, dl, loss_func, cb_handler, pbar, average, n_batch)
61 if not is_listy(yb): yb = [yb]
62 nums.append(first_el(yb).shape[0])
---> 63 if cb_handler and cb_handler.on_batch_end(val_losses[-1]): break
64 if n_batch and (len(nums)>=n_batch): break
65 nums = np.array(nums, dtype=np.float32)
/opt/conda/lib/python3.7/site-packages/fastai/callback.py in on_batch_end(self, loss)
306 "Handle end of processing one batch with `loss`."
307 self.state_dict['last_loss'] = loss
--> 308 self('batch_end', call_mets = not self.state_dict['train'])
309 if self.state_dict['train']:
310 self.state_dict['iteration'] += 1
/opt/conda/lib/python3.7/site-packages/fastai/callback.py in __call__(self, cb_name, call_mets, **kwargs)
248 "Call through to all of the `CallbakHandler` functions."
249 if call_mets:
--> 250 for met in self.metrics: self._call_and_update(met, cb_name, **kwargs)
251 for cb in self.callbacks: self._call_and_update(cb, cb_name, **kwargs)
252
/opt/conda/lib/python3.7/site-packages/fastai/callback.py in _call_and_update(self, cb, cb_name, **kwargs)
239 def _call_and_update(self, cb, cb_name, **kwargs)->None:
240 "Call `cb_name` on `cb` and update the inner state."
--> 241 new = ifnone(getattr(cb, f'on_{cb_name}')(**self.state_dict, **kwargs), dict())
242 for k,v in new.items():
243 if k not in self.state_dict:
/opt/conda/lib/python3.7/site-packages/object_detection_fastai/callbacks/callbacks.py in on_batch_end(self, last_output, last_target, **kwargs)
125 scores = scores[:total_nms_examples]
126 preds = preds[:total_nms_examples]
--> 127 to_keep = nms(bbox_pred, scores, self.nms_thresh)
128 bbox_pred, preds, scores = bbox_pred[to_keep].cpu(), preds[to_keep].cpu(), scores[to_keep].cpu()
129
/opt/conda/lib/python3.7/site-packages/object_detection_fastai/helper/object_detection_helper.py in nms(boxes, scores, thresh)
156 mask_keep = iou_vals <= thresh
157 if len(mask_keep.nonzero()) == 0: break
--> 158 idx_first = mask_keep.nonzero().min().item()
159 boxes, scores, indexes = boxes[mask_keep], scores[mask_keep], indexes[mask_keep]
160 return LongTensor(to_keep)
RuntimeError: min(): Expected reduction dim to be specified for input.numel() == 0. Specify the reduction dim with the 'dim' argument.
# %% [code]
%reload_ext autoreload
%autoreload 2
%matplotlib inline
!pip install -U plotly
import json
from pathlib import Path
import plotly
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from tqdm import tqdm
import pandas as pd
import random
import cv2
# %% [code]
folder = "midog-challenge"
midog_folder = Path("../input") / Path(folder)
print(list(midog_folder.glob("*.*")))
# %% [code]
!pip install -U object-detection-fastai
from object_detection_fastai.helper.wsi_loader import *
from object_detection_fastai.loss.RetinaNetFocalLoss import RetinaNetFocalLoss
from object_detection_fastai.models.RetinaNet import RetinaNet
from object_detection_fastai.callbacks.callbacks import BBMetrics, PascalVOCMetricByDistance, PascalVOCMetric, PascalVOCMetricByDistance
# %% [code]
image_folder = midog_folder / "images"
hamamatsu_rx_ids = list(range(0, 51))
hamamatsu_360_ids = list(range(51, 101))
aperio_ids = list(range(101, 151))
leica_ids = list(range(151, 201))
# %% [code]
annotation_file = midog_folder / "MIDOG.json"
print(annotation_file," ",image_folder)
rows = []
with open(annotation_file) as f:
    data = json.load(f)

categories = {1: 'mitotic figure', 2: 'hard negative'}

for row in data["images"]:
    file_name = row["file_name"]
    image_id = row["id"]
    width = row["width"]
    height = row["height"]

    scanner = "Hamamatsu XR"
    if image_id in hamamatsu_360_ids:
        scanner = "Hamamatsu S360"
    if image_id in aperio_ids:
        scanner = "Aperio CS"
    if image_id in leica_ids:
        scanner = "Leica GT450"

    for annotation in [anno for anno in data['annotations'] if anno["image_id"] == image_id]:
        box = annotation["bbox"]
        cat = categories[annotation["category_id"]]
        rows.append([file_name, image_id, width, height, box, cat, scanner])
df = pd.DataFrame(rows, columns=["file_name", "image_id", "width", "height", "box", "cat", "scanner"])
df.head()
# %% [markdown]
# ### Visual Examples
# %% [code]
def sample_function(y, classes, size, level_dimensions, level):
    width, height = level_dimensions[level]
    if len(y[0]) == 0:
        return randint(0, width - size[0]), randint(0, height - size[1])
    else:
        #if randint(0, 5) < 2:
        if True:
            class_id = np.random.choice(classes, 1)[0]  # select a random class
            ids = np.array(y[1]) == class_id  # filter the annotations according to the selected class
            xmin, ymin, _, _ = np.array(y[0])[ids][randint(0, np.count_nonzero(ids) - 1)]  # randomly select one of the filtered annotations as the seed for the training patch
            # add a random offset so the selected annotation is not always centred in the patch
            xmin += random.randint(-size[0] / 2, size[0] / 2)
            ymin += random.randint(-size[1] / 2, size[1] / 2)
            xmin, ymin = max(0, int(xmin - size[0] / 2)), max(0, int(ymin - size[1] / 2))
            xmin, ymin = min(xmin, width - size[0]), min(ymin, height - size[1])
            return xmin, ymin
        else:
            return randint(0, width - size[0]), randint(0, height - size[1])
# %% [code]
def create_wsi_container(annotations_df: pd.DataFrame):
    container = []
    for image_name in tqdm(annotations_df["file_name"].unique()):
        image_annos = annotations_df[annotations_df["file_name"] == image_name]
        bboxes = [box for box in image_annos["box"]]
        labels = [label for label in image_annos["cat"]]
        container.append(SlideContainer(image_folder/image_name, y=[bboxes, labels], level=res_level, width=patch_size, height=patch_size, sample_func=sample_function))
    return container
train_scanner = "Aperio CS" #["Hamamatsu XR", "Hamamatsu S360", "Aperio CS"]
val_scanner = "Hamamatsu XR" #["Hamamatsu XR", "Hamamatsu S360", "Aperio CS"]
patch_size = 256
res_level = 0
train_annos = df[df["scanner"].isin(train_scanner.split(","))]
train_container = create_wsi_container(train_annos)
val_annos = df[df["scanner"].isin(val_scanner.split(","))]
valid_container = create_wsi_container(val_annos)
f"Created: {len(train_container)} training WSI container and {len(valid_container)} validation WSI container"
# %% [code]
import numpy as np
train_samples_per_scanner = 1500
val_samples_per_scanner = 500
train_images = list(np.random.choice(train_container, train_samples_per_scanner))
print('training_images =',len(train_images))
valid_images = list(np.random.choice(valid_container, val_samples_per_scanner))
print('validation_images =',len(valid_images))
# %% [code]
batch_size = 64
do_flip = True
flip_vert = True
max_rotate = 90
max_zoom = 1.1
max_lighting = 0.2
max_warp = 0.2
p_affine = 0.75
p_lighting = 0.75
tfms = get_transforms(do_flip=do_flip,
                      flip_vert=flip_vert,
                      max_rotate=max_rotate,
                      max_zoom=max_zoom,
                      max_lighting=max_lighting,
                      max_warp=max_warp,
                      p_affine=p_affine,
                      p_lighting=p_lighting)
train, valid = ObjectItemListSlide(train_images), ObjectItemListSlide(valid_images)
item_list = ItemLists(".", train, valid)
lls = item_list.label_from_func(lambda x: x.y, label_cls=SlideObjectCategoryList)
lls = lls.transform(tfms, tfm_y=True, size=patch_size)
data = lls.databunch(bs=batch_size, collate_fn=bb_pad_collate,num_workers=0).normalize()
# %% [code]
scales = [2]
ratios=[1]
#The feature map sizes. [(64,64), (32,32) , (16,16), (8,8), (4,4)]
sizes=[(32,32)]
anchors = create_anchors(sizes=sizes, ratios=ratios, scales=scales)
fig,ax = plt.subplots(figsize=(15,15))
ax.imshow(image2np(data.valid_ds[0][0].data))
for i, bbox in enumerate(anchors[:len(scales)*len(ratios)*len(sizes)]):
    bb = bbox.numpy()
    x = (bb[0] + 1) * patch_size / 2
    y = (bb[1] + 1) * patch_size / 2
    w = bb[2] * patch_size / 2
    h = bb[3] * patch_size / 2
    rect = [x, y, w, h]
    draw_rect(ax, rect)
# %% [code]
all_boxes, all_labels = show_anchors_on_images(data, anchors, figsize=(12, 12))
# %% [code]
from fastai.utils.collect_env import show_install
show_install()
# %% [code]
backbone = "ResNet101" #["ResNet18", "ResNet34", "ResNet50", "ResNet101", "ResNet150"]
backbone_model = models.resnet18
if backbone == "ResNet34":
    backbone_model = models.resnet34
if backbone == "ResNet50":
    backbone_model = models.resnet50
if backbone == "ResNet101":
    backbone_model = models.resnet101
if backbone == "ResNet150":
    backbone_model = models.resnet150

pre_trained_on_imagenet = True
encoder = create_body(models.resnet101, pre_trained_on_imagenet, -2)

loss_function = "FocalLoss"
if loss_function == "FocalLoss":
    crit = RetinaNetFocalLoss(anchors)
channels = 128
final_bias = -4
n_conv = 3
model = RetinaNet(encoder, n_classes=data.train_ds.c,
                  n_anchors=len(scales) * len(ratios),
                  sizes=[size[0] for size in sizes],
                  chs=channels,  # number of hidden layers for the classification head
                  final_bias=final_bias,
                  n_conv=n_conv  # Number of hidden layers
                  )
# %% [code]
voc = PascalVOCMetric(anchors, patch_size, [str(i) for i in data.train_ds.y.classes[1:]])
voc
# %% [code]
learn = Learner(data, model, loss_func=crit,
                callback_fns=[BBMetrics, ShowGraph],
                metrics=[voc]
                )
learn.split([model.encoder[6], model.c5top5])
learn.freeze_to(-2)
# %% [code]
learn.lr_find()
learn.recorder.plot(suggestion=True)
# %% [code]
max_learning_rate = 1e-3
n=250
learn.fit_one_cycle(n, max_learning_rate)
learn.recorder.plot_sched()
learn.recorder.plot_losses()
This issue arises when the tensor on which you call max()/min() is empty. In your traceback, mask_keep.nonzero() inside nms() returns no indices, so .min() has nothing to reduce.
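To illustrate (this is a standalone sketch, not the object_detection_fastai code; the IoU values and 0.5 threshold below are made up), reducing an empty tensor without a dim raises exactly this error, and a numel() check avoids it:

import torch

empty = torch.empty(0)
# empty.min()  # would raise: RuntimeError: min(): Expected reduction dim to be specified for input.numel() == 0.

# hypothetical guard in the spirit of the nms() loop from the traceback:
iou_vals = torch.tensor([0.9, 0.8, 0.95])  # every candidate overlaps too much
mask_keep = iou_vals <= 0.5                # -> tensor([False, False, False])
idx = mask_keep.nonzero()
if idx.numel() == 0:
    print("nothing left to keep - stop instead of reducing an empty tensor")
else:
    idx_first = idx.min().item()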
I'm running tests to check whether some choices in my sampling algorithm work better when I change their values.
The tests had been running without a hitch, but when I tried to run a couple more for additional results I got the MemoryError below.
MemoryError Traceback (most recent call last)
<ipython-input-66-1ab060bc6067> in <module>
22 for g in range(0,10000):
23 # sample
---> 24 sample_df = stratified_sample(df,test,size=38, keep_index=False)
25 pathaux = "C://Users//Pedro//Desktop//EscolhasAlgoritmos//Stratified//Stratified_Tests//"
26 example = "exampleFCUL"
<ipython-input-10-7aba847839db> in stratified_sample(df, strata, size, seed, keep_index)
79 # final dataframe
80 if first:
---> 81 stratified_df = df.query(qry).sample(n=n, random_state=seed).reset_index(drop=(not keep_index))
82 first = False
83 else:
D:\Anaconda\lib\site-packages\pandas\core\frame.py in query(self, expr, inplace, **kwargs)
3182 kwargs["level"] = kwargs.pop("level", 0) + 1
3183 kwargs["target"] = None
-> 3184 res = self.eval(expr, **kwargs)
3185
3186 try:
D:\Anaconda\lib\site-packages\pandas\core\frame.py in eval(self, expr, inplace, **kwargs)
3298 kwargs["target"] = self
3299 kwargs["resolvers"] = kwargs.get("resolvers", ()) + tuple(resolvers)
-> 3300 return _eval(expr, inplace=inplace, **kwargs)
3301
3302 def select_dtypes(self, include=None, exclude=None):
D:\Anaconda\lib\site-packages\pandas\core\computation\eval.py in eval(expr, parser, engine, truediv, local_dict, global_dict, resolvers, level, target, inplace)
325 eng = _engines[engine]
326 eng_inst = eng(parsed_expr)
--> 327 ret = eng_inst.evaluate()
328
329 if parsed_expr.assigner is None:
D:\Anaconda\lib\site-packages\pandas\core\computation\engines.py in evaluate(self)
68
69 # make sure no names in resolvers and locals/globals clash
---> 70 res = self._evaluate()
71 return _reconstruct_object(
72 self.result_type, res, self.aligned_axes, self.expr.terms.return_type
D:\Anaconda\lib\site-packages\pandas\core\computation\engines.py in _evaluate(self)
117 truediv = scope["truediv"]
118 _check_ne_builtin_clash(self.expr)
--> 119 return ne.evaluate(s, local_dict=scope, truediv=truediv)
120 except KeyError as e:
121 # python 3 compat kludge
D:\Anaconda\lib\site-packages\numexpr\necompiler.py in evaluate(ex, local_dict, global_dict, out, order, casting, **kwargs)
814 expr_key = (ex, tuple(sorted(context.items())))
815 if expr_key not in _names_cache:
--> 816 _names_cache[expr_key] = getExprNames(ex, context)
817 names, ex_uses_vml = _names_cache[expr_key]
818 arguments = getArguments(names, local_dict, global_dict)
D:\Anaconda\lib\site-packages\numexpr\necompiler.py in getExprNames(text, context)
705
706 def getExprNames(text, context):
--> 707 ex = stringToExpression(text, {}, context)
708 ast = expressionToAST(ex)
709 input_order = getInputOrder(ast, None)
D:\Anaconda\lib\site-packages\numexpr\necompiler.py in stringToExpression(s, types, context)
282 else:
283 flags = 0
--> 284 c = compile(s, '<expr>', 'eval', flags)
285 # make VariableNode's for the names
286 names = {}
MemoryError:
My question is: what is the best way to solve this memory error without changing the number of parameters? Despite all my searching here and on Google, I have found no clear answer.
Code:
def transform(multilevelDict):
    return {"t" + '_' + str(key): (transform(value) if isinstance(value, dict) else value) for key, value in multilevelDict.items()}
df = pd.read_csv('testingwebsitedata6.csv', sep=';')
df['Element_Count'] = df['Element_Count'].apply((json.loads))
df['Tag_Count'] = df['Tag_Count'].apply((json.loads))
for i in range(len(df['Tag_Count'])):
    df['Tag_Count'][i] = transform(df['Tag_Count'][i])
df1 = pd.DataFrame(df['Element_Count'].values.tolist())
df2 = pd.DataFrame(df['Tag_Count'].values.tolist())
df = pd.concat([df.drop('Element_Count', axis=1), df1], axis=1)
df= pd.concat([df.drop('Tag_Count', axis=1), df2], axis=1)
df= df.fillna(0)
df[df.select_dtypes(include=['float64']).columns]= df.select_dtypes(include=['float64']).astype(int)
df
test= ['link', 'document', 'heading', 'form', 'textbox', 'button', 'list', 'listitem', 'img', 'navigation', 'banner', 'main', 'article', 'contentinfo', 'checkbox', 'table', 'rowgroup', 'row', 'cell', 'listbox', 'presentation', 'figure', 'columnheader', 'separator', 'group', 'region', 't_html', 't_head', 't_title', 't_meta', 't_link', 't_script', 't_style', 't_body', 't_a', 't_div', 't_h1', 't_form', 't_label', 't_input', 't_ul', 't_li', 't_i', 't_img', 't_nav', 't_header', 't_span', 't_article', 't_p', 't_footer', 't_h3', 't_br', 't_noscript', 't_em', 't_strong', 't_button', 't_h2', 't_ol', 't_time', 't_center', 't_table', 't_tbody', 't_tr', 't_td', 't_font', 't_select', 't_option', 't_b', 't_figure', 't_figcaption', 't_u', 't_iframe', 't_caption', 't_thead', 't_th', 't_h5', 't_sup', 't_map', 't_area', 't_hr', 't_h4', 't_blockquote', 't_sub', 't_fieldset', 't_legend', 't_pre', 't_main', 't_section', 't_small', 't_tfoot', 't_textarea', 't_inserir', 't_s']
print('test1')
print('\n')
for g in range(0, 10000):
    # sample
    sample_df = stratified_sample(df, test, size=38, keep_index=False)
    pathaux = "C://Users//Pedro//Desktop//EscolhasAlgoritmos//Stratified//Stratified_Tests//"
    example = "exampleFCUL"
    randomnumber = g + 1
    csv = ".csv"
    path = pathaux + '26' + '//' + example + str(randomnumber) + csv
    chosencolumns = ["Uri"]
    sample_df.to_csv(path, sep=';', index=False, columns=chosencolumns, header=False)
Stratified sampling function used:
def stratified_sample(df, strata, size=None, seed=None, keep_index=True):
    '''
    It samples data from a pandas dataframe using strata. These functions use
    proportionate stratification:
        n1 = (N1/N) * n
    where:
        - n1 is the sample size of stratum 1
        - N1 is the population size of stratum 1
        - N is the total population size
        - n is the sampling size
    Parameters
    ----------
    :df: pandas dataframe from which data will be sampled.
    :strata: list containing columns that will be used in the stratified sampling.
    :size: sampling size. If not informed, a sampling size will be calculated
        using Cochran adjusted sampling formula:
            cochran_n = (Z**2 * p * q) / e**2
        where:
            - Z is the z-value. In this case we use 1.96 representing 95%
            - p is the estimated proportion of the population which has an
              attribute. In this case we use 0.5
            - q is 1-p
            - e is the margin of error
        This formula is adjusted as follows:
            adjusted_cochran = cochran_n / (1 + ((cochran_n - 1) / N))
        where:
            - cochran_n = result of the previous formula
            - N is the population size
    :seed: sampling seed
    :keep_index: if True, it keeps a column with the original population index indicator
    Returns
    -------
    A sampled pandas dataframe based on a set of strata.
    Examples
    --------
    >> df.head()
       id  sex  age  city
    0  123   M   20   XYZ
    1  456   M   25   XYZ
    2  789   M   21   YZX
    3  987   F   40   ZXY
    4  654   M   45   ZXY
    ...
    # This returns a sample stratified by sex and city containing 30% of the size of
    # the original data
    >> stratified = stratified_sample(df=df, strata=['sex', 'city'], size=0.3)
    Requirements
    ------------
    - pandas
    - numpy
    '''
    population = len(df)
    size = __smpl_size(population, size)
    tmp = df[strata]
    tmp['size'] = 1
    tmp_grpd = tmp.groupby(strata).count().reset_index()
    tmp_grpd['samp_size'] = round(size / population * tmp_grpd['size']).astype(int)

    # controlling variable to create the dataframe or append to it
    first = True
    for i in range(len(tmp_grpd)):
        # query generator for each iteration
        qry = ''
        for s in range(len(strata)):
            stratum = strata[s]
            value = tmp_grpd.iloc[i][stratum]
            n = tmp_grpd.iloc[i]['samp_size']
            if type(value) == str:
                value = "'" + str(value) + "'"
            if s != len(strata) - 1:
                qry = qry + stratum + ' == ' + str(value) + ' & '
            else:
                qry = qry + stratum + ' == ' + str(value)
        # final dataframe
        if first:
            stratified_df = df.query(qry).sample(n=n, random_state=seed).reset_index(drop=(not keep_index))
            first = False
        else:
            tmp_df = df.query(qry).sample(n=n, random_state=seed).reset_index(drop=(not keep_index))
            stratified_df = stratified_df.append(tmp_df, ignore_index=True)
    return stratified_df
def stratified_sample_report(df, strata, size=None):
    '''
    Generates a dataframe reporting the counts in each stratum and the counts
    for the final sampled dataframe.
    Parameters
    ----------
    :df: pandas dataframe from which data will be sampled.
    :strata: list containing columns that will be used in the stratified sampling.
    :size: sampling size. If not informed, a sampling size will be calculated
        using Cochran adjusted sampling formula:
            cochran_n = (Z**2 * p * q) / e**2
        where:
            - Z is the z-value. In this case we use 1.96 representing 95%
            - p is the estimated proportion of the population which has an
              attribute. In this case we use 0.5
            - q is 1-p
            - e is the margin of error
        This formula is adjusted as follows:
            adjusted_cochran = cochran_n / (1 + ((cochran_n - 1) / N))
        where:
            - cochran_n = result of the previous formula
            - N is the population size
    Returns
    -------
    A dataframe reporting the counts in each stratum and the counts
    for the final sampled dataframe.
    '''
    population = len(df)
    size = __smpl_size(population, size)
    tmp = df[strata]
    tmp['size'] = 1
    tmp_grpd = tmp.groupby(strata).count().reset_index()
    tmp_grpd['samp_size'] = round(size / population * tmp_grpd['size']).astype(int)
    return tmp_grpd
def __smpl_size(population, size):
    '''
    A function to compute the sample size. If not informed, a sampling
    size will be calculated using Cochran adjusted sampling formula:
        cochran_n = (Z**2 * p * q) / e**2
    where:
        - Z is the z-value. In this case we use 1.96 representing 95%
        - p is the estimated proportion of the population which has an
          attribute. In this case we use 0.5
        - q is 1-p
        - e is the margin of error
    This formula is adjusted as follows:
        adjusted_cochran = cochran_n / (1 + ((cochran_n - 1) / N))
    where:
        - cochran_n = result of the previous formula
        - N is the population size
    Parameters
    ----------
    :population: population size
    :size: sample size (default = None)
    Returns
    -------
    Calculated sample size to be used in the functions:
        - stratified_sample
        - stratified_sample_report
    '''
    if size is None:
        cochran_n = round(((1.96)**2 * 0.5 * 0.5) / 0.02**2)
        n = round(cochran_n / (1 + ((cochran_n - 1) / population)))
    elif size >= 0 and size < 1:
        n = round(population * size)
    elif size < 0:
        raise ValueError('Parameter "size" must be an integer or a proportion between 0 and 0.99.')
    elif size >= 1:
        n = size
    return n
(If I have forgotten to mention anything you feel is important for understanding the problem, please say so and I will edit it in.)
def run():
    rundecision = input("What do you want to do? calculate distance(d),pace(p) time(t):")
    if rundecision in ['distance', 'd']:
        pace()
        time()
        distance = calculator(distance=None, pace=pacetotal, time=timetotal)
        return str(distance) + paceunit

print(run())
My pace() function, where pacetotal is defined and returned, is below; it is called in run() above.
def pace():
    while True:
        pacemin = input("Enter what pace you want to run/ you ran in :00(min):")   # user pace in min
        pacesec = input("Enter what pace you want to run/ you ran in :00(secs):")  # user pace in sec
        try:
            pacemin = int(pacemin)
            pacesec = int(pacesec)
            if 0 <= pacemin <= 59 and 0 <= pacesec <= 59:
                pacetotal = (to_seconds(pacemin, 'min')) + (to_seconds(pacesec, 's'))
                pacetotal = int(pacetotal)
                return pacetotal
                break
This is my error:
Traceback (most recent call last):
  File "minicapstonev2.py", line 188, in <module>
    print (run())
  File "minicapstonev2.py", line 185, in run
    distance=calculator(distance=None,pace=pacetotal,time=timetotal)
NameError: name 'pacetotal' is not defined
You have to assign the return values of your functions to variables that you can use:
if rundecision in ['distance', 'd']:
    pacetotal = pace()  # pace returns the seconds
    timetotal = time()  # this should also return the value for your next computation
    distance = calculator(distance=None, pace=pacetotal, time=timetotal)
    return str(distance) + paceunit
You did not supply the time() function, but if it is similar to pace this should work.
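For example, a hypothetical time() in the same style as your pace() (the names timemin, timesec, and timetotal are my own; if you have a to_seconds() helper like pace() uses, you can call it instead of the inline arithmetic):

def time():
    while True:
        timemin = input("Enter the time you ran in :00(min):")   # user time in minutes
        timesec = input("Enter the time you ran in :00(secs):")  # user time in seconds
        try:
            timemin = int(timemin)
            timesec = int(timesec)
            if 0 <= timemin <= 59 and 0 <= timesec <= 59:
                timetotal = timemin * 60 + timesec  # or: to_seconds(timemin, 'min') + to_seconds(timesec, 's')
                return int(timetotal)
        except ValueError:
            print("Please enter whole numbers between 0 and 59.")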
Edit, due to a question in the comments:
Some variations on how to approach return values in your program:
# Different ways to return something from a function
def ModifyGivenDict(di, li):
    # do some calculations
    di["milesPerHour"] = 42
    li.append(999)

def ReturnAValue(x):
    return x ** x

def ReturnMultipleValues(x):
    return [x * u for u in range(20)]

def ReturnTuple(x, y):
    return (x, y, x * y, x ** y, y ** x, "someValue")

d = {"Round1": 496}
l = [2, 3, 4, 5, "Hurray"]

a = ModifyGivenDict(d, l)
print(d)

k = ReturnAValue(22)
print(k)

i = ReturnMultipleValues(22)
print(i)

h = ReturnTuple(4, 7)
print(h)
# OOP approach
class Timings:
    @staticmethod
    def toSeconds(text):
        """Understands up to '12:18:24.3545' - returns a float value of seconds"""
        t = [0, 0] + text.split(":")   # prefix with 2 * 0 if only "22" or "1:22" is given
        t[-1] = float(t[-1])           # make the last element a float
        t[-2] = int(t[-2]) * 60        # make integer minutes into seconds
        t[-3] = int(t[-3]) * 60 * 60   # make integer hours into seconds
        return sum(t)

    @staticmethod
    def toMeters(distance):
        """Understands '255.23 meters'"""
        converterDict = {'mile': 1609.34, 'mi': 1609.34, 'km': 1000, 'kilometer': 1000,
                         'y': 0.9144, 'yard': 0.9144, 'meter': 1, 'm': 1}
        dist, unit = distance.split(" ")
        dist = float(dist)
        unit = unit.rstrip("s").strip().lower()
        return dist * converterDict[unit]

    def __init__(self, name):
        self.name = name
        self.laps = {}
        self.lap = 0

    def addLap(self, minutesColonSeconds, distance):
        t = self.toSeconds(minutesColonSeconds)
        m = self.toMeters(distance)
        self.laps[self.lap] = {"time": t, "distance": m}
        self.lap += 1

    def printLaps(self):
        print("Results for " + self.name, sep="\t")
        print("{:<14} {:<14} {:<14} {:<14} {:<14} {:<14} {:<14}".format(
            "lap", "m", "s", "m/s", "total time", "total dist", "speed"))
        tm = 0
        tt = 0
        # you could also use an OrderedDict from collections
        for k in sorted(self.laps.keys()):
            m = self.laps[k]["distance"]
            t = self.laps[k]["time"]
            tm += m
            tt += t
            print("{:<14} {:<14} {:<14} {:<14} {:<14} {:<14} {:<14}".format(
                k, m, t, round(m / t, 2), tt, tm, round(tm / tt, 2)))
def inputTime(text):
    while True:
        i = input(text)
        try:
            t = [0, 0] + i.split(":")
            sec = float(t[-1])
            min = int(t[-2])
            hou = int(t[-3])
            if sec + min * 60 + hou * 60 * 60 <= 0:
                raise ValueError
            return i
        except (ValueError, EOFError):
            print("Wrong input! Use '1:12:23.99' for hour:minute:second.partials")

def inputDistance(text):
    while True:
        t = input(text)
        try:
            dis, un = t.split()
            dis = float(dis)
            return t
        except:
            print("Wrong input. Use: '23 km' - or: meter, mile(s), yard(s), m, mi, y")
print("\nClassaproach\n\n")
timing = Timings("Just Me")
while True:
    dis = inputDistance("What distance did you cover?")
    tim = inputTime("In what time?")
    timing.addLap(tim, dis)
    timing.printLaps()
Output (edited to better fit here):
{'Round1': 496, 'milesPerHour': 42}
341427877364219557396646723584
[0, 22, 44, 66, 88, 110, 132, 154, 176, 198, 220, 242, 264, 286,
308, 330, 352, 374, 396, 418]
(4, 7, 28, 16384, 2401, 'someValue')
Classaproach
What distance did you cover?100 m
In what time?12.02
Results for Just Me
lap m s m/s total time total dist speed
0 100.0 12.02 8.32 12.02 100.0 8.32
What distance did you cover?20 km
In what time?2:0:01
Results for Just Me
lap m s m/s total time total dist speed
0 100.0 12.02 8.32 12.02 100.0 8.32
1 20000.0 7201.0 2.78 7213.02 20100.0 2.79
What distance did you cover?5 mi
In what time?1:1:1
Results for Just Me
lap m s m/s total time total dist speed
0 100.0 12.02 8.32 12.02 100.0 8.32
1 20000.0 7201.0 2.78 7213.02 20100.0 2.79
2 8046.7 3661.0 2.2 10874.02 28146.7 2.59
What distance did you cover?120 km
In what time?1:02:00
Results for Just Me
lap m s m/s total time total dist speed
0 100.0 12.02 8.32 12.02 100.0 8.32
1 20000.0 7201.0 2.78 7213.02 20100.0 2.79
2 8046.7 3661.0 2.2 10874.02 28146.7 2.59
3 120000.0 3720.0 32.26 14594.02 148146.7 10.15
What distance did you cover?
...
I want to apply my function (f1) to an array of numbers (cdr_test) using multiprocessing. My code:
cdr_test = [x for x in range(0, 100000)]

def f1(el):
    a = Counter()  # make new vector for each cdr
    for k, v in d3.items():
        if el in v:
            a = a + Counter(itertools.product([el], v))
    return a

if __name__ == '__main__':
    pool = mp.Pool(20)
    results = pool.map(f1, cdr_test)
    pool.close()
    pool.join()

out = open('out.txt', 'w')
for result in results:
    for k, v in result.items():
        out.write('\t'.join(map(str, k)) + "\t" + str(v) + "\n")
out.close()
pool.close()
I get 'cannot allocate memory'. If I use a smaller array (length 100), everything works.
Stacktrace:
OSError Traceback (most recent call last)
<ipython-input-3-b8dc4a3d12b3> in <module>()
9
10 if __name__ == '__main__':
---> 11 pool = mp.Pool(1000)
12 results = pool.map(f1, cdr_test)
13 #new section
/home/fedorovaad/anaconda3/lib/python3.5/multiprocessing/context.py in Pool(self, processes, initializer, initargs, maxtasksperchild)
116 from .pool import Pool
117 return Pool(processes, initializer, initargs, maxtasksperchild,
--> 118 context=self.get_context())
119
120 def RawValue(self, typecode_or_type, *args):
/home/fedorovaad/anaconda3/lib/python3.5/multiprocessing/pool.py in __init__(self, processes, initializer, initargs, maxtasksperchild, context)
166 self._processes = processes
167 self._pool = []
--> 168 self._repopulate_pool()
169
170 self._worker_handler = threading.Thread(
/home/fedorovaad/anaconda3/lib/python3.5/multiprocessing/pool.py in _repopulate_pool(self)
231 w.name = w.name.replace('Process', 'PoolWorker')
232 w.daemon = True
--> 233 w.start()
234 util.debug('added worker')
235
/home/fedorovaad/anaconda3/lib/python3.5/multiprocessing/process.py in start(self)
103 'daemonic processes are not allowed to have children'
104 _cleanup()
--> 105 self._popen = self._Popen(self)
106 self._sentinel = self._popen.sentinel
107 _children.add(self)
/home/fedorovaad/anaconda3/lib/python3.5/multiprocessing/context.py in _Popen(process_obj)
265 def _Popen(process_obj):
266 from .popen_fork import Popen
--> 267 return Popen(process_obj)
268
269 class SpawnProcess(process.BaseProcess):
/home/fedorovaad/anaconda3/lib/python3.5/multiprocessing/popen_fork.py in __init__(self, process_obj)
18 sys.stderr.flush()
19 self.returncode = None
---> 20 self._launch(process_obj)
21
22 def duplicate_for_child(self, fd):
/home/fedorovaad/anaconda3/lib/python3.5/multiprocessing/popen_fork.py in _launch(self, process_obj)
65 code = 1
66 parent_r, child_w = os.pipe()
---> 67 self.pid = os.fork()
68 if self.pid == 0:
69 try:
OSError: [Errno 12] Cannot allocate memory
Are there ways to solve this?
The code you show is different from the one in the error.
---> 11 pool = mp.Pool(1000)
You are trying to spawn far too many processes; the OS will run out of memory before it can allocate them all.
You don't need that many processes to do the job. Use multiprocessing.cpu_count() to find out how many CPUs your platform has and spawn a pool of that size.
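A minimal, self-contained sketch of that change (the trivial f1 here is only a stand-in for yours; chunksize is an optional tuning knob):

import multiprocessing as mp

def f1(el):          # stand-in for the f1 from the question
    return el * el

cdr_test = list(range(100000))

if __name__ == '__main__':
    # spawn one worker per CPU instead of a hard-coded 20 or 1000
    with mp.Pool(processes=mp.cpu_count()) as pool:
        results = pool.map(f1, cdr_test, chunksize=1000)
    print(len(results))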
Hello, I am trying to use NLTK to tokenize some text and generate POS tags, but I get an error in spite of importing nltk.
bs=BeautifulSoup(web.text, 'html.parser')
print (bs)
tokes=nltk.word_tokenize (bs)
tags= nltk.pos_tag(tokes)
TypeError Traceback (most recent call last)
<ipython-input-71-f1434047d3f5> in <module>()
1 bs=BeautifulSoup(web.text, 'html.parser')
2 print (bs)
----> 3 tokes=nltk.word_tokenize (bs)
4 tags= nltk.pos_tag(tokes)
5 tags
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\__init__.py in word_tokenize(text, language)
104 :param language: the model name in the Punkt corpus
105 """
--> 106 return [token for sent in sent_tokenize(text, language)
107 for token in _treebank_word_tokenize(sent)]
108
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\__init__.py in sent_tokenize(text, language)
89 """
90 tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
---> 91 return tokenizer.tokenize(text)
92
93 # Standard word tokenizer.
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in tokenize(self, text, realign_boundaries)
1224 Given a text, returns a list of the sentences in that text.
1225 """
-> 1226 return list(self.sentences_from_text(text, realign_boundaries))
1227
1228 def debug_decisions(self, text):
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in sentences_from_text(self, text, realign_boundaries)
1272 follows the period.
1273 """
-> 1274 return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
1275
1276 def _slices_from_text(self, text):
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in span_tokenize(self, text, realign_boundaries)
1263 if realign_boundaries:
1264 slices = self._realign_boundaries(text, slices)
-> 1265 return [(sl.start, sl.stop) for sl in slices]
1266
1267 def sentences_from_text(self, text, realign_boundaries=True):
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in <listcomp>(.0)
1263 if realign_boundaries:
1264 slices = self._realign_boundaries(text, slices)
-> 1265 return [(sl.start, sl.stop) for sl in slices]
1266
1267 def sentences_from_text(self, text, realign_boundaries=True):
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in _realign_boundaries(self, text, slices)
1302 """
1303 realign = 0
-> 1304 for sl1, sl2 in _pair_iter(slices):
1305 sl1 = slice(sl1.start + realign, sl1.stop)
1306 if not sl2:
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in _pair_iter(it)
308 """
309 it = iter(it)
--> 310 prev = next(it)
311 for el in it:
312 yield (prev, el)
C:\Users\DESDEJEI\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py in _slices_from_text(self, text)
1276 def _slices_from_text(self, text):
1277 last_break = 0
-> 1278 for match in self._lang_vars.period_context_re().finditer(text):
1279 context = match.group() + match.group('after_tok')
1280 if self.text_contains_sentbreak(context):
TypeError: expected string or bytes-like object
Could anyone help me figure out where exactly I may have gone wrong with my syntax?
You're passing the BeautifulSoup object bs to the tokenize function when you should be passing bs.text.
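For example, a sketch with the soup's text extracted first (the example.com request is only a placeholder for your own, and the punkt and averaged_perceptron_tagger NLTK data must already be downloaded):

import nltk
import requests
from bs4 import BeautifulSoup

web = requests.get("https://example.com")   # stand-in for your request
bs = BeautifulSoup(web.text, 'html.parser')
tokes = nltk.word_tokenize(bs.text)          # bs.text (or bs.get_text()) is a plain string
tags = nltk.pos_tag(tokes)
print(tags[:10])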