Python code slowing down when wrapped in a function - python-3.x

I'm reading and processing a file (using the same bit of code) at two very different speeds: 1. scripted (50K+ iterations per second) and 2. wrapped in a function (~300 iterations per second). I really can't figure out why there is such a massive difference in reading/processing speed.
The module structure (unused and irrelevant files omitted. The code is at the end.):
| experiments/
|--| experiment_runner.py
|
| module/
|--| shared/
|--|--| dataloaders.py
|--|--| data.py
In data.py we have the method load (the class wrapping it inherits from torch.utils.data.Dataset), which actually loads the file. In dataloaders.py I prepare the arguments to pass to load, wrapped in a function for each dataset that I'm using. This is then passed to a loader function, which handles splitting the dataset and so on.
experiment_runner.py is then where the difference in speed shows up. If I use the dataset functions in dataloaders.py, loading happens at around 300 iterations/second. If I copy the code out of the function and put it directly into experiment_runner, still using the loader function from dataloaders.py (so, not wrapped in a function for each dataset), loading happens at roughly 50000 iterations/second. I am at a complete loss as to why wrapping code in a function would alter its speed that drastically.
Now the actual code:
data.py:
def load(self, dataset: str = 'train', skip_header = True, **kwargs) -> None:
    fp = open(self.data_files[dataset])
    if skip_header:
        next(fp)

    data = []
    for line in tqdm(self.reader(fp), desc = f'loading {self.name} ({dataset})'):
        data_line, datapoint = {}, base.Datapoint()

        for field in self.train_fields:
            idx = field.index if self.ftype in ['CSV', 'TSV'] else field.cname
            data_line[field.name] = self.process_doc(line[idx].rstrip())
            data_line['original'] = line[idx].rstrip()

        for field in self.label_fields:
            idx = field.index if self.ftype in ['CSV', 'TSV'] else field.cname
            if self.label_preprocessor:
                data_line[field.name] = self.label_preprocessor(line[idx].rstrip())
            else:
                data_line[field.name] = line[idx].rstrip()

        for key, val in data_line.items():
            setattr(datapoint, key, val)
        data.append(datapoint)
    fp.close()

    if self.length is None:
        # Get the max length
        lens = []
        for doc in data:
            for f in self.train_fields:
                lens.append(len([tok for tok in getattr(doc, getattr(f, 'name'))]))
        self.length = max(lens)

    if dataset == 'train':
        self.data = data
    elif dataset == 'dev':
        self.dev = data
    elif dataset == 'test':
        self.test = data
dataloaders.py:
def loader(args: dict, **kwargs):
    """Loads the dataset.
    :args (dict): Dict containing arguments to load the dataset.
    :returns: Loaded and split dataset.
    """
    dataset = GeneralDataset(**args)
    dataset.load('train', **kwargs)

    if (args['dev'], args['test']) == (None, None):  # Only train set is given.
        dataset.split(dataset.data, [0.8, 0.1, 0.1], **kwargs)
    elif args['dev'] is not None and args['test'] is None:  # Dev set is given, test is not.
        dataset.load('dev', **kwargs)
        dataset.split(dataset.data, [0.8], **kwargs)
    elif args['dev'] is None and args['test'] is not None:  # Test is given, dev is not.
        dataset.split(dataset.data, [0.8], **kwargs)
        dataset.dev_set = dataset.test
        dataset.load('test', **kwargs)
    else:  # Both dev and test sets are given.
        dataset.load('dev', **kwargs)
        dataset.load('test', **kwargs)

    return dataset


def binarize(label: str) -> str:
    if label in ['0', '1']:
        return 'pos'
    else:
        return 'neg'


def datal(path: str, cleaners: base.Callable, preprocessor: base.Callable = None):
    args = {'data_dir': path,
            'ftype': 'csv',
            'fields': None,
            'train': 'dataset.csv', 'dev': None, 'test': None,
            'train_labels': None, 'dev_labels': None, 'test_labels': None,
            'sep': ',',
            'tokenizer': lambda x: x.split(),
            'preprocessor': preprocessor,
            'transformations': None,
            'length': None,
            'label_preprocessor': binarize,
            'name': 'First dataset.'
            }

    ignore = base.Field('ignore', train = False, label = False, ignore = True)
    d_text = base.Field('text', train = True, label = False, ignore = False, ix = 6, cname = 'text')
    d_label = base.Field('label', train = False, label = True, cname = 'label', ignore = False, ix = 5)

    args['fields'] = [ignore, ignore, ignore, ignore, ignore, d_label, d_text]

    return loader(args)
And for completeness:
experiment_runner.py
from module.dataloaders import datal, loader

dataset = datal()  # Slow: 300-ish iterations/second


# Fast version: 50000 iter/second
def binarize(label: str) -> str:
    if label in ['0', '1']:
        return 'pos'
    else:
        return 'neg'


args = {'data_dir': path,
        'ftype': 'csv',
        'fields': None,
        'train': 'dataset.csv', 'dev': None, 'test': None,
        'train_labels': None, 'dev_labels': None, 'test_labels': None,
        'sep': ',',
        'tokenizer': lambda x: x.split(),
        'preprocessor': preprocessor,
        'transformations': None,
        'length': None,
        'label_preprocessor': binarize,
        'name': 'First dataset.'
        }

ignore = base.Field('ignore', train = False, label = False, ignore = True)
d_text = base.Field('text', train = True, label = False, ignore = False, ix = 6, cname = 'text')
d_label = base.Field('label', train = False, label = True, cname = 'label', ignore = False, ix = 5)

args['fields'] = [ignore, ignore, ignore, ignore, ignore, d_label, d_text]

dataset = loader(args)
I would ideally prefer to keep the dataset functions (e.g. datal) wrapped to keep the logic separate but with this speed decrease, that's not feasible.
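For what it's worth, profiling both code paths is one way to narrow down where the time actually goes before restructuring anything. A minimal sketch using only the standard-library profiler (the datal call is left without arguments exactly as in the snippet above; supply the real ones):

import cProfile
import pstats

from module.dataloaders import datal

profiler = cProfile.Profile()
profiler.enable()
dataset = datal()  # as in the question; supply the real path/cleaners arguments here
profiler.disable()

# Compare this report against one taken from the inlined "fast" version.
stats = pstats.Stats(profiler)
stats.sort_stats('cumulative').print_stats(20)  # top 20 calls by cumulative time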

Related

How to fix this SettingWithCopyWarning with pd.DataFrame.apply()?

I have a dataframe with 2 columns: ImageData and Label. The ImageData column holds 2-D arrays of various dimensions; the Label column is boolean True/False.
I'm trying to convert the data in the "ImageData" column to a 128x128 shape (along with some other minor transformations). So I'm doing the following:
def convert_image_to_binary_image(img: np.ndarray, threshold: int = 1, max_value: int = 1) -> np.ndarray:
    ret, bin_img = cv.threshold(img, thresh=threshold, maxval=max_value, type=cv.THRESH_BINARY)
    bin_img = bin_img.astype('float32')
    return bin_img


def transform_img_dimension(img: np.ndarray, target_width: int = 128, target_height: int = 128) -> np.ndarray:
    img = img.astype('uint8')
    bin_image = convert_image_to_binary_image(img)
    bin_3dimg = tf.expand_dims(input=bin_image, axis=2)
    bin_img_reshaped = tf.image.resize_with_pad(image=bin_3dimg, target_width=target_width, target_height=target_height, method="bilinear")
    xformed_img = np.squeeze(bin_img_reshaped, axis=2)
    # return xformed_img.copy()
    return xformed_img
I'm calling apply as follows:
testDF["ImageData"] = testDF.apply(lambda row: transform_img_dimension(row["ImageData"]), axis=1)
But that's causing SettingWithCopyWarning.
I tried defining a wrapper function (instead of the lambda) as follows:
def transform_dimension(row: pd.Series, target_width: int = 128, target_height: int = 128) -> np.ndarray:
    copy_row = row.copy(deep=True)
    xformed_data = transform_img_dimension(copy_row["ImageData"], target_width=target_width, target_height=target_height)
    del copy_row
    return xformed_data
And updated the call to apply as follows:
testDF["ImageData"] = testDF.apply(transform_dimension, axis=1)
However, this is not resolving the problem. What is the fix for this warning for my case?
Update 1:
If I rewrite it as follows, I don't get the warning:
testDF2 = testDF.copy(deep=True)
testDF2["ImageData"] = testDF.apply(lambda row: transform_img_dimension(row["ImageData"]), axis=1)
Isn't it a memory overhead now to hold two dataframes? Am I supposed to delete the original dataframe, testDF, now?
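For what it's worth, this warning usually means testDF is itself a slice/copy of another dataframe. A sketch of a common way to handle that without keeping two full frames around (names taken from the question; this assumes testDF was derived from another dataframe):

# Rebind testDF to an explicit, independent copy once; the previous object can then
# be garbage-collected, so only one full dataframe stays in memory.
testDF = testDF.copy()

# A column-wise apply is also enough here, since transform_img_dimension takes
# the image array as its first argument (no row-wise lambda needed).
testDF["ImageData"] = testDF["ImageData"].apply(transform_img_dimension)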

Plotting multiple lines with a Nested Dictionary, and unknown variables to Line Graph

I was able to find somewhat of an answer to my question, but it was not as nested as my dictionary, so I am really unsure how to proceed as I am still very new to Python. I currently have a nested dictionary like:
{'140.10': {'46': {'1': '-49.50918', '2': '-50.223637', '3': '49.824406'}, '28': {'1': '-49.50918', '2': '-50.223637', '3': '49.824406'}}}
I want to plot it so that '140.10' becomes the title of the graph, '46' and '28' become the individual lines, a key such as '1' is on the y axis, and the x axis is the final number (in this case '-49.50918'). Essentially a graph like this:
I generated this example graph in Excel, from a CSV file that is written by another part of the code:
[example line graph image]
The problem I am running into is that these keys are autogenerated from a larger CSV file in an earlier part of the script, so I will not know their exact values until the code has been run. I will be running it over various files (whose names become the graph titles), and each file will have different values for:
{key1: {key2_1: {key3_1: value1, key3_2: value2, key3_3: value3}, key2_2: ...}}
I have tried to do something like this:
for filename in os.listdir(Directory):
    if filename.endswith('.csv'):
        q = filename.split('.csv')[0]
        s = q.split('_')[0]
        if s in time_an_dict:
            atom = list(time_an_dict[s])
            ion = time_an_dict[s]
            for f in time_an_dict[s]:
                x_val = []
                y_val = []
                fz = ion[f]
                for i in time_an_dict[s][f]:
                    pos = (fz[i])
                    frame = i
                    y_val.append(frame)
                    x_val.append(pos)
                '''ions = atom
                frame = frames
                position = pos
                plt.plot(frame, position, label = frames)
                plt.xlabel("Frame")
                plt.ylabel("Position")
                plt.show()
                #plt.savefig('{}_Pos.png'.format(s))'''
But it has not run as intended.
I have also tried:
for filename in os.listdir(Directory):
    if filename.endswith('_Atom.csv'):
        q = filename.split('.csv')[0]
        s = q.split('_')[0]
        if s in window_dict:
            name = s + '_Atom.csv'
            time_an_dict[s] = analyze_time(name, window_dict[s])
            new = '{}_A_pos.csv'.format(s)
            ions = list(time_an_dict.values())[0].keys()
            for i in ions:
                x_axis_values = []
                y_axis_values = []
                frame = list(time_an_dict[s][i])
                x_axis_values.append(frame)
                empty = []
                print(x_axis_values)
                for x in frame:
                    values = time_an_dict[s][i][x]
                    empty.append(values)
                y_axis_values.append(empty)
                plt.plot(x_axis_values, y_axis_values, label = x)
                plt.show()
But I keep getting this error:
Traceback (most recent call last):
  File "Atoms_pos.py", line 175, in <module>
    plt.plot(x_axis_values, y_axis_values, label = x )
  File "/Users/hxb51/opt/anaconda3/lib/python3.8/site-packages/matplotlib/pyplot.py", line 2840, in plot
    return gca().plot(
  File "/Users/hxb51/opt/anaconda3/lib/python3.8/site-packages/matplotlib/axes/_axes.py", line 1743, in plot
    lines = [*self._get_lines(*args, data=data, **kwargs)]
  File "/Users/hxb51/opt/anaconda3/lib/python3.8/site-packages/matplotlib/axes/_base.py", line 273, in __call__
    yield from self._plot_args(this, kwargs)
  File "/Users/hxb51/opt/anaconda3/lib/python3.8/site-packages/matplotlib/axes/_base.py", line 394, in _plot_args
    self.axes.xaxis.update_units(x)
  File "/Users/hxb51/opt/anaconda3/lib/python3.8/site-packages/matplotlib/axis.py", line 1466, in update_units
    default = self.converter.default_units(data, self)
  File "/Users/hxb51/opt/anaconda3/lib/python3.8/site-packages/matplotlib/category.py", line 107, in default_units
    axis.set_units(UnitData(data))
  File "/Users/hxb51/opt/anaconda3/lib/python3.8/site-packages/matplotlib/category.py", line 176, in __init__
    self.update(data)
  File "/Users/hxb51/opt/anaconda3/lib/python3.8/site-packages/matplotlib/category.py", line 209, in update
    for val in OrderedDict.fromkeys(data):
TypeError: unhashable type: 'numpy.ndarray'
Here is the remainder of the other parts of the code that generate the files and dictionaries I am using. I was told in another question I asked that this could be helpful.
# importing dependencies
import math
import sys
import pandas as pd
import MDAnalysis as mda
import os
import numpy as np
import csv
import matplotlib.pyplot as plt
################################################################################
###############################################################################
Directory = '/Users/hxb51/Desktop/Q_prof/Displacement_Charge/Blah'
os.chdir(Directory)
################################################################################
''' We are only looking at the positions of the CLAs and SODs and not the DRUDE counterparts. We are assuming the DRUDE
are very close and it is not something that needs to be concerned with'''
def Positions(dcd, topo):
    fields = ['Window', 'ION', 'ResID', 'Location', 'Position', 'Frame', 'Final']
    with open('{}_Atoms.csv'.format(s), 'a') as d:
        writer = csv.writer(d)
        writer.writerow(fields)
        d.close()
    CLAs = u.select_atoms('segid IONS and name CLA')
    SODs = u.select_atoms('segid IONS and name SOD')
    CLA_res = len(CLAs)
    SOD_res = len(SODs)
    frame = 0
    for ts in u.trajectory[-10:]:
        frame += 1
        CLA_pos = CLAs.positions[:,2]
        SOD_pos = SODs.positions[:,2]
        for i in range(CLA_res):
            ids = i + 46
            if CLA_pos[i] < 0:
                with open('{}_Atoms.csv'.format(s), 'a') as q:
                    new_line = [s, 'CLA', ids, 'Bottom', CLA_pos[i], frame, 10]
                    writes = csv.writer(q)
                    writes.writerow(new_line)
                    q.close()
            else:
                with open('{}_Atoms.csv'.format(s), 'a') as q:
                    new_line = [s, 'CLA', ids, 'Top', CLA_pos[i], frame, 10]
                    writes = csv.writer(q)
                    writes.writerow(new_line)
                    q.close()
        for i in range(SOD_res):
            ids = i
            if SOD_pos[i] < 0:
                with open('{}_Atoms.csv'.format(s), 'a') as q:
                    new_line = [s, 'SOD', ids, 'Bottom', SOD_pos[i], frame, 10]
                    writes = csv.writer(q)
                    writes.writerow(new_line)
                    q.close()
            else:
                with open('{}_Atoms.csv'.format(s), 'a') as q:
                    new_line = [s, 'SOD', ids, 'Top', SOD_pos[i], frame, 10]
                    writes = csv.writer(q)
                    writes.writerow(new_line)
                    q.close()
    csv_Data = pd.read_csv('{}_Atoms.csv'.format(s))
    filename = s + '_Atom.csv'
    sorted_df = csv_Data.sort_values(["ION", "ResID", "Frame"],
                                     ascending=[True, True, True])
    sorted_df.to_csv(filename, index = False)
    os.remove('{}_Atoms.csv'.format(s))
''' this function underneath looks at the ResIds, compares them to make sure they are the same and then counts how many
times the ion flip flops around the boundaries'''
def turn_dict(f):
    read = open(f)
    reader = csv.reader(read, delimiter=",", quotechar = '"')
    my_dict = {}
    new_list = []
    for row in reader:
        new_list.append(row)
    for i in range(len(new_list[:])):
        prev = i - 1
        if new_list[i][2] == new_list[prev][2]:
            if new_list[i][3] != new_list[prev][3]:
                if new_list[i][2] in my_dict:
                    my_dict[new_list[i][2]] += 1
                else:
                    my_dict[new_list[i][2]] = 1
    return my_dict
def plot_flips(f):
    dict = turn_dict(f)
    ions = list(dict.keys())
    occ = list(dict.values())
    plt.bar(range(len(dict)), occ, tick_label = ions)
    plt.title("{}".format(s))
    plt.xlabel("Residue ID")
    plt.ylabel("Boundary Crosses")
    plt.savefig('{}_Flip.png'.format(s))
def analyze_time(f, dicts):
    read = open(f)
    reader = csv.reader(read, delimiter=",", quotechar='"')
    new_list = []
    keys = list(dicts.keys())
    time_dict = {}
    pos_matrix = {}
    for row in reader:
        new_list.append(row)

    fields = ['ResID', 'Position', 'Frame']
    with open('{}_A_pos.csv'.format(s), 'a') as k:
        writer = csv.writer(k)
        writer.writerow(fields)
        k.close()

    for i in range(len(new_list[:])):
        if new_list[i][2] in keys:
            with open('{}_A_pos.csv'.format(s), 'a') as k:
                new_line = [new_list[i][2], new_list[i][4], new_list[i][5]]
                writes = csv.writer(k)
                writes.writerow(new_line)
                k.close()

    read = open('{}_A_pos.csv'.format(s))
    reader = csv.reader(read, delimiter=",", quotechar='"')
    time_list = []
    for row in reader:
        time_list.append(row)

    for j in range(len(keys)):
        for i in range(len(time_list[1:])):
            if time_list[i][0] == keys[j]:
                pos_matrix[time_list[i][2]] = time_list[i][1]
        time_dict[keys[j]] = pos_matrix

    return time_dict
window_dict = {}
for filename in os.listdir(Directory):
    s = filename.split('.dcd')[0]
    fors = s + '.txt'
    topos = '/Users/hxb51/Desktop/Q_prof/Displacement_Charge/topo.psf'
    if filename.endswith('.dcd'):
        print('We are starting with {} \n '.format(s))
        u = mda.Universe(topos, filename)
        Positions(filename, topos)
        name = s + '_Atom.csv'
        plot_flips(name)
        window_dict[s] = turn_dict(name)
        continue

time_an_dict = {}
for filename in os.listdir(Directory):
    if filename.endswith('.csv'):
        q = filename.split('.csv')[0]
        s = q.split('_')[0]
        if s in window_dict:
            name = s + '_Atom.csv'
            time_an_dict[s] = analyze_time(name, window_dict[s])

for filename in os.listdir(Directory):
    if filename.endswith('.csv'):
        q = filename.split('.csv')[0]
        s = q.split('_')[0]
        if s in time_an_dict:
            atom = list(time_an_dict[s])
            ion = time_an_dict[s]
            for f in time_an_dict[s]:
                x_val = []
                y_val = []
                fz = ion[f]
                for i in time_an_dict[s][f]:
                    pos = (fz[i])
                    frame = i
                    y_val.append(frame)
                    x_val.append(pos)
                '''ions = atom
                frame = frames
                position = pos
                plt.plot(frame, position, label = frames)
                plt.xlabel("Frame")
                plt.ylabel("Position")
                plt.show()
                #plt.savefig('{}_Pos.png'.format(s))'''
Everything here runs well except this last bottom block of code. That deals with trying to make a graph from a nested dictionary. Any help would be appreciated!
Thanks!
I figured out the answer:
for filename in os.listdir(Directory):
    if filename.endswith('_Atom.csv'):
        q = filename.split('.csv')[0]
        s = q.split('_')[0]
        if s in window_dict:
            name = s + '_Atom.csv'
            time_an_dict[s] = analyze_time(name, window_dict[s])
            new = '{}_A_pos.csv'.format(s)
            ions = list(time_an_dict[s])
            plt.yticks(np.arange(-50, 50, 5))
            plt.xlabel('Frame')
            plt.ylabel('Z axis position(Ang)')
            plt.title([s])
            for i in ions:
                x_value = []
                y_value = []
                time_frame = len(time_an_dict[s][i]) + 1
                for frame in range(1, time_frame):
                    frame = str(frame)
                    x_value.append(int(frame))
                    y_value.append(float(time_an_dict[s][i][frame]))
                plt.plot(x_value, y_value, label=[i])
            plt.xticks(np.arange(1, 11, 1))
            plt.legend()
            plt.savefig('{}_Positions.png'.format(s))
            plt.clf()
            os.remove("{}_A_pos.csv".format(s))
From there, in combination with the other parts of the code, it produces one such graph per input, for as many '.dcd' files as there are.
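As a side note, the same kind of figure can also be drawn straight from the nested dictionary, without the intermediate CSV. A minimal, self-contained sketch of that pattern (the data here is made up, shaped like the dictionary above):

import matplotlib.pyplot as plt

# Made-up data with the same shape: {title: {line_label: {frame: position}}}
time_an_dict = {
    '140.10': {
        '46': {'1': '-49.50918', '2': '-50.223637', '3': '49.824406'},
        '28': {'1': '-48.1', '2': '-47.9', '3': '48.2'},
    }
}

for title, lines in time_an_dict.items():
    plt.figure()
    plt.title(title)
    plt.xlabel('Frame')
    plt.ylabel('Z axis position (Ang)')
    for label, frames in lines.items():
        x = [int(frame) for frame in sorted(frames, key=int)]   # frames as ints, in order
        y = [float(frames[str(frame)]) for frame in x]          # positions as floats
        plt.plot(x, y, label=label)
    plt.legend()
    plt.savefig('{}_Positions.png'.format(title))
    plt.clf()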

What's the underlying implementation for most_common method of Counter?

I found a .pyi file which has the following def:
def most_common(self, n: Optional[int] = ...) -> List[Tuple[_T, int]]: ...
How can this work? List is not defined, and there is no implementation?
Just to highlight some valuable suggestions here for followers:
List is imported from the typing module; it's not the same thing as list. The .pyi file doesn't need to import it because stub files are never executed; they just have to be syntactically valid Python.
If you use from __future__ import annotations, you won't have to import typing to use List et al. in function annotations in .py files either, since function annotations will be treated as string literals. (PEP 563 proposed making this the default behavior in a future release; see the PEP for details.)
You are looking at the pyi file which is used solely for annotations. It is never executed by the Python interpreter. You can learn more about pyi files by reading PEP484.
Using a debugger, put a breakpoint on the line where you call most_common and then step into the method.
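To illustrate the typing points above, a small example in an ordinary .py file (standard library only):

from __future__ import annotations  # annotations become strings, not evaluated at runtime

from collections import Counter


# With the future import, 'list[tuple[str, int]]' is fine as an annotation even on
# 3.7/3.8 interpreters, because it is never evaluated.
def top_two(text: str) -> list[tuple[str, int]]:
    return Counter(text).most_common(2)


print(top_two('abracadabra'))  # [('a', 5), ('b', 2)]  (ties keep first-encountered order)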
Python 3.7 implementation.
...\Lib\collections\__init__.py:
def most_common(self, n=None):
    '''List the n most common elements and their counts from the most
    common to the least. If n is None, then list all element counts.

    >>> Counter('abcdeabcdabcaba').most_common(3)
    [('a', 5), ('b', 4), ('c', 3)]

    '''
    # Emulate Bag.sortedByCount from Smalltalk
    if n is None:
        return sorted(self.items(), key=_itemgetter(1), reverse=True)
    return _heapq.nlargest(n, self.items(), key=_itemgetter(1))
_heapq.nlargest (in ...\Lib\heapq.py) implementation:
def nlargest(n, iterable, key=None):
    """Find the n largest elements in a dataset.

    Equivalent to: sorted(iterable, key=key, reverse=True)[:n]
    """

    # Short-cut for n==1 is to use max()
    if n == 1:
        it = iter(iterable)
        sentinel = object()
        if key is None:
            result = max(it, default=sentinel)
        else:
            result = max(it, default=sentinel, key=key)
        return [] if result is sentinel else [result]

    # When n>=size, it's faster to use sorted()
    try:
        size = len(iterable)
    except (TypeError, AttributeError):
        pass
    else:
        if n >= size:
            return sorted(iterable, key=key, reverse=True)[:n]

    # When key is none, use simpler decoration
    if key is None:
        it = iter(iterable)
        result = [(elem, i) for i, elem in zip(range(0, -n, -1), it)]
        if not result:
            return result
        heapify(result)
        top = result[0][0]
        order = -n
        _heapreplace = heapreplace
        for elem in it:
            if top < elem:
                _heapreplace(result, (elem, order))
                top, _order = result[0]
                order -= 1
        result.sort(reverse=True)
        return [elem for (elem, order) in result]

    # General case, slowest method
    it = iter(iterable)
    result = [(key(elem), i, elem) for i, elem in zip(range(0, -n, -1), it)]
    if not result:
        return result
    heapify(result)
    top = result[0][0]
    order = -n
    _heapreplace = heapreplace
    for elem in it:
        k = key(elem)
        if top < k:
            _heapreplace(result, (k, order, elem))
            top, _order, _elem = result[0]
            order -= 1
    result.sort(reverse=True)
    return [elem for (k, order, elem) in result]
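In other words, most_common(n) is essentially heapq.nlargest over the (element, count) pairs, keyed on the count. A quick demonstration of the equivalence (standard library only):

import heapq
from collections import Counter
from operator import itemgetter

c = Counter('abcdeabcdabcaba')
print(c.most_common(3))                                 # [('a', 5), ('b', 4), ('c', 3)]
print(heapq.nlargest(3, c.items(), key=itemgetter(1)))  # same result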

How to get Tensorflow Served model to pull from passed in input and not local batch file?

I am currently trying to get a seq2seq model working with TF Serving. I thought I had it working correctly, however it seems I was mistaken. I originally trained the model via local text-file input, read in as batches. Now I want to pass in a sentence and have it return the summarization to me.
I have been successful in getting the model saved and served, and I am now able to view the prediction on my front-end page; however, the result is still pulled from my local text file and not from my passed-in query-param sentence.
My input is currently one sentence sent as a query param, but the result actually displayed still comes from my text file, even though I mapped batch_x to the value of my arg[1], which I have verified is the correct expected input.
Does anyone see what I am doing wrong? Clearly I have misunderstood the process I was supposed to take.
Now an important note to make here is that if I modify the value of the argument passed in and call the python file directly, I get the correct results. However when I make the same call to the frozen model being served, I always get the same prediction response regardless of what is sent in.
This is how I am freezing the model (notice the mapping of inputs_dict.X to batch_x... I believe the issue is something I am doing incorrectly here):
pickle_fn = 'args.pickle'
folder = os.path.dirname(os.path.abspath(__file__)) + '/pickle'
pickle_filepath = os.path.join(folder, pickle_fn)
with open(pickle_filepath, "rb") as f:
    args = pickle.load(f)

print("Loading dictionary...")
word_dict, reversed_dict, article_max_len, summary_max_len = build_dict("valid", args.toy)
print("Loading validation dataset...")
# The below call will pull from the arg passed when "serve" is used
valid_x, valid_y = build_dataset("serve", word_dict, article_max_len, summary_max_len, args.toy)
valid_x_len = list(map(lambda x: len([y for y in x if y != 0]), valid_x))

with tf.Session() as sess:
    print("Loading saved model...")
    model = Model(reversed_dict, article_max_len, summary_max_len, args, forward_only=True)
    saver = tf.train.Saver(tf.global_variables())
    ckpt = tf.train.get_checkpoint_state("./saved_model/")
    saver.restore(sess, ckpt.model_checkpoint_path)
    batches = batch_iter(valid_x, valid_y, args.batch_size, 1)
    # print(valid_x, file=open("art_working_inp.txt", "a"))

    print("Writing summaries to 'result.txt'...")
    for batch_x, batch_y in batches:
        batch_x_len = list(map(lambda x: len([y for y in x if y != 0]), batch_x))

        valid_feed_dict = {
            model.batch_size: len(batch_x),
            model.X: batch_x,
            model.X_len: batch_x_len,
        }

        prediction = sess.run(model.prediction, feed_dict=valid_feed_dict)
        prediction_output = list(map(lambda x: [reversed_dict[y] for y in x], prediction[:, 0, :]))

        # Save out our model
        cwd = os.getcwd()
        path = os.path.join(cwd, 'simple')

        inputs_dict = {
            "X": tf.convert_to_tensor(batch_x)
        }
        outputs_dict = {
            "prediction": tf.convert_to_tensor(prediction_output)
        }

        tf.saved_model.simple_save(
            sess, path, inputs_dict, outputs_dict
        )
        print('Model Saved')
        # End save model code

        # Save results to file
        with open("result.txt", "a") as f:
            for line in prediction_output:
                summary = list()
                for word in line:
                    if word == "</s>":
                        break
                    if word not in summary:
                        summary.append(word)
                print(" ".join(summary), file=f)

    print('Summaries are saved to "result.txt"...')
Then my call to the server for inference is here. Regardless of what I put into data, it will always spit out the same prediction which is the one I originally passed in when exporting the model.
def do_inference(hostport):
    """Tests PredictionService with concurrent requests.

    Args:
        hostport: Host:port address of the PredictionService.

    Returns:
        pred values, ground truth labels, processing time
    """
    # connect to server
    host, port = hostport.split(':')
    channel = grpc.insecure_channel(hostport)
    stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)

    # prepare request object
    request = predict_pb2.PredictRequest()
    request.model_spec.name = 'saved_model'

    # Get the input data from our arg
    jsn_inp = sys.argv[1]
    data = json.loads(jsn_inp)['tokenized']
    data = np.array(data)
    request.inputs['X'].CopyFrom(
        tf.contrib.util.make_tensor_proto(data, shape=data.shape, dtype=tf.int64))
    # print(request)

    result = stub.Predict(request, 10.0)  # 10 seconds
    return result
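For reference, the PredictResponse that comes back can be unpacked into a regular numpy array. A small sketch of the calling side (the host:port value is an example, and the 'prediction' key is the one used when exporting above):

import tensorflow as tf

result = do_inference('localhost:9000')  # example host:port of the running TF Serving instance
# result.outputs['prediction'] is a TensorProto; convert it back to a numpy array.
prediction = tf.make_ndarray(result.outputs['prediction'])
print(prediction.shape)
print(prediction)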
Should this be of any use, this is how it is building the dataset. I modified the build_dataset function so it uses just the arg passed in, but this didn't resolve the problem either. I thought perhaps something similar to JavaScript closures was occurring, so I thought I would pull the data in this way.
def build_dataset(step, word_dict, article_max_len, summary_max_len, toy=False):
    if step == "train":
        article_list = get_text_list(train_article_path, toy)
        title_list = get_text_list(train_title_path, toy)
    elif step == "valid":
        article_list = get_text_list(valid_article_path, toy)
        title_list = get_text_list(valid_title_path, toy)
    elif step == "serve":
        arg_to_use = sys.argv[1] if ("tokenized" in sys.argv[1]) else sys.argv[2]
        article_list = [json.loads(arg_to_use)["tokenized"]]
    else:
        raise NotImplementedError

    if step != "serve":
        x = list(map(lambda d: word_tokenize(d), article_list))
        x = list(map(lambda d: list(map(lambda w: word_dict.get(w, word_dict["<unk>"]), d)), x))
        x = list(map(lambda d: d[:article_max_len], x))
        x = list(map(lambda d: d + (article_max_len - len(d)) * [word_dict["<padding>"]], x))
        print(x, file=open("input_values.txt", "a"))
        y = list(map(lambda d: word_tokenize(d), title_list))
        y = list(map(lambda d: list(map(lambda w: word_dict.get(w, word_dict["<unk>"]), d)), y))
        y = list(map(lambda d: d[:(summary_max_len - 1)], y))
    else:
        x = article_list
        # x = list(map(lambda d: word_tokenize(d), article_list))
        # x = list(map(lambda d: list(map(lambda w: word_dict.get(w, word_dict["<unk>"]), d)), x))
        x = list(map(lambda d: d[:article_max_len], x))
        x = list(map(lambda d: d + (article_max_len - len(d)) * [word_dict["<padding>"]], x))
        y = list()

    return x, y
SignatureDef info (One thing that has me a bit concerned is the Const below...but not sure that is anything...going to be looking at that right now):
signature_def['serving_default']:
  The given SavedModel SignatureDef contains the following input(s):
    inputs['X'] tensor_info:
        dtype: DT_INT64
        shape: (1, 50)
        name: Const:0
  The given SavedModel SignatureDef contains the following output(s):
    outputs['prediction'] tensor_info:
        dtype: DT_STRING
        shape: (1, 11)
        name: Const_1:0
  Method name is: tensorflow/serving/predict
Ok....so it seems the const issue was my problem or rather directed me to finding what the real issue was. The real source to my problem was that I was passing into tf.convert_to_tensor my values rather than the tf.placeholders themselves. Therefore, by modifying the logic to the below entries when saving out the model, I was able to get the proper response when sending my inputs in. As you can see I also had to feed in the other original batch_size and x_len as well. Hope others find this helpful.
inputs_dict = {
    "batch_size": tf.convert_to_tensor(model.batch_size),
    "X": tf.convert_to_tensor(model.X),
    "X_len": tf.convert_to_tensor(model.X_len),
}
outputs_dict = {
    "prediction": tf.convert_to_tensor(model.prediction)
}
This yielded a much better looking SignatureDef:
signature_def['serving_default']:
  The given SavedModel SignatureDef contains the following input(s):
    inputs['X'] tensor_info:
        dtype: DT_INT32
        shape: (-1, 50)
        name: Placeholder:0
  The given SavedModel SignatureDef contains the following output(s):
    outputs['prediction'] tensor_info:
        dtype: DT_INT32
        shape: (-1, 10, -1)
        name: decoder/decoder/transpose_1:0
  Method name is: tensorflow/serving/predict
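To make the difference concrete, a stripped-down sketch (TF 1.x style, toy graph, names hypothetical): passing a concrete value into the inputs dict bakes a Const node into the exported graph, while passing the placeholder itself keeps the input feedable at serving time.

import tensorflow as tf

x = tf.placeholder(tf.int32, shape=(None, 50), name="X")  # feedable input
y = x * 2                                                 # stand-in for the real model graph

with tf.Session() as sess:
    # Exporting {"X": tf.convert_to_tensor(some_batch)} would freeze that batch as a
    # Const (shape (1, 50), name Const:0), which is what the first SignatureDef showed.
    # Exporting the placeholder keeps shape (-1, 50) and name Placeholder:0 instead.
    tf.saved_model.simple_save(sess, "export_placeholder", {"X": x}, {"out": y})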

upgrade Django to 1.11 - formset changes

I have a problem with my app.
After upgrading Django to the latest version, 1.11.4, I am getting the same error for three of my tests:
AssertionError: False is not true : The formset 'formset' in context 0 does not contain the non-form error 'Musi być podana co najmniej jedna stawka' (actual errors: ['Proszę wysłać 1 lub więcej formularzy.'])
I know the errors are in Polish (the expected message means "At least one rate must be provided"; the actual one is Django's own "Please submit 1 or more forms."), but the fact remains: it was working with Django 1.9 and Django 1.10, and it is not with Django 1.11.
Could somebody tell me what was changed in Django 1.11 with formsets?
I have read a lot about it and I have tried almost everything, but the old version is not working.
Maybe this formset doesn't see my error message and I have to raise errors in another way after upgrading?
class BaseTaxRateInlineFormSet(UniqueFieldsFormSetMixin,
                               NotEmptyInlineFormSetMixin,
                               BaseInlineFormSet):
    _unique_fields = ('valid_from',)
    msg_at_least_one_required = __('Musi być podana co najmniej jedna stawka')

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.queryset = self.queryset.order_by('valid_from')
superclasses:
class NotEmptyInlineFormSetMixin(object):
    """
    Prevents deleting the last non-empty row (when the others are empty)
    """
    msg_at_least_one_required = __(
        'Co najmniej jeden wiersz musi być wypełniony'
    )
    code_at_least_one_required = 'at_least_one_required'

    def _form_is_empty(self, form):
        return not len(list(filter(None, form.cleaned_data.values())))

    def clean(self):
        super().clean()
        if any(self.errors):
            return
        forms_to_delete = 0
        forms_empty = 0
        for form in self.forms:
            if self._should_delete_form(form):
                forms_to_delete += 1
            elif self._form_is_empty(form):
                forms_empty += 1
        if forms_to_delete + forms_empty == self.total_form_count():
            raise ValidationError(
                self.msg_at_least_one_required,
                code=self.code_at_least_one_required
            )
class UniqueFieldsFormSetMixin(object):
    """
    Checks if fields (_unique_fields) have unique values in all forms
    """
    _unique_fields = []
    msg_field_not_unique = __('Wartość nie może się powtarzać')
    code_field_not_unique = 'field_not_unique'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if 'unique_fields' in kwargs:
            self._unique_fields = kwargs.pop('unique_fields', [])

    def clean(self):
        super().clean()
        if self._unique_fields:
            values = {field: set() for field in self._unique_fields}
            for form in self.forms:
                for field in self._unique_fields:
                    val = form.cleaned_data.get(field)
                    if val:
                        if val in values[field]:
                            form.add_error(
                                field,
                                ValidationError(
                                    self.msg_field_not_unique,
                                    code=self.code_field_not_unique
                                )
                            )
                        values[field].add(val)
and the test:
def test_update_removing_last_row(self):
    data = {
        # One row for existing rates and one empty.
        self.FORMSET_PREFIX + '-TOTAL_FORMS': 2,
        self.FORMSET_PREFIX + '-INITIAL_FORMS': 1,
        self.FORMSET_PREFIX + '-MAX_NUM_FORMS': 100
    }
    # First row exists
    data = self._load_formset_data(
        data, dict(
            self.rate_data_1, DELETE='on',
            **{'id': self.rate_pk_1, self.related_field: self.instance_pk}
        ), name=self.instance_name
    )
    self.client.force_login(self.superuser)
    response = self.client.post(self.url_update, data=data)
    # Can't remove last row
    self.assertEqual(response.status_code, 200)
    self.assertFormsetError(
        response, 'formset', None, None,
        self.base_formset_class.msg_at_least_one_required
    )
The problem is with these two lines in Django's formset code:
if not form.has_changed() and i >= self.initial_form_count():
    empty_forms_count += 1
In this situation Django knows which of my forms are empty and unchanged.
How should I change my tests, or maybe the formset implementation, so that they no longer fail on my testing machine?
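It looks like this new empty-forms counting runs as part of the formset's own minimum-forms check before clean() is called, so when the only remaining row is marked for deletion and the extra form is empty, Django 1.11 raises its built-in "Please submit 1 or more forms." non-form error and the custom clean() never gets to add msg_at_least_one_required. If that reading is right, one low-effort option is to have the test accept Django's own message instead. A sketch of the changed assertion only (the message id is the one Django uses for this check, pluralized via ungettext so the active translation, Polish here, is compared):

from django.utils.translation import ungettext

# In test_update_removing_last_row, replace the final assertFormsetError with:
self.assertFormsetError(
    response, 'formset', None, None,
    ungettext('Please submit %d or more forms.',
              'Please submit %d or more forms.', 1) % 1,
)

Keeping the custom message instead would mean raising it before Django's own minimum-forms validation fires (for example by overriding full_clean), which is a more invasive change.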
