ValueError: Unknown protobuf attr type <class 'dict'> when trying to put a nested dict/entity - python-3.x

I tried to put a nested entity/dict into Datastore using Python:
metadata_row = dict()
metadata_row['batch_id'] = str(uuid1())
metadata_row['table_ok'] = True
metadata_row['table_name'] = 'metadata'
metadata_row['num_rows'] = 1
metadata_row['violations'] = []
metadata_row['errors'] = []
metadata_row['time'] = {}
metadata_row['time']['total_time'] = 82.42656564712524
metadata_row['time']['mod1'] = 5.940682411193848
metadata_row['time']['mod2'] = 19.16786551475525
metadata_row['time']['mod3'] = 31.617812633514404
metadata_row['time']['mod4'] = 0.00038933753967285156
metadata_row['time']['mod5'] = 53.35780310630798
with self.client.transaction():
    entities = [Entity(self.client.key('metadata')) for i in range(len([metadata_row]))]
    for entity, update_dict in zip(entities, [metadata_row]):
        entity.update(update_dict)
    self.client.put_multi(entities)
I tested it using the Datastore emulator, but I got the following error:
ValueError: Unknown protobuf attr type <class 'dict'>
I am wondering how to fix the issue. I am also wondering whether Datastore natively supports nested dictionaries, so that one doesn't have to create an entity for the inner dictionary, i.e. time in this case.
UPDATE: I added an inner entity in metadata_row for the key time to solve the problem.
client = datastore.Client()
metadata_row['time'] = datastore.Entity(key=client.key('time'))
metadata_row['time']['total_time'] = 82.42656564712524
metadata_row['time']['mod1'] = 5.940682411193848
metadata_row['time']['mod2'] = 19.16786551475525
metadata_row['time']['mod3'] = 31.617812633514404
metadata_row['time']['mod4'] = 0.00038933753967285156
metadata_row['time']['mod5'] = 53.35780310630798
# code for put_multi()

def convert(obj, client, key):
    """
    :param obj: dict
    :param client: datastore client
    :param key: Generated as client.key("your_table", "your_key")
    :return: datastore.Entity
    """
    if isinstance(obj, list):
        return [convert(item, client, None) for item in obj]
    elif isinstance(obj, dict):
        entity = datastore.Entity(key)
        for k in obj:  # renamed so it doesn't shadow the key parameter
            entity[k] = convert(obj[k], client, None)
        return entity
    else:
        return obj
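For reference, here is a minimal usage sketch of the convert helper above, assuming a Datastore client and the 'metadata' kind from the question:
from google.cloud import datastore

client = datastore.Client()

# Convert the nested dict (including the inner 'time' dict) into an Entity tree
entity = convert(metadata_row, client, client.key('metadata'))

# Write the converted entity; put_multi() accepts a list of such entities
client.put(entity)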
Please do suggest any improvements or alternatives, as the Google Cloud Datastore documentation currently doesn't suggest a way to convert JSON to an Entity.

Related

How to bulk create or update in Django

I have to process an item report CSV file every hour. The CSV contains 150k+ records for one account, and there are multiple accounts in my system. I was previously working in Rails, where the Active Record gem handled this use case very efficiently. I am looking for an alternative to that gem in Django, or any built-in method that will help import such large data in bulk.
So far I have tried this code.
class ItemReportService:

    def call(self, file_url):
        with open(file_url, 'r') as file:
            reader = csv.DictReader(file)
            products = []
            for row in reader:
                product = self.process_product(row)
                products.append(product)
            self.update_products(products)

    def process_product(self, row):
        print(f'Processing sku: {row["SKU"]}')
        product = Product.objects.filter(
            sku=row['SKU']).first() or Product(sku=row['SKU'])
        product.listing_title = row['Product Name']
        product.listed_price = row['Price']
        product.buy_box_price = row['Buy Box Item Price'] + \
            row['Buy Box Shipping Price']
        product.status = row['Lifecycle Status']
        return product

    def update_products(self, products):
        Product.objects.bulk_update(
            products,
            [
                'listing_title',
                'listed_price',
                'buy_box_price',
                'status'
            ]
        )
It is raising this exception because, when there is a new product, it doesn't have a primary key assigned to it:
ValueError: All bulk_update() objects must have a primary key set.
Django 4.1 added new parameters to bulk_create (update_conflicts=bool and update_fields=[]).
If your model has a UNIQUE field, bulk_create would normally raise an error (or skip the row with ignore_conflicts=True) when that constraint is violated. But if you set the update_conflicts parameter to True, the conflicting rows are updated instead, using the fields listed in update_fields, as shown below.
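A minimal sketch of that approach, assuming the Product model from the question declares sku as unique (field names are taken from the question; unique_fields is required on PostgreSQL and SQLite):
# Upsert in one query on Django >= 4.1: insert new SKUs, update existing ones
products = [
    Product(
        sku=row['SKU'],
        listing_title=row['Product Name'],
        listed_price=row['Price'],
        status=row['Lifecycle Status'],
    )
    for row in reader
]
Product.objects.bulk_create(
    products,
    update_conflicts=True,
    unique_fields=['sku'],  # the constraint that identifies an existing row
    update_fields=['listing_title', 'listed_price', 'status'],  # columns refreshed on conflict
    batch_size=1000,
)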
You are not saving the product in the database before applying bulk_update.
I have checked your code; for this purpose you can use bulk_create with an additional parameter:
Model.objects.bulk_create(self.data, ignore_conflicts=True)
or
columns = ['column1', 'column2']
items_to_be_inserted = []
# for each CSV row:
obj = Model.objects.filter(column1="sku").first()
if not obj:
    obj = Model.objects.create(column1="sku")
obj.column1 = row["column1"] or obj.column1
obj.column2 = row["column2"] or obj.column2
items_to_be_inserted.append(obj)
In the end, you can do bulk update like
Model.objects.bulk_update(items_to_be_inserted, columns)
This will solve your problem.
I made this classmethod, which can be used on any Django model in a project.
from django.db import models


class BaseModel(models.Model):

    @classmethod
    def bulk_create_or_update(
        cls, uniques: list[str],
        defaults: list[str],
        data: list[dict]
    ):
        # Get existing object list
        data_dict, select = {}, None
        for entry in data:
            sub_entry, key = {}, ''
            for uniq in uniques:
                sub_entry[uniq] = entry[uniq]
                key += str(entry[uniq])
            data_dict[key] = entry
            if not select:
                select = models.Q(**sub_entry)
                continue
            select |= models.Q(**sub_entry)
        records = cls.objects.filter(select).values('pk', *uniques)
        existing = {}
        for rec in records:
            key = ''
            for uniq in uniques:
                key += str(rec[uniq])
            existing[key] = rec
        # Split new objects from existing ones
        to_create, to_update = [], []
        for key, entry in data_dict.items():
            obj = cls(**entry)
            if key not in existing:
                to_create.append(obj)
                continue
            obj.pk = existing[key]['pk']
            to_update.append(obj)
        cls.objects.bulk_create(to_create, batch_size=1000)
        cls.objects.bulk_update(to_update, defaults, batch_size=1000)
Let's take a usage example:
class Product(BaseModel):
    price = models.IntegerField()
    name = models.CharField(max_length=128, unique=True)
    status = models.CharField(max_length=128)


if __name__ == '__main__':
    data = [
        {'price': 50, 'name': 'p1', 'status': 'New'},
        {'price': 33, 'name': 'p2', 'status': 'Old'}
    ]
    Product.bulk_create_or_update(uniques=['name'], defaults=['price', 'status'], data=data)
Any suggestions for improving the code are welcome.

AllenNLP DatasetReader.read returns generator instead of AllennlpDataset

While studying the AllenNLP framework (version 2.0.1), I tried to implement the example code from https://guide.allennlp.org/training-and-prediction#1.
While reading the data from a Parquet file I got:
TypeError: unsupported operand type(s) for +: 'generator' and 'generator'
for the next line:
vocab = build_vocab(train_data + dev_data)
I suspect the return value should be AllennlpDataset but maybe I got it mixed up.
What did I do wrong?
Full code:
import pandas as pd
from typing import Dict, Iterable, Tuple

from allennlp.data import DatasetReader, Instance, Vocabulary
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import Tokenizer, WhitespaceTokenizer

train_path = <some_path>
test_path = <some_other_path>


class ClassificationJobReader(DatasetReader):
    def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_tokens: int = None):
        super().__init__(lazy)
        self.tokenizer = tokenizer or WhitespaceTokenizer()
        self.token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
        self.max_tokens = max_tokens

    def _read(self, file_path: str) -> Iterable[Instance]:
        df = pd.read_parquet(file_path)
        for idx in df.index:
            text = df['title'][idx] + ' ' + df['description'][idx]
            print(f'text : {text}')
            label = df['class_id'][idx]
            print(f'label : {label}')
            tokens = self.tokenizer.tokenize(text)
            if self.max_tokens:
                tokens = tokens[:self.max_tokens]
            text_field = TextField(tokens, self.token_indexers)
            label_field = LabelField(label)
            fields = {'text': text_field, 'label': label_field}
            yield Instance(fields)


def build_dataset_reader() -> DatasetReader:
    return ClassificationJobReader()


def read_data(reader: DatasetReader) -> Tuple[Iterable[Instance], Iterable[Instance]]:
    print("Reading data")
    training_data = reader.read(train_path)
    validation_data = reader.read(test_path)
    return training_data, validation_data


def build_vocab(instances: Iterable[Instance]) -> Vocabulary:
    print("Building the vocabulary")
    return Vocabulary.from_instances(instances)


dataset_reader = build_dataset_reader()
train_data, dev_data = read_data(dataset_reader)
vocab = build_vocab(train_data + dev_data)
Thanks for your help
Please find below the code fix first and the explanation afterwards.
Code Fix
# the extend_from_instances expands your vocabulary with the instances passed as an arg
# and is therefore equivalent to Vocabulary.from_instances(train_data + dev_data)
# previously
vocabulary.extend_from_instances(train_data)
vocabulary.extend_from_instances(dev_data)
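For context, a rough sketch of how the question's build_vocab could be rewritten around this fix (assuming the empty Vocabulary() constructor and extend_from_instances behave as described in the explanation below):
def build_vocab(train_instances, dev_instances) -> Vocabulary:
    print("Building the vocabulary")
    vocabulary = Vocabulary()
    vocabulary.extend_from_instances(train_instances)
    vocabulary.extend_from_instances(dev_instances)
    return vocabulary

vocab = build_vocab(train_data, dev_data)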
Explanation
This is because the AllenNLP API has had a couple of breaking changes in allennlp==2.0.1. You can find the changelog here and the upgrade guide here. The guide is outdated as per my understanding (it reflects allennlp<=1.4).
The DatasetReader now returns a generator, as opposed to a List previously. DatasetReader used to have a parameter called "lazy" for lazy loading of data. It was False by default, so dataset_reader.read would previously return a List. However, as of v2.0 (if I remember correctly), lazy loading is applied by default, and it therefore returns a generator by default. As you know, the "+" operator is not defined for generator objects, so you cannot simply add two generators.
So you can simply use vocab.extend_from_instances to achieve the same behavior as before. Hope this helped you. If you need a full code snippet, please leave a comment below; I could post a related gist and share it with you.
Good day!

Simple prediction from frozen .pb saved model

I have been trying for days to use a TF-exported .pb model file for prediction. The model was generated with the BestExporter function as follows:
features_specs = tf.feature_column.make_parse_example_spec(serving_features)
serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
    feature_spec=features_specs, default_batch_size=None)
exporter[n] = tf.estimator.BestExporter(
    name="best_exporter", serving_input_receiver_fn=serving_input_receiver_fn,
    event_file_pattern='eval/*.tfevents.*', exports_to_keep=1)

if train_params["use_early_stop"] == True:
    hookModel[n] = tf.estimator.experimental.stop_if_no_decrease_hook(
        model[n], metric_name='average_loss',
        max_steps_without_decrease=train_params["early_stop_max_steps_without_decrease"],
        min_steps=train_params["early_stop_min_steps"],
        run_every_secs=train_params["early_stop_run_every_secs"],
        run_every_steps=train_params["early_stop_run_every_steps"])
else:
    hookModel[n] = None

train_spec[n] = tf.estimator.TrainSpec(input_fn=input_fn_["train"+m], hooks=[hookModel[n]])
eval_spec[n] = tf.estimator.EvalSpec(
    input_fn=input_fn_["test"+m],
    start_delay_secs=train_params["eval_specs_start_delay_secs"],
    throttle_secs=train_params["eval_specs_throttle_secs"],
    exporters=[exporter[n]])

tf.estimator.train_and_evaluate(model[n], train_spec[n], eval_spec[n])
I think this is how the input dict names are referenced...
I successfully load the model with:
model_[model_stage+"_"+model_type] = tf.saved_model.load(model_path)
but I don't know how to correctly pass my features dictionary to the model_XX['prediction'](example) wrapped function.
I saw this topic, but it didn't help: TensorFlow v2: Replacement for tf.contrib.predictor.from_saved_model
There's no equivalent of the old tf.contrib.predictor.from_saved_model I used before...
Thanks for your help.
I found the solution for passing a dict to the wrapped model. This is a slightly modified synthesis of these given solutions, with modifications for TF 2.4 / Python 3.7:
TensorFlow v2: Replacement for tf.contrib.predictor.from_saved_model
https://www.programcreek.com/python/example/90440/tensorflow.Example
The second is particularly complete and shows a lot of cases.
So:
my_dict = {"feature_1": str(something), "feature_2": int(an_int), "feature_3": float(a_float), ...}
# Load the model
my_model = tf.saved_model.load(model_path)

# Creates a serialized example from dict
def create_serialized_example(name_to_values):
    example = tf.train.Example()
    for name, values in name_to_values.items():
        feature = example.features.feature[name]
        if isinstance(values, str):
            values = values.encode()  # Modified because in new tf versions strings have to be encoded
            add = feature.bytes_list.value.extend
        elif isinstance(values, float):
            add = feature.float_list.value.extend  # Modified: float_list instead of float_32 in TF 2
        elif isinstance(values, int):
            add = feature.int64_list.value.extend
        else:
            raise AssertionError('Unsupported type: %s' % type(values[0]))
        add([values])  # Modified: has to be a list, not a bare value
    return example.SerializeToString()
# Predict function
pred = my_model.signatures["predict"](examples=tf.constant([create_serialized_example(my_dict)]))
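As a quick sanity check, the call returns a dict of output tensors whose key names depend on the exported estimator head, so it can help to print them before wiring anything up (a small sketch; no specific output key is assumed):
# List the available signatures and the outputs of the "predict" one
print(list(my_model.signatures.keys()))
print(my_model.signatures["predict"].structured_outputs)
print(pred)  # access a value with pred["<output_key>"].numpy()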

TypeError: only() takes 2 positional arguments but 3 were given

Here I am trying to get only two fields from Cassandra using only(), but when I pass the field names it gives me the above error. I have also tried passing self, but that didn't work. After getting those two fields I need to convert them into two arrays so that I can plot a graph with names and marks.
Here is the code:
from flask import *
from flask_cqlalchemy import CQLAlchemy

app = Flask(__name__)
app.config['CASSANDRA_HOSTS'] = ['127.0.0.1']
app.config['CASSANDRA_KEYSPACE'] = "emp"
db = CQLAlchemy(app)


class Student(db.Model):
    uid = db.columns.Integer(primary_key=True)
    marks = db.columns.Integer(primary_key=True)
    username = db.columns.Text(required=True)
    password = db.columns.Text()


@app.route('/meriting')
def show_meritlist():
    ob = Student.objects().filter().only(Student.marks, Student.username)
    ob = ob.filter(Student.marks >= 65).allow_filtering()
    return render_template('merit.html', ml=ob)


db.sync_db()

if __name__ == '__main__':
    app.run(debug=True)
only() takes only one parameter that should be an iterable. Try:
ob = Student.objects().filter().only((Student.marks, Student.username))
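If passing the column objects still doesn't work, note that depending on your cqlengine version only() may expect the column names as strings instead; a variant of the same query under that assumption (using the Student model above):
ob = Student.objects().filter().only(['marks', 'username'])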

Dictionary with functions versus dictionary with class

I'm creating a game where I have the data imported from a database, but I have a little problem...
Currently I get a copy of the data as a dictionary, which I need to pass as an argument to my GUI; however, I also need to process some of the data, as in this example:
I get the data as a dict (I've created the UseDatabase context manager and it is working):
def get_user(name: str, passwd: str):
    user = {}
    user['name'] = name
    user['passwd'] = passwd
    with UseDatabase() as cursor:
        _SQL = "SELECT id, cash, ruby FROM user WHERE name='Admin' AND password='adminpass'"
        cursor.execute(_SQL)
        res = cursor.fetchall()
        if res:
            user['id'] = res[0][0]
            user['cash'] = res[0][1]
            user['ruby'] = res[0][2]
            return user
        return res
.
.
.
def get_activities():
    with UseDatabase() as cursor:
        _SQL = "SELECT * FROM activities WHERE user_id='2'"
        cursor.execute(_SQL)
        res = cursor.fetchall()
        if res:
            ids = [i[0] for i in res]
            activities = {}
            for i in res:
                activities[i[0]] = {'title': i[1], 'unlock': i[2], 'usr_progress': i[3]}
            return (ids, activities)
        return res
I need it as a dict in my GUI (the "content" argument):
class SideBar:
    def __init__(self, screen: 'pygame.display.set_mode()', box_width: int, box_height: int, content: dict, font: 'font = pygame.font.Font()'):
        # content dict: {id: {'title':'','unlock':'','usr_progress':''},...}
        self.box_width = box_width
        self.box_height = box_height
        self.box_per_screen = screen.get_height() // box_height
        self.content = content
        self.current_box = 1
        self.screen = screen
        self.font = font
        self.generate_bar()

    def generate_bar(self):
        active = [i for i in self.content.keys() if i in range(self.current_box, self.current_box + self.box_per_screen)]
        for i in range(self.box_per_screen):
            gfxdraw.box(self.screen, pygame.Rect((0, i * self.box_height), (self.screen.get_width() / 3, self.screen.get_height() / 3)), (249, 0, 0, 170))
            self.screen.blit(self.font.render(str(active[i]) + ' - ' + self.content[active[i]]['title'], True, (255, 255, 255)), (10, i * self.box_height + 4))
        for i in range(self.box_per_screen):
            pygame.draw.rect(self.screen, (50, 0, 0), pygame.Rect((0, i * self.box_height), (self.screen.get_width() / 3, self.screen.get_height() / 3)), 2)
But I still need to make some changes to the data:
def unlock_act(act_id):
    if user['cash'] >= activities[act_id]['unlock'] and activities[act_id]['usr_progress'] == 0:
        user['cash'] -= activities[act_id]['unlock']
        activities[act_id]['usr_progress'] = 1
So the question is: in this situation, should I keep a copy of the data as a dict and create a class around it with the methods I need, or should I use functions to edit the data inside the dict?
