How to bulk create or update in Django - python-3.x

I have to process an item report CSV file every 1 hour. The CSV contains 150k+ records for 1 account and there are multiple accounts in my system. I was working previously on rails and there was active record gem to handle this use case very efficiently. I am looking for an alternate to this gem in Django or any built in method that will be helpful to import such large data in bulk.
So far I have tried this code.
class ItemReportService:
def call(self, file_url):
with open(file_url, 'r') as file:
reader = csv.DictReader(file)
products = []
for row in reader:
product = self.process_product(row)
products.append(product)
self.update_products(products)
def process_product(self, row):
print(f'Processing sku: {row["SKU"]}')
product = Product.objects.filter(
sku=row['SKU']).first() or Product(sku=row['SKU'])
product.listing_title = row['Product Name']
product.listed_price = row['Price']
product.buy_box_price = row['Buy Box Item Price'] + \
row['Buy Box Shipping Price']
product.status = row['Lifecycle Status']
return product
def update_products(self, products):
Product.objects.bulk_update(
products,
[
'listing_title',
'listed_price',
'buy_box_price',
'Lifecycle Status'
]
)
It is raising this exception because when there is a new product it doesn't have primary key assigned to it
ValueError: All bulk_update() objects must have a primary key set.

Django 4.1 has new parameters for bulk_create(update_conflicts=bool and update_fields=[])
If your model has a field UNIQUE usually Django would ignore it when creating new data. But if you set the update_conflicts parameter to True, the fields inside update_fields will be updated.

You are not saving the product in the database before applying bulk_update.
I have checked your code for this purpose, you can use bulk_insert with an additional parameter
Model.objects.bulk_create(self.data, ignore_conflicts=True)
or
columns = ['column1', 'column2']
obj = Model.objects.filter(column1="sku").first()
if not obj:
obj = Model.objects.create(column1="sku")
obj.column1 = row["column1"] or obj.column1
obj.column2 = row["column2"] or obj.column2
items_to_be_inserted.append(obj)
In the end, you can do bulk update like
Model.objects.bulk_update(items_to_be_inserted, columns)
This will solve your problem.

I made this class function which can be used on any Django model in a project.
from django.db import models
class BaseModel(models.Model):
#classmethod
def bulk_create_or_update(
cls, uniques: list[str],
defaults: list[str],
data: list[dict]
):
# Get existing object list
data_dict, select = {}, None
for entry in data:
sub_entry, key = {}, ''
for uniq in uniques:
sub_entry[uniq] = entry[uniq]
key += str(entry[uniq])
data_dict[key] = entry
if not select:
select = models.Q(**sub_entry)
continue
select |= models.Q(**sub_entry)
records = cls.objects.filter(select).values('pk', *uniques)
existing = {}
for rec in records:
key = ''
for uniq in uniques:
key += str(rec[uniq])
existing[key] = rec
# Split new objects from existing ones
to_create, to_update = [], []
for key, entry in data_dict.items():
obj = cls(**entry)
if key not in existing:
to_create.append(obj)
continue
obj.pk = existing[key]['pk']
to_update.append(obj)
cls.objects.bulk_create(to_create, batch_size=1000)
cls.objects.bulk_update(to_create, defaults, batch_size=1000)
Let take an usage example
class Product(BaseModel)
price = models.IntegerField()
name = models.CharField(max_length=128, unique=True)
status = models.CharField(max_length=128)
if __name__ == '__main__':
data = [
{'price': 50, 'name': 'p1', 'status': 'New'},
{'price': 33, 'name': 'p2', 'status': 'Old'}
]
Product.bulk_create_or_update(uniques=['name'], defaults=['price', 'status'], data=data)
Any improvement suggestion of the code is welcome.

Related

loop efficiency and performance impact calling api in python

Team:
My concern is on redundancy, efficient use of loops and best approach to get the desired result.
Usecase: get on call user name and create jira ticket with it.
below is my entire code and it runs fine for me. This is my very first OOP project.
Flow: I am calling two APIS (jira and pager api).
First calling pager api and getting who is oncall currently. Here am getting a list of nested dicts as response that am looping on.
Then calling jira api to create ticket with that above oncall user.
i want to learn to calculate Big0 and improve.
since this is my very first time can I get to see if there any problems or inefficiency or divergence from standard practices?
import requests
import json
import os
from jira import JIRA
from pdpyras import APISession
from collections import OrderedDict
JIRA_DICT_KEY = "JIRA"
JIRA_CONFIG = {'server': "https://jirasw.tom.com"}
JIRA_USER = os.environ['JIRA_USER']
JIRA_PW = os.environ['JIRA_PW']
PD_API_KEY = os.environ['PD_API_KEY']
USER_EMAIL = os.environ['USER_EMAIL']
class ZidFinder(object):
def __init__(self):
self.active_zid_errors = dict()
self.team_oncall_dict = dict()
self.onCall = self.duty_oncall()
self.jira = self.init_jira()
def init_jira(self):
jira = JIRA(options=JIRA_CONFIG, auth=(JIRA_USER, JIRA_PW))
return jira
def duty_oncall(self, *args):
session = APISession(PD_API_KEY, default_from=USER_EMAIL)
total = 1 #true or false
limit = 100 # this var is to pull limit records at a time.
teamnm = "Product SRE Team"
team_esp_name = "Product SRE Escalation Policy"
teamid = ""
teamesplcyid = ""
if args:
offset = args[0]
total_teams = args[1]
if offset <= total_teams:
print("\nfunc with args with new offset {} called\n".format(offset))
teams = session.get('/teams?limit={0}&total={1}&offset={2}'.format(limit,total,offset))
else:
print("Reached max teams, no more team records to pull")
return
else:
print("\nPull first set of {} teams as defined by limit var and loop more if team not found..\n".format(limit))
teams = session.get('/teams?limit={0}&total={1}'.format(limit,total))
if not teams.ok:
return
else:
tj = teams.json()
tjd = tj['teams']
print("\n")
for adict in tjd:
if not adict['name'] == teamnm:
continue
elif adict['name'] == teamnm:
teamid = adict['id']
print("Found team..\n",adict['name'], "id: {0}".format(teamid))
esclp = session.get('/escalation_policies?total={0}&team_ids%5B%5D={1}'.format(total,teamid))
if not esclp.ok:
print("Failed pulling Escalation polices for team '{}'".format(teamnm))
return
else:
ep = esclp.json()
epj = esclp.json()['escalation_policies']
if not epj:
print("Escalation polices for team '{}' not defined".format(teamnm))
return
else:
for adict in epj:
if not adict['summary'] == team_esp_name:
continue
else:
teamesplcyid = adict['id']
print("{} id: {}\n".format(team_esp_name, teamesplcyid))
oncalls = session.get('/oncalls?total={0}&escalation_policy_ids%5B%5D={1}'.format(total,teamesplcyid))
if not oncalls.ok:
print("Issue in getting oncalls")
return
else:
ocj = oncalls.json()['oncalls']
for adict in ocj:
if adict['escalation_level'] == 1 or adict['escalation_level'] == 2:
self.team_oncall_dict[adict['schedule']['summary']] = adict['user']['summary']
continue
if self.team_oncall_dict:
if len(self.team_oncall_dict) == 1:
print("\nOnly Primary onCall is defined")
print("\n",self.team_oncall_dict)
else:
print(" Primary and other calls defined")
print("\n",OrderedDict(self.team_oncall_dict),"\n")
return
else:
print("Calling with next offset as team was not found in the records pulled under limit..")
if tj['offset'] <= tj['total'] or tj['more'] == True:
setoffset = limit + tj['offset']
self.onCall(setoffset, tj['total'])
def create_jiras(self):
node = ["node1", "node2"]
zid_label = ["id90"]
labels = [node, zid_label]
print("Creating a ticket for node {} with description: {}".format(node, str(self.active_zid_errors[node])))
if self.msre_oncall_dict:
print("Current onCalls pulled from Duty, use them as assignee in creating jira tickets..")
new_issue = self.jira.create_issue(project='TEST', summary='ZID error on node {}'.format(node),
description=str(self.active_zid_errors[node]), issuetype={'name': 'Bug'}, assignee={'name': self.msre_oncall_dict['Product SRE Primary']},labels=labels)
print("Created a new ticket: ", new_issue.key, new_issue.fields.summary)
self.active_zid_errors[node][JIRA_DICT_KEY] = new_issue.key
else:
print("Current onCalls were not pulled from Duty, create jira with defautl assignee..")
new_issue = self.jira.create_issue(project='TEST', summary='ZID error on node {}'.format(node),
description=str(self.active_zid_errors[node]), issuetype={'name': 'Bug'},labels=labels)
print("Created a new ticket: ", new_issue.key, new_issue.fields.summary)
self.active_zid_errors[node][JIRA_DICT_KEY] = new_issue.key
if __name__== "__main__":
o = ZidFinder()

Peewee - update issue with records with ForeignKeyField('self')

I have a database that I am filling from a pd.DataFrame. One of the classes has a ForeignKeyField('self').
from peewee import SqliteDatabase, Model
from peewee import IntegerField, CharField, ForeignKeyField, BooleanField
import pandas as pd
db = SqliteDatabase(':memory:', pragmas=(('foreign_keys', 'on'),))
class BaseModel(Model):
class Meta:
database = db
class Team(BaseModel):
id = IntegerField(unique = True, primary_key = True)
name = CharField()
reserve_team = BooleanField()
parent_team = ForeignKeyField('self', related_name = 'reserve_teams', null = True)
class Meta:
db_table = 'team_team'
Team.create_table()
The dataframe I am filling from looks something like this
df = pd.DataFrame({'ID': [1,2,3,4,5],
'Name': ['A','A2','B','C','C2'],
'Reserve': [False, True, False, False, True],
'Parent': [None, 'A', None, None, 'C']})
I use the following code to fill the table. The parent_team is set to None and when the table is filled I intend to go back and update this field where appropriate.
data = []
for row in df.itertuples():
data.append((row.ID,
row.Name,
row.Reserve == True,
None))
fields = [Team.id,
Team.name,
Team.reserve_team,
Team.parent_team]
with db.atomic():
Team.insert_many(data, fields = fields).execute()
My problem is that I don't understand how to do this without looping over the dataframe/table combination. The documentation seems pretty clear that this should never be done.
for row in df.itertuples():
if row.Reserve:
r = row.ID
p = row.Parent
Team.update(parent_team = Team.get(Team.name == p)).where(Team.id == r).execute()
You could do a topo-sort of the data and then insert them directly with the parent IDs.
As far as looping and updating -- some ideas:
wrap in a transaction
use ValuesList() to provide the mapping of id->parent id and update all at once
insert id -> parent id into a temp table and update using the temp table (all at once)

Dictionary with functions versus dictionary with class

I'm creating a game where i have the data imported from a database, but i have a little problem...
Currently i get a copy of the data as a dictionary, which i need to pass as argument to my GUI, however i also need to process some data, like in this example:
I get the data as a dict (I've created the UseDatabase context manager and is working):
def get_user(name: str, passwd: str):
user = {}
user['name'] = name
user['passwd'] = passwd
with UseDatabase() as cursor:
_SQL = "SELECT id, cash, ruby FROM user WHERE name='Admin' AND password='adminpass'"
cursor.execute(_SQL)
res = cursor.fetchall()
if res:
user['id'] = res[0][0]
user['cash'] = res[0][1]
user['ruby'] = res[0][2]
return user
return res
.
.
.
def get_activities():
with UseDatabase() as cursor:
_SQL = "SELECT * FROM activities WHERE user_id='2'"
cursor.execute(_SQL)
res = cursor.fetchall()
if res:
ids = [i[0] for i in res]
activities = {}
for i in res:
activities[i[0]] = {'title':i[1],'unlock':i[2],'usr_progress':i[3]}
return (ids, activities)
return res
Need it as a dict in my GUI ("content" argument):
class SideBar:
def __init__(self, screen: 'pygame.display.set_mode()', box_width: int, box_height: int, content: dict, font: 'font = pygame.font.Font()'):
#content dict: {id: {'title':'','unlock':'','usr_progress':''},...}
self.box_width = box_width
self.box_height = box_height
self.box_per_screen = screen.get_height() // box_height
self.content = content
self.current_box = 1
self.screen = screen
self.font = font
self.generate_bar()
def generate_bar (self):
active = [i for i in self.content.keys() if i in range(self.current_box, self.current_box+self.box_per_screen)]
for i in range(self.box_per_screen):
gfxdraw.box(self.screen,pygame.Rect((0,i*self.box_height),(self.screen.get_width()/3,self.screen.get_height()/3)),(249,0,0,170))
self.screen.blit(self.font.render(str(active[i]) + ' - ' + self.content[active[i]]['title'], True, (255,255,255)),(10,i*self.box_height+4))
for i in range(self.box_per_screen):
pygame.draw.rect(self.screen,(50,0,0),pygame.Rect((0,i*self.box_height),(self.screen.get_width()/3,self.screen.get_height()/3)),2)
But still need to make some changes in the data:
def unlock_act(act_id):
if user['cash'] >= activities[act_id]['unlock'] and activities[act_id]['usr_progress'] == 0:
user['cash'] -= activities[act_id]['unlock']
activities[act_id]['usr_progress'] = 1
So the question is: in this situation should i keep a copy of the data as dict, and create a class with it plus the methods i need or use functions to edit the data inside the dict?

Assigning multiple values to dictionary keys from a file in Python 3

I'm fairly new to Python but I haven't found the answer to this particular problem.
I am writing a simple recommendation program and I need to have a dictionary where cuisine is a key and name of a restaurant is a value. There are a few instances where I have to split a string of a few cuisine names and make sure all other restaurants (values) which have the same cuisine get assigned to the same cuisine (key). Here's a part of a file:
Georgie Porgie
87%
$$$
Canadian, Pub Food
Queen St. Cafe
82%
$
Malaysian, Thai
Mexican Grill
85%
$$
Mexican
Deep Fried Everything
52%
$
Pub Food
so it's just the first and the last one with the same cuisine but there are more later in the file.
And here is my code:
def new(file):
file = "/.../Restaurants.txt"
d = {}
key = []
with open(file) as file:
lines = file.readlines()
for i in range(len(lines)):
if i % 5 == 0:
if "," not in lines[i + 3]:
d[lines[i + 3].strip()] = [lines[i].strip()]
else:
key += (lines[i + 3].strip().split(', '))
for j in key:
if j not in d:
d[j] = [lines[i].strip()]
else:
d[j].append(lines[i].strip())
return d
It gets all the keys and values printed but it doesn't assign two values to the same key where it should. Also, with this last 'else' statement, the second restaurant is assigned to the wrong key as a second value. This should not happen. I would appreciate any comments or help.
In the case when there is only one category you don't check if the key is in the dictionary. You should do this analogously as in the case of multiple categories and then it works fine.
I don't know why you have file as an argument when you have a file then overwritten.
Additionally you should make 'key' for each result, and not += (adding it to the existing 'key'
when you check if j is in dictionary, clean way is to check if j is in the keys (d.keys())
def new(file):
file = "/.../Restaurants.txt"
d = {}
key = []
with open(file) as file:
lines = file.readlines()
for i in range(len(lines)):
if i % 5 == 0:
if "," not in lines[i + 3]:
if lines[i + 3] not in d.keys():
d[lines[i + 3].strip()] = [lines[i].strip()]
else:
d[lines[i + 3]].append(lines[i].strip())
else:
key = (lines[i + 3].strip().split(', '))
for j in key:
if j not in d.keys():
d[j] = [lines[i].strip()]
else:
d[j].append(lines[i].strip())
return d
Normally, I find that if you use names for the dictionary keys, you may have an easier time handling them later.
In the example below, I return a series of dictionaries, one for each restaurant. I also wrap the functionality of processing the values in a method called add_value(), to keep the code more readable.
In my example, I'm using codecs to decode the value. Although not necessary, depending on the characters you are dealing with it may be useful. I'm also using itertools to read the file lines with an iterator. Again, not necessary depending on the case, but might be useful if you are dealing with really big files.
import copy, itertools, codecs
class RestaurantListParser(object):
file_name = "restaurants.txt"
base_item = {
"_type": "undefined",
"_fields": {
"name": "undefined",
"nationality": "undefined",
"rating": "undefined",
"pricing": "undefined",
}
}
def add_value(self, formatted_item, field_name, field_value):
if isinstance(field_value, basestring):
# handle encoding, strip, process the values as you need.
field_value = codecs.encode(field_value, 'utf-8').strip()
formatted_item["_fields"][field_name] = field_value
else:
print 'Error parsing field "%s", with value: %s' % (field_name, field_value)
def generator(self, file_name):
with open(file_name) as file:
while True:
lines = tuple(itertools.islice(file, 5))
if not lines: break
# Initialize our dictionary for this item
formatted_item = copy.deepcopy(self.base_item)
if "," not in lines[3]:
formatted_item['_type'] = lines[3].strip()
else:
formatted_item['_type'] = lines[3].split(',')[1].strip()
self.add_value(formatted_item, 'nationality', lines[3].split(',')[0])
self.add_value(formatted_item, 'name', lines[0])
self.add_value(formatted_item, 'rating', lines[1])
self.add_value(formatted_item, 'pricing', lines[2])
yield formatted_item
def split_by_type(self):
d = {}
for restaurant in self.generator(self.file_name):
if restaurant['_type'] not in d:
d[restaurant['_type']] = [restaurant['_fields']]
else:
d[restaurant['_type']] += [restaurant['_fields']]
return d
Then, if you run:
p = RestaurantListParser()
print p.split_by_type()
You should get:
{
'Mexican': [{
'name': 'Mexican Grill',
'nationality': 'undefined',
'pricing': '$$',
'rating': '85%'
}],
'Pub Food': [{
'name': 'Georgie Porgie',
'nationality': 'Canadian',
'pricing': '$$$',
'rating': '87%'
}, {
'name': 'Deep Fried Everything',
'nationality': 'undefined',
'pricing': '$',
'rating': '52%'
}],
'Thai': [{
'name': 'Queen St. Cafe',
'nationality': 'Malaysian',
'pricing': '$',
'rating': '82%'
}]
}
Your solution is simple, so it's ok. I'd just like to mention a couple of ideas that come to mind when I think about this kind of problem.
Here's another take, using defaultdict and split to simplify things.
from collections import defaultdict
record_keys = ['name', 'rating', 'price', 'cuisine']
def load(file):
with open(file) as file:
data = file.read()
restaurants = []
# chop up input on each blank line (2 newlines in a row)
for record in data.split("\n\n"):
fields = record.split("\n")
# build a dictionary by zipping together the fixed set
# of field names and the values from this particular record
restaurant = dict(zip(record_keys, fields))
# split chops apart the type cuisine on comma, then _.strip()
# removes any leading/trailing whitespace on each type of cuisine
restaurant['cuisine'] = [_.strip() for _ in restaurant['cuisine'].split(",")]
restaurants.append(restaurant)
return restaurants
def build_index(database, key, value):
index = defaultdict(set)
for record in database:
for v in record.get(key, []):
# defaultdict will create a set if one is not present or add to it if one does
index[v].add(record[value])
return index
restaurant_db = load('/var/tmp/r')
print(restaurant_db)
by_type = build_index(restaurant_db, 'cuisine', 'name')
print(by_type)

ValueError: Unknown protobuf attr type <class 'dict'> when tried to put a nested dict/entity

I tried to put a nested entity/dict into datastore using Python,
metadata_row = dict()
metadata_row['batch_id'] = str(uuid1())
metadata_row['table_ok'] = True
metadata_row['table_name'] = 'metadata'
metadata_row['num_rows'] = 1
metadata_row['violations'] = []
metadata_row['errors'] = []
metadata_row['time'] = {}
metadata_row['time']['total_time'] = 82.42656564712524
metadata_row['time']['mod1'] = 5.940682411193848
metadata_row['time']['mod2'] = 19.16786551475525
metadata_row['time']['mod3'] = 31.617812633514404
metadata_row['time']['mod4'] = 0.00038933753967285156
metadata_row['time']['mod5'] = 53.35780310630798
with self.client.transaction():
entities = [Entity(self.client.key('metadata')) for i in range(len([metadata_row]))]
for entity, update_dict in zip(entities, [metadata_row]):
entity.update(update_dict)
self.client.put_multi(entities)
I tested it by using datastore emulator, but I got the following error,
ValueError: Unknown protobuf attr type <class 'dict'>
I am wondering how to fix the issue. I am also wondering does datastore natively support nested dictionary in that one doesn't have to create an entity for the inner dictionary, i.e. time in this case.
UPDATE. I added an inner entity in metadata_row for key time to solve the problem.
client = datastore.Client()
metadata_row['time'] = datastore.Entity(key=client.key('time'))
metadata_row['time']['total_time'] = 82.42656564712524
metadata_row['time']['mod1'] = 5.940682411193848
metadata_row['time']['mod2'] = 19.16786551475525
metadata_row['time']['mod3'] = 31.617812633514404
metadata_row['time']['mod4'] = 0.00038933753967285156
metadata_row['time']['mod5'] = 53.35780310630798
# code for put_multi()
def convert(obj, client, key):
"""
:param obj: dict
:param client: datastore client
:param key: Generated as client.key("your_table", "your_key")
:return: datastore.Entity
"""
if isinstance(obj, list):
return [convert(item, client, None) for item in obj]
elif isinstance(obj, dict):
entity = datastore.Entity(key)
for key in obj:
entity[key] = convert(obj[key], client, None)
return entity
else:
return obj
Please do suggest any improvements or alternatives as Google Cloud Datastore currently doesn't suggest a way to convert a JSON to an Entity.

Resources