I have multiple dictionaries & i want to get the "oldest date" and the "name" of the branch that has the oldest date
I can get the oldest date using this but cannot get which branch has the oldest date.
x = datetime.datetime.now()
# objects
branch_1 = {
"name": "b1",
"gps": (48.8946865, 2.3622423),
"oldestDate": dt.datetime(2019, 1, 7),
}
branch_2 = {
"name": "b2",
"gps": (48.839955, 2.288605),
"cars": 7,
"oldestDate": dt.datetime(2016, 1, 17),
}
branch_3 = {
"name": "b3",
"gps": (48.844244, 2.401435),
"oldestDate": dt.datetime(2019, 1, 21),
}
listOBranches = [branch_1, branch_2, branch_3]
mtlst2 = []
def branchPriorityScore(listOBranches):
for item in listOBranches:
score = x - (item["oldestDate"])
mtlst2.append(score)
dateMax2 = np.max(mtlst2)
name = item["name"]
# return mtlst2
print("Maximum Priority Branch : ", dateMax2) # correct score
# print("Maximum Priority Branch : ", np.max(mtlst2(item["name"])), dateMax2)
# mtlst2
branchPriorityScore(listOBranches)
I'd do it like this.
def branchPriorityScore(listOBranches):
oldest = datetime.timedelta()
i = 0 #if you need index
for index,item in enumerate(listOBranches):
score = x - (item["oldestDate"])
if score > oldest:
oldest = score
i = index #if you need index
name = item["name"]
print (name,i,oldest,listOBranches[i]['oldestDate'])
No sure why you need mtlst2 but you can add it in the cycle too.
Related
I am trying to create dummy data for NER task by replacing person_name with some dummy names. But it's giving me weird results in case of same entities occuring multiple times as discussed here:
Strange result when removing item from a list while iterating over it
Modifying list while iterating
Input example spans:
{
'text':"Mohan dob is 25th dec 1980. Mohan loves to play cricket.",
'spans':[{'start':0, 'end':5,'label':'person_name','ngram':'Mohan'},
{start':28, 'end':33,'label':'person_name','ngram':'Mohan'},
{start':13, 'end':26,'label':'date','ngram':'25th dec 1980'}
]
}
The entities person_name occurs twice in a sample.
sample_names=['Jon', 'Sam']
I want to replace (0, 5, 'person_name') and (28, 33, 'person_name') with sample_names.
Dummy Examples Output:
{
{'text':"Jon dob is 25th dec 1980. Jon loves to play cricket.",
'spans':[{'start':0, 'end':3,'label':'person_name','ngram':'Jon'},
{start':26, 'end':31,'label':'person_name','ngram':'Jon'},
{start':11, 'end':24,'label':'date','ngram':'25th dec 1980'}
]
},
{'text':"Sam dob is 25th dec 1980. Sam loves to play cricket.",
'spans':[{'start':0, 'end':3,'label':'person_name','ngram':'Sam'},
{start':26, 'end':31,'label':'person_name','ngram':'Sam'},
{start':11, 'end':24,'label':'date','ngram':'25th dec 1980'}
]
}
}
The spans also get's updated in output
target_entity='person_name'
names=sample_names
Code:
def generate(data, target_entity, names):
text = data['text']
spans = data['spans']
new_sents=[]
if spans:
spans = [(d['start'], d['end'], d['label']) for d in spans]
spans.sort()
labellist=[s[2] for s in spans]
# get before_spans and after_spans around target entity
for n in names:
gap = 0
for i, tup in enumerate(spans):
lab = tup[2]
if lab == target_entity:
new_spans={"before": spans[:i], "after": spans[i+1:]}
print("the spans before and after :\n",new_spans)
start=tup[0] #check this
end=tup[1]
ngram = text[start:end]
new_s = text[:start] + n + text[end:]
gap = len(n) - len(ngram)
before = new_spans["before"]
after = [(tup[0]+gap, tup[1]+gap, tup[2]) for tup in new_spans["after"]]
s_sp = before + [(start, start + len(n), target_label)] + after
text=new_s
en={"text": new_s,"spans": [{"start": tup[0], "end": tup[1], "label": tup[2], "ngram": new_s[tup[0]:tup[1]]} for tup in s_sp]}
spans = s_sp
new_sents.append(en)
If all you seek to do is replace the placeholder with a new value, you can do something like this:
## --------------------
## Some enxaple input from you
## --------------------
input_data = [
(162, 171, 'pno'),
(241, 254, 'person_name'),
(373, 384, 'date'),
(459, 477, 'date'),
None,
(772, 785, 'person_name'),
(797, 806, 'pno')
]
## --------------------
## --------------------
## create an iterator out of our name list
## you will need to decide what happens if sample names
## gets exhausted.
## --------------------
sample_names = [
'Jon',
'Sam'
]
sample_names_itter = iter(sample_names)
## --------------------
for row in input_data:
if not row:
continue
start = row[0]
end = row[1]
name = row[2] if row[2] != "person_name" else next(sample_names_itter)
print(f"{name} dob is 25th dec 1980. {name} loves to play cricket.")
I want to create a rolling forecast for the following 12 months, the results for the month and entry must become part of the dataframe as well (Later it will be written out into excel as part of a bigger dataframe).
The entries for the new dataframe needs to be calculated based on the criteria, that the forecasted month is between start_date and start_date + duration is also in the range of the forecasted 12 months. If these are met, the value from duration should be written here.
expected output
To do this I imagine that I have to use a numpy.where(), however I can not wrap my head around it.
I came across Use lambda with pandas to calculate a new column conditional on existing column, but after some trying I came to the conclusion, that this can not be the whole truth for my case.
import numpy as np
import pandas as pd
import datetime as dt
months = ["Jan", "Feb", "Mrz", "Apr", "Mai", "Jun", "Jul", "Aug", "Sep", "Okt", "Nov", "Dez"]
cur_month = dt.date.today().month - 1
cur_year = dt.date.today().year
d = {'start_date': ['2020-12-23', '2021-02-08', '2021-06-11', '2022-01-07'], 'duration': [12, 6, 8, 3],
'effort': [0.3, 0.5, 1.2, 0.1]}
df = pd.DataFrame(data=d)
i = 0
while i < 12:
# this creates the header rows for the 12 month period
next_month = months[(cur_month + i) % len(months)]
# here goes the calculation/condition I am stuck with...
df[next_month] = np.where(...)
i += 1
So I came up with this and seems to work, I also added some logic for weighting for the cases a project starts some time during the month, so we get a more accurate effort number.
d = {"id": [1,2,3,4], "start_date": ['2020-12-23', '2021-02-08', '2021-06-11', '2022-01-07'], "duration": [12, 6, 8, 3],
"effort": [0.3, 0.5, 1.2, 0.1]}
df = pd.DataFrame(data=d)
df["EndDates"] = df["start_date"].dt.to_period("M") + df_["duration"]
i = 0
forecast = pd.Series(pd.period_range(today, freq="M", periods=12))
while i < 12:
next_month = months[(cur_month + i) % len(months)]
df[next_month] = ""
for index, row in df.iterrows():
df_tmp = df.loc[df['id'] == int(row['id'])]
if not df_tmp.empty and pd.notna(df_tmp["start_date"].item()):
if df_tmp["start_date"].item().to_period("M") <= forecast[i] <= df_tmp["EndDates"].item():
# For the current month let's calculate with the remaining value
if i == 0:
act_enddate = monthrange(today.year, today.month)[1]
weighter = 1 - (int(today.day) / int(act_enddate))
df.at[index, next_month] = round(df_tmp['effort'].values[0] * weighter,
ndigits=2)
# If it is the first entry for the oppty, how many FTEs will be needed for the first month
# of the assignment
elif df_tmp["start_date"].item().to_period("M") == forecast[i]:
first_day = df_tmp["start_date"].item().day
if first_day != 1:
months_enddate = monthrange(forecast[i].year, forecast[i].month)[1]
weighter = 1 - (int(first_day) / int(months_enddate))
df.at[index, next_month] = round(df_tmp['effort'].values[0] * weighter,
ndigits=2)
else:
df.at[index, next_month] = df_tmp['effort'].values[0]
# How many FTEs are needed for the last month of the assignment
elif df_tmp["EndDates"].item() == forecast[i]:
end_day = df_tmp["start_date"].item().day
if end_day != 1:
months_enddate = monthrange(forecast[i].year, forecast[i].month)[1]
weighter = int(end_day) / int(months_enddate)
df.at[index, next_month] = round(df_tmp['Umrechnung in FTEs'].values[0] * weighter,
ndigits=2)
else:
continue
else:
df.at[index, next_month] = df_tmp['effort'].values[0]
I am reading log files in my python code which contains some nested json data. I have a nested for loop containing 4 for-loops from which values of certain keys are extracted and appended to a dataframe.
The nested for-loop is taking too much time and I saw from other answers that multiprocessing is the way to go for nested for-loops but did not find an example for json data.
What is the best approach for this ? Below is my code to extract data from log files and into dataframes. recommendation_list is a list of json objects.
for recommendation in recommendation_list:
if recommendation['type'] == "httpRequest":
session_id = recommendation['query'].split('sessionId=')[1].split('&')[0]
category_id = recommendation['query'].split('categoryId=')[1].split('&')[0]
if recommendation['type'] == "httpResponse":
recommendation_count = recommendation_count + 1
user_id = recommendation['userId']
time_stamp = recommendation['ts']
event_date = time_stamp.split("T")[0]
time = time_stamp.split("T")[-1]
try:
product_list = json.loads(recommendation['body'])['products']
except:
product_list = []
if len(product_list) > 0:
for product in product_list:
product_id = product["id"]
if 'recommendationMeta' in product:
data_frame = data_frame.append({
"transaction_id": last_id,
"user_id": user_id,
"session_id": session_id,
"category_id": category_id,
"product_id": product_id,
"date": event_date,
"time": time[0:12],
"event": "recommendation",
"ab_bucket": "B",
"recommendation_count": recommendation_count,
}, ignore_index=True)
for learning_unit in product['recommendationMeta']:
lu_name = learning_unit['lu']
lu_value = learning_unit['value']
recommendation_mode = learning_unit['recommendationMode']
prod_def1 = products[(products["product_id"] == product_id) &
(products["lu_value"].str.lower() == lu_value)]
if len(prod_def1) != 0:
product_list = prod_def1.to_dict('records')
for product_id in product_list:
category = categories[(categories["category_def_id"] == product_id["category_def_id"]) &
(categories["lu_name"].str.lower() == lu_name)]
if len(category) != 0:
product_def_id = product_id['product_def_id']
lu_df = lu_df.append({
"lu_data_id": lu_id,
"product_def_id": product_def_id,
"transaction_id": last_id,
"rec_mode": recommendation_mode,
}, ignore_index=True)
lu_id = lu_id+1
last_id = last_id + 1
I figure that the innermost for-loop executes most number of times and decided to use multiprocessing for it.
I replaced
for product_id in product_list:
category = categories[(categories["category_def_id"] == product_id["category_def_id"]) &
(categories["lu_name"].str.lower() == lu_name)]
if len(category) != 0:
product_def_id = product_id['product_def_id']
lu_df = lu_df.append({
"lu_data_id": lu_id,
"product_def_id": product_def_id,
"transaction_id": last_id,
"rec_mode": recommendation_mode,
}, ignore_index=True)
lu_id = lu_id+1
with this...
for product_id in product_list:
pool = Pool() # Create a multiprocessing Pool
data = pool.starmap(create_lu_data, [last_id, categories, recommendation_mode,
lu_name, lu_df, lu_id, product_id])
lu_id = lu_id + 1
p.close()
print(data)
where create_lu_data is
def create_lu_data(last_id, categories, recommendation_mode, lu_name, lu_df, lu_id, product_id):
category = categories[(categories["category_def_id"] == product_id["category_def_id"]) &
(categories["lu_name"].str.lower() == lu_name)]
if len(category) != 0:
product_def_id = product_id['product_def_id']
lu_df = lu_df.append({
"lu_data_id": lu_id,
"product_def_id": product_def_id,
"transaction_id": last_id,
"rec_mode": recommendation_mode,
}, ignore_index=True)
return lu_df
I didn't get any errors, but the output dataframe has several times the expected number of rows.
So as the question said, i want to be able to display for example all items in the category of food within a range of dates. So far i tried doing this and its the only one i get with no errors but it returns none even if the date is within the range. Apologies if it is a little messy towards the end, i'm still learning how to use python
class ExpenditureList:
_types = ['Food','Transport', 'Education']
def __init__(self):
self._expenditures = []
#classmethod
def ExpenditureTypes(cls):
return cls._types
def getExpenditures(self, expenditureType, edays=0):
startDate = date.today() - timedelta(days=edays)
if expenditureType == 'Food':
for item in self._expenditures:
if item._type == ExpenditureList._types[1]:
for i in range(startDate, (date.today() + timedelta(days=1))):
if item._date == i:
return item
Try filtering by one category at a time and use the fact that you can compare dates. You don't have to iterate all of possible dates:
from dataclasses import dataclass
from datetime import datetime
#dataclass
class Expenditure:
TYPES = ["Food", "Transport", "Education"]
type: str
date: datetime
expenditures = [
Expenditure("Food", datetime(1999, 1, 1)),
Expenditure("Food", datetime(2020, 1, 1)),
Expenditure("Transport", datetime(2000, 1, 1)),
]
food_exps = [exp for exp in expenditures if exp.type == "Food"]
old_exps = [exp for exp in expenditures if exp.date.year < 2000]
print(old_exps)
start = datetime(1999, 9, 9)
end = datetime(2001, 1, 1)
custom_date_exps = [exp for exp in expenditures if start < exp.date < end]
print(custom_date_exps)
If you don't know list comprehensions yet - rembember that this:
food_exps = [exp for exp in expenditures if exp.type == "Food"]
is equivalent to this:
food_exps = []
for exp in expenditures:
if exp.type == "Food":
food_exps.append(exp)
Anyway if you just want your version working:
def getExpenditures(self, expenditureType, edays=0):
start_date = date.today() - timedelta(days=edays)
result = []
for item in self._expenditures:
if item._type == expenditureType and item._date > start_date:
result.append(item)
return result
Just Convert R Code into equivalent Python code.
Item_Type - Old Column Name
Item_Type_new - New Column Name
perishable = c("Breads", "Breakfast", "Dairy", "Fruits and Vegetables", "Meat", "Seafood")
non_perishable = c("Baking Goods", "Canned", "Frozen Foods", "Hard Drinks", "Health and Hygiene", "Household", "Soft Drinks")
# create a new feature 'Item_Type_new'
combi[,Item_Type_new := ifelse(Item_Type %in% perishable, "perishable", ifelse(Item_Type %in% non_perishable, "non_perishable", "not_sure"))]
With a simple function, you can apply on pandas dataframe:
def func(x, l1, l2):
"""
x = input value
l1 = list of perishables
l2 = list of non-perishables
"""
if x in l1:
return 'perishable'
elif x in l2:
return 'non-perishable'
else:
return 'not_sure'
perishable = ["Breads", "Breakfast", "Dairy", "Fruits and Vegetables", "Meat", "Seafood"]
non_perishable = ["Baking Goods", "Canned", "Frozen Foods", "Hard Drinks", "Health and Hygiene", "Household", "Soft Drinks"]
combi['Item_Type_new'] = combi.apply(lambda x: func(x, perishable, non_perishable), axis=1)
Use np.select() -
perishable = ["Breads", "Breakfast", "Dairy", "Fruits and Vegetables", "Meat", "Seafood"]
non_perishable = ["Baking Goods", "Canned", "Frozen Foods", "Hard Drinks", "Health and Hygiene", "Household", "Soft Drinks"]
conditions = [
(combi['Item_Type'].isin(perishable)),
(combi['Item_Type'].isin(non_perishable))]
choices = ['perishable', 'non_perishable']
combi['Item_Type_new'] = np.select(conditions, choices, default='non_perishable')