How to create new column using condition in python DataFrame? - python-3.x

Convert the following R code into equivalent Python code.
Item_Type - Old Column Name
Item_Type_new - New Column Name
perishable = c("Breads", "Breakfast", "Dairy", "Fruits and Vegetables", "Meat", "Seafood")
non_perishable = c("Baking Goods", "Canned", "Frozen Foods", "Hard Drinks", "Health and Hygiene", "Household", "Soft Drinks")
# create a new feature 'Item_Type_new'
combi[,Item_Type_new := ifelse(Item_Type %in% perishable, "perishable", ifelse(Item_Type %in% non_perishable, "non_perishable", "not_sure"))]

With a simple function, you can apply on pandas dataframe:
def func(x, l1, l2):
    """Classify a single Item_Type value.

    x  = input value (an Item_Type string)
    l1 = list of perishable item types
    l2 = list of non-perishable item types

    Returns 'perishable', 'non_perishable', or 'not_sure'.
    """
    if x in l1:
        return 'perishable'
    elif x in l2:
        # Fix: underscore spelling, consistent with the R code and the
        # np.select variant (the original returned 'non-perishable').
        return 'non_perishable'
    else:
        return 'not_sure'
perishable = ["Breads", "Breakfast", "Dairy", "Fruits and Vegetables", "Meat", "Seafood"]
non_perishable = ["Baking Goods", "Canned", "Frozen Foods", "Hard Drinks", "Health and Hygiene", "Household", "Soft Drinks"]
# Fix: apply func to the Item_Type column values. The original used
# combi.apply(..., axis=1), which passes a whole row (a Series) to func;
# the 'x in l1' membership test on a Series then raises
# "The truth value of a Series is ambiguous".
combi['Item_Type_new'] = combi['Item_Type'].apply(lambda x: func(x, perishable, non_perishable))

Use np.select() -
# np.select evaluates the conditions in order and picks the matching choice;
# anything matching neither list falls through to the default.
perishable = ["Breads", "Breakfast", "Dairy", "Fruits and Vegetables", "Meat", "Seafood"]
non_perishable = ["Baking Goods", "Canned", "Frozen Foods", "Hard Drinks", "Health and Hygiene", "Household", "Soft Drinks"]
conditions = [
    (combi['Item_Type'].isin(perishable)),
    (combi['Item_Type'].isin(non_perishable)),
]
choices = ['perishable', 'non_perishable']
# Fix: the default must be 'not_sure' — the original repeated
# 'non_perishable', silently mislabelling unknown item types
# (the R ifelse falls back to "not_sure").
combi['Item_Type_new'] = np.select(conditions, choices, default='not_sure')

Related

Splitting dictionary values (list) into constant chunks

I have a dictionary of topics with lists as values.
topics = {'sports':sports, 'food':food, 'movies':movies, 'singers':singers, 'goats':goats}
These are the values
sports = ['soccer','basketball','tennis']
food = ['burger','fried rice', 'spaghetti','curry', 'lamb','wings']
movies = ['spider man', 'batman', 'iron man', 'hustle', 'desperado']
singers = ['drake']
goats = ['messi', 'jordan']
I want to transform the dictionary in a manner that, given size k, the elements in the list shouldn't be greater than k. If they are, create a new list, and put the elements in. For instance, if k = 2, that means I want the list values to be split like this:
{'goats':['messi', 'jordan'],
'sports_1': ['soccer','basketball'],
'sports_2': ['tennis'],
'food_1': ['burger','fried rice'],
'food_2': ['spaghetti','curry'],
'food_3': ['lamb','wings'],
'movies_1': ['spider man', 'batman'],
'movies_2': ['iron man', 'hustle'],
'movies_3':['desperado'],
'singers':['drake']}
Try:
def chunks(lst, n):
    """Yield consecutive slices of *lst*, each holding at most *n* items."""
    start = 0
    while start < len(lst):
        yield lst[start:start + n]
        start += n
# Split every topic's list into chunks of at most CHUNK_SIZE items.
# Keys gain a 1-based "_i" suffix only when the list actually splits.
# Fix: the chunk size was a bare literal 2, and the loop variable 'k'
# shadowed the "size k" parameter from the question's statement.
CHUNK_SIZE = 2
out = {}
for topic, items in topics.items():
    parts = list(chunks(items, CHUNK_SIZE))
    if len(parts) == 1:
        # A single chunk keeps the original key unchanged.
        out[topic] = parts[0]
    else:
        for index, part in enumerate(parts, 1):
            out[f"{topic}_{index}"] = part
print(out)
Prints:
{
"sports_1": ["soccer", "basketball"],
"sports_2": ["tennis"],
"food_1": ["burger", "fried rice"],
"food_2": ["spaghetti", "curry"],
"food_3": ["lamb", "wings"],
"movies_1": ["spider man", "batman"],
"movies_2": ["iron man", "hustle"],
"movies_3": ["desperado"],
"singers": ["drake"],
"goats": ["messi", "jordan"],
}

How to optimize a nested for loop, looping over json data to extract values of certain keys in python

I am reading log files in my python code which contains some nested json data. I have a nested for loop containing 4 for-loops from which values of certain keys are extracted and appended to a dataframe.
The nested for-loop is taking too much time and I saw from other answers that multiprocessing is the way to go for nested for-loops but did not find an example for json data.
What is the best approach for this ? Below is my code to extract data from log files and into dataframes. recommendation_list is a list of json objects.
# Flatten the recommendation log entries into data_frame (one row per
# recommended product) and lu_df (one row per matched learning unit).
# NOTE(review): indentation reconstructed from a flattened paste — confirm
# against the original. Relies on names defined elsewhere
# (recommendation_list, recommendation_count, data_frame, lu_df, products,
# categories, last_id, lu_id, json).
for recommendation in recommendation_list:
    if recommendation['type'] == "httpRequest":
        # Pull sessionId / categoryId out of the raw query string.
        # NOTE(review): assumes each request entry precedes its response;
        # otherwise session_id/category_id below are stale or unbound.
        session_id = recommendation['query'].split('sessionId=')[1].split('&')[0]
        category_id = recommendation['query'].split('categoryId=')[1].split('&')[0]
    if recommendation['type'] == "httpResponse":
        recommendation_count = recommendation_count + 1
        user_id = recommendation['userId']
        time_stamp = recommendation['ts']
        # 'ts' appears ISO-8601-like: date and time split on the 'T'.
        event_date = time_stamp.split("T")[0]
        time = time_stamp.split("T")[-1]
        try:
            product_list = json.loads(recommendation['body'])['products']
        except:
            # NOTE(review): bare except silently swallows malformed bodies
            # (and any other error) — should catch json.JSONDecodeError/KeyError.
            product_list = []
        if len(product_list) > 0:
            for product in product_list:
                product_id = product["id"]
                if 'recommendationMeta' in product:
                    # One data_frame row per recommended product.
                    # NOTE(review): DataFrame.append is deprecated (removed in
                    # pandas 2.0) and O(n) per call — collect dicts and build
                    # the frame once instead.
                    data_frame = data_frame.append({
                        "transaction_id": last_id,
                        "user_id": user_id,
                        "session_id": session_id,
                        "category_id": category_id,
                        "product_id": product_id,
                        "date": event_date,
                        "time": time[0:12],
                        "event": "recommendation",
                        "ab_bucket": "B",
                        "recommendation_count": recommendation_count,
                    }, ignore_index=True)
                    for learning_unit in product['recommendationMeta']:
                        lu_name = learning_unit['lu']
                        lu_value = learning_unit['value']
                        recommendation_mode = learning_unit['recommendationMode']
                        # Match the product against the reference 'products' frame.
                        prod_def1 = products[(products["product_id"] == product_id) &
                                             (products["lu_value"].str.lower() == lu_value)]
                        if len(prod_def1) != 0:
                            # NOTE(review): rebinding product_list / product_id here
                            # clobbers the outer loop's names of the same spelling.
                            product_list = prod_def1.to_dict('records')
                            for product_id in product_list:
                                category = categories[(categories["category_def_id"] == product_id["category_def_id"]) &
                                                      (categories["lu_name"].str.lower() == lu_name)]
                                if len(category) != 0:
                                    product_def_id = product_id['product_def_id']
                                    lu_df = lu_df.append({
                                        "lu_data_id": lu_id,
                                        "product_def_id": product_def_id,
                                        "transaction_id": last_id,
                                        "rec_mode": recommendation_mode,
                                    }, ignore_index=True)
                                    lu_id = lu_id+1
    # NOTE(review): placement reconstructed — presumably the transaction id
    # advances once per log entry; confirm against the original source.
    last_id = last_id + 1
I figure that the innermost for-loop executes most number of times and decided to use multiprocessing for it.
I replaced
# Original (serial) inner loop: appends one lu_df row per product record
# whose category matches on both category_def_id and (lowercased) lu_name.
# NOTE(review): indentation reconstructed from a flattened paste.
for product_id in product_list:
    category = categories[(categories["category_def_id"] == product_id["category_def_id"]) &
                          (categories["lu_name"].str.lower() == lu_name)]
    if len(category) != 0:
        product_def_id = product_id['product_def_id']
        # NOTE(review): DataFrame.append is deprecated (removed in pandas 2.0).
        lu_df = lu_df.append({
            "lu_data_id": lu_id,
            "product_def_id": product_def_id,
            "transaction_id": last_id,
            "rec_mode": recommendation_mode,
        }, ignore_index=True)
        lu_id = lu_id+1
with this...
# Create the pool ONCE and hand starmap an iterable of argument tuples.
# Fixes over the original: a new Pool was built on every loop iteration;
# starmap was given one flat argument list instead of a list of per-call
# tuples; and 'p.close()' referenced an undefined name (the pool was 'pool').
pool = Pool()
arg_tuples = [
    (last_id, categories, recommendation_mode, lu_name, lu_df, lu_id + offset, product_id)
    for offset, product_id in enumerate(product_list)
]
data = pool.starmap(create_lu_data, arg_tuples)
lu_id = lu_id + len(product_list)
pool.close()
pool.join()
# NOTE(review): each worker receives its own copy of lu_df and returns an
# independent frame containing at most one extra row — 'data' must be merged
# (e.g. pd.concat) rather than treated as one cumulative DataFrame. This is
# why the original produced several times the expected number of rows.
print(data)
where create_lu_data is
def create_lu_data(last_id, categories, recommendation_mode, lu_name, lu_df, lu_id, product_id):
    """Return lu_df with one learning-unit row appended when product_id matches.

    last_id: transaction id recorded on the new row.
    categories: DataFrame with "category_def_id" and "lu_name" columns.
    recommendation_mode: value recorded as "rec_mode".
    lu_name: lowercased learning-unit name matched against categories.
    lu_df: accumulator DataFrame; never mutated — a new frame is returned.
    lu_id: id recorded as "lu_data_id".
    product_id: record dict with "category_def_id" and "product_def_id".

    Returns lu_df unchanged when no category matches, otherwise a new frame
    with the extra row. Fix: DataFrame.append was deprecated and removed in
    pandas 2.0 — build a one-row frame and pd.concat instead.
    """
    import pandas as pd  # local import keeps the worker-side dependency explicit

    category = categories[(categories["category_def_id"] == product_id["category_def_id"]) &
                          (categories["lu_name"].str.lower() == lu_name)]
    if len(category) != 0:
        product_def_id = product_id['product_def_id']
        new_row = pd.DataFrame([{
            "lu_data_id": lu_id,
            "product_def_id": product_def_id,
            "transaction_id": last_id,
            "rec_mode": recommendation_mode,
        }])
        lu_df = pd.concat([lu_df, new_row], ignore_index=True)
    return lu_df
I didn't get any errors, but the output dataframe has several times the expected number of rows.

creating a "player" from a list of names and giving them their own independent lists

I am currently learning about classes and methods in Python and am stuck in a small project I'm doing.
I want to create "players" from a list of player names. Each of these players needs their own independent lists (hand, up, down). I thought the code below would achieve this:
# Question code, annotated as-is (indentation reconstructed from a flattened
# paste; the bugs below are what the answer underneath fixes).
deck = ["ha", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9", "h10", "hj", "hq", "hk", "da", "d2", "d3", "d4", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "dj", "dq", "dk"]
cardsperset = 3
playernames = ["N", "L"]

class player:
    def __init__(self, name):
        self.name = name
        # NOTE(review): these are locals, not attributes — instances never
        # get .hand/.up/.down; they should be self.hand = [] etc.
        hand = []
        up = []
        down = []

for name in playernames:
    # NOTE(review): each player object is bound to the loop variable and
    # lost on the next iteration — no variable named N or L is ever created,
    # hence the NameError below.
    name = player(name)

def randomcard():
    # NOTE(review): random.randint(range(deck)) is invalid — randint takes
    # two ints, e.g. random.randint(0, len(deck) - 1); 'random' also needs
    # to be imported.
    randcard = deck[random.randint(range(deck))]
    deck.remove(randcard)
    return randcard

for i in range(cardsperset):
    # NOTE(review): N was never defined — this raises NameError.
    N.hand.append(randomcard())
    N.down.append(randomcard())
    N.up.append(randomcard())
print(N.hand)
However when trying to print the result to see if this worked I am getting the error "NameError: name 'N' is not defined"
Obviously the creation of the player "N" was not successful; am I going about this the wrong way?
Assuming that each player should be assigned cardsperset number of cards, you should first ensure you have a list of the players created - and then you can iterate this list, assigning random cards to each player.
I made some additional changes, like capital class name (player -> Player), changing hand = [] to self.hand = [], etc.
import random

# Full deck of card codes; each player is dealt three piles (hand/up/down)
# of cardsperset cards apiece.
deck = ["ha", "h2", "h3", "h4", "h5", "h6", "h7", "h8", "h9", "h10", "hj", "hq", "hk", "da", "d2", "d3", "d4", "d4", "d5", "d6", "d7", "d8", "d9", "d10", "dj", "dq", "dk"]
cardsperset = 3
playernames = ["N", "L"]

class Player:
    """A named player holding three independent card piles."""
    def __init__(self, name):
        self.name = name
        self.hand = []
        self.up = []
        self.down = []

# One Player object per configured name.
players = [Player(player_name) for player_name in playernames]

def randomcard():
    """Remove and return a randomly chosen card from the deck."""
    position = random.randint(0, len(deck) - 1)
    picked = deck[position]
    deck.remove(picked)
    return picked

# Deal cardsperset cards into each pile of every player.
for current in players:
    for _ in range(cardsperset):
        current.hand.append(randomcard())
        current.down.append(randomcard())
        current.up.append(randomcard())

# Inspect one player's hand.
print(players[0].hand)

grab multiple data from multiple dictionaries having some conditions

I have multiple dictionaries & i want to get the "oldest date" and the "name" of the branch that has the oldest date
I can get the oldest date using this but cannot get which branch has the oldest date.
x = datetime.datetime.now()  # reference time used to score branch ages

# branch objects
branch_1 = {
    "name": "b1",
    "gps": (48.8946865, 2.3622423),
    "oldestDate": dt.datetime(2019, 1, 7),
}
branch_2 = {
    "name": "b2",
    "gps": (48.839955, 2.288605),
    "cars": 7,
    "oldestDate": dt.datetime(2016, 1, 17),
}
branch_3 = {
    "name": "b3",
    "gps": (48.844244, 2.401435),
    "oldestDate": dt.datetime(2019, 1, 21),
}
listOBranches = [branch_1, branch_2, branch_3]
mtlst2 = []  # age (x - oldestDate) of each branch, in listOBranches order


def branchPriorityScore(listOBranches):
    """Print and return (name, age) of the branch with the oldest record.

    Fix: the original took np.max of the scores but read 'name' from the
    loop variable after the loop ended — i.e. always the *last* branch —
    so the reported name and the maximum score were unrelated. Track the
    best branch while scoring instead.
    """
    best_name = None
    best_score = None
    for item in listOBranches:
        score = x - item["oldestDate"]
        mtlst2.append(score)  # kept for compatibility with the original
        if best_score is None or score > best_score:
            best_score = score
            best_name = item["name"]
    print("Maximum Priority Branch : ", best_name, best_score)
    return best_name, best_score


branchPriorityScore(listOBranches)
I'd do it like this.
def branchPriorityScore(listOBranches, now=None):
    """Find the branch whose 'oldestDate' is furthest in the past.

    listOBranches: list of branch dicts, each with "name" and "oldestDate".
    now: reference datetime for the age computation; defaults to the current
         time (generalized from the original's dependence on a module-level
         global 'x', which raised NameError when the surrounding script was
         absent).

    Prints and returns (name, index, age) of the oldest branch, or None for
    an empty list (the original crashed on empty input).
    """
    if not listOBranches:
        return None
    if now is None:
        now = datetime.datetime.now()
    oldest = datetime.timedelta()
    i = 0  # index of the best branch so far
    name = None
    for index, item in enumerate(listOBranches):
        score = now - item["oldestDate"]
        if score > oldest:
            oldest = score
            i = index
            name = item["name"]
    print(name, i, oldest, listOBranches[i]['oldestDate'])
    return name, i, oldest
No sure why you need mtlst2 but you can add it in the cycle too.

How to search for a sub string within a string using Pyspark

The attached image contains a sample of the data.
For example, if sentence contains "John" and "drives" it means John has a car and to get to work he drives. I'm attaching code I'm using to do it. However, the code doesn't work correctly and is too complicated. I will appreciate your help.
%pyspark
# Load the tab-separated sample file into an RDD of column lists.
rdd = sc.textFile("./sample.txt")
col = rdd.map(lambda line: line.split('\t'))
# remove header
header = col.first()  # extract header
col = col.filter(lambda line: line != header)

def convertToRow(line):
    # Map a [name, text] pair onto a Row for DataFrame conversion.
    return Row(Name = line[0], Text = line[1])

# call the function on each row, then convert to dataframe
df = col.map(convertToRow).toDF()

from pyspark.sql.functions import udf

def splitParagraphIntoSentences(paragraph):
    """Split a paragraph into sentences with NLTK's sentence tokenizer."""
    sentences = nltk.tokenize.sent_tokenize(paragraph)
    return sentences

def tokenize(text):
    # Lowercase, strip newlines and commas, then split into sentences
    # (single-word input falls through as plain word tokens).
    text = text.lower().replace('\n', '')
    text = re.sub(',', '', text)
    tokens = text.split()
    if(len(tokens)>1):
        tokens = splitParagraphIntoSentences(text)
    return tokens

# NOTE(review): rebinding 'tokenize' to a udf whose lambda calls 'tokenize'
# makes the lambda resolve to the udf itself — infinite recursion when Spark
# evaluates it. Bind the udf under a different name.
tokenize = udf(lambda text: tokenize(text))
data = df.select('Name', tokenize(df.Text).alias("Text"))
def how(name, paragraph):
    """Return how *name* gets to work, judged sentence by sentence.

    name: the person's name (any case).
    paragraph: list of lowercased sentences (output of the tokenize udf).
    Returns "Drives", "Walks", "Coming with", or None when no sentence
    mentions both the name and a travel keyword.

    Fixes over the original: xrange is Python 2 only (NameError on the
    python-3.x target), and 's[i:i+len(name)] == name' compared a list
    slice of words against the name *string*, which could never be true —
    so the original always returned None.
    """
    name = name.lower()
    for sentence in paragraph:
        words = sentence.split()
        if name not in words:
            continue
        if "drives" in words:
            return "Drives"
        if "walks" in words:
            return "Walks"
        if "coming with" in sentence:
            return "Coming with"
    return None
def checkYesNo(name, paragraph):
    """Return "Yes" when the person appears to have a car, else "No".

    A person is assumed to have a car unless some sentence says they walk
    or are coming with someone. 'name' is accepted for interface
    compatibility but, as in the original, is not consulted.

    Fixes over the original: xrange is Python 2 only, and the original
    returned "Yes" as soon as the *first* sentence lacked the keywords,
    never examining the rest of the paragraph.
    """
    for sentence in paragraph:
        words = sentence.split()
        if "walks" in words or "coming with" in sentence:
            return "No"
    return "Yes"
# NOTE(review): 'how = udf(lambda name,paragraph: how(name,paragraph))'
# rebinds 'how' before the lambda ever runs, so the lambda calls the udf
# itself — infinite recursion when Spark evaluates it (the same applies to
# checkYesNo). Bind the udfs under different names, e.g. how_udf.
how = udf(lambda name,paragraph: how(name,paragraph))
checkYesNo = udf(lambda name,paragraph: checkYesNo(name,paragraph))
# Final frame: name plus the has-car flag and travel method columns.
final_df = data.select('Name', checkYesNo(data.Name, data.Text), how(data.Name, data.Text))
I'd do it like this:
import socket
class SparkUtil(object):
    """Helper that builds a SparkContext bound to a given virtualenv."""
    #staticmethod
    # NOTE(review): the line above was presumably '@staticmethod' mangled in
    # transcription; as written it is a comment, making get_spark_context an
    # ordinary method — confirm against the original post.
    def get_spark_context (host, venv, framework_name, parts):
        # Point the Spark workers at the virtualenv's interpreter.
        # NOTE(review): 'os' is not imported anywhere visible — confirm.
        os.environ['PYSPARK_PYTHON'] = "{0}/bin/python".format (venv)
        from pyspark import SparkConf, SparkContext
        # NOTE(review): 'StringIO' is a Python 2 module and is unused here.
        from StringIO import StringIO
        ip = socket.gethostbyname(socket.gethostname())  # computed but unused
        sparkConf = (SparkConf()
                     .setMaster(host)
                     .setAppName(framework_name))
        # 'parts' is accepted but unused.
        return SparkContext(conf = sparkConf)
# Sample rows: [name, paragraph] pairs used in place of the text file.
input_txt = [
    [ "John", "John usually drives to work. He usually gets up early and drinks coffee. Mary usually joining him." ],
    [ "Sam", "As opposed to John, Sam doesn't like to drive. Sam usually walks there." ],
    [ "Mary", "Mary doesn't have driving license. Mary usually coming with John which picks her up from home." ]
]
def has_car(text):
    """True when *text* mentions driving (contains the word 'drives')."""
    return text.find("drives") >= 0
def get_method(text):
    """Return the first travel keyword found in *text*, or None."""
    return next(
        (keyword for keyword in ("drives", "walks", "coming with") if keyword in text),
        None,
    )
def process_row(row):
    """Map a [name, text] pair to [name, has_car_flag, travel_method]."""
    name, text = row[0], row[1]
    return [name, has_car(text), get_method(text)]
# Build a local 2-core Spark context and run the pure-Python pipeline over
# the in-memory rows; collect() pulls the results back to the driver.
sc = SparkUtil.get_spark_context (host = "local[2]",
                                  venv = "../starshome/venv",
                                  framework_name = "app",
                                  parts = 2)
print (sc.parallelize (input_txt).map (process_row).collect ())
The SparkUtil class you can probably ignore. I'm not using a notebook. This is just a straight up Spark app.

Resources