So friends is a column with a list in each instance, such as df['friends'][0] = [id1, id2, ..., idn]. I'm trying to count the number of friends in a separate column, such as df['friend_counts'][0] = n.
I did the following. I've used this code on other datasets, but for some reason it's taking forever, and this dataset is only 300,000 instances.
df_user['friend_counts'] = df_user['friends'].apply(lambda x: len(df_user.friends[x]))
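(As an aside, and not part of the original post: if each cell really holds a Python list, a likely fix is to apply len directly to the cell value, which avoids the per-row Series lookup entirely.)
# A minimal sketch, assuming df_user['friends'] holds an actual list per row
df_user['friend_counts'] = df_user['friends'].apply(len)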
Also, for some reason the following code creates a season column but doesn't populate it, i.e. it's all just blank strings. This is troublesome since I used this exact same code on every other dataset. Did they change the .apply() method?
#Convert 'date' to a date time object
df_reviews["date"] = pd.to_datetime(df_reviews["date"])
#Splitting up 'release_date' -> 'release_weekday', 'release_month', 'release_year'
df_reviews["weekday"] = df_reviews["date"].dt.weekday_name
df_reviews["month"] = df_reviews["date"].dt.month
df_reviews["year"] = df_reviews["date"].dt.year
### Helper function
def season_converter(month_name):
    """ Returns the season a particular month is in """
    season = ""
    #Winter
    if month_name in ['Jan', 'Feb', 'Dec']:
        season = "Winter"
    #Spring
    if month_name in ['Mar', 'Apr', 'May']:
        season = "Spring"
    #Summer
    if month_name in ['Jun', 'Jul', 'Aug']:
        season = "Summer"
    #Fall
    if month_name in ['Sep', 'Oct', 'Nov']:
        season = "Fall"
    #Other
    if month_name == "NA":
        season = "NA"
    return season
#Create a new column that holds seasonal information
df_reviews['season'] = df_reviews['month'].apply(lambda x: season_converter(x))
I suggest using map with a dictionary to improve performance; this also fixes the blank season column, because df_reviews['month'] comes from dt.month and holds integers, while season_converter only compares against month-name strings like 'Jan', so none of its branches ever match:
d = {1:'Winter', 2:'Winter', 12:'Winter', 3: 'Spring', .... np.nan:'NA', 'NA':'NA'}
df_reviews['season'] = df_reviews['month'].map(d)
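For reference, a sketch spelling out the full mapping implied by season_converter above (the elided entries in the snippet are filled in here, not taken from the original answer):
import numpy as np

d = {12: 'Winter', 1: 'Winter', 2: 'Winter',
     3: 'Spring', 4: 'Spring', 5: 'Spring',
     6: 'Summer', 7: 'Summer', 8: 'Summer',
     9: 'Fall', 10: 'Fall', 11: 'Fall',
     np.nan: 'NA', 'NA': 'NA'}
df_reviews['season'] = df_reviews['month'].map(d)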
Another solution, if it is possible to use numeric seasons:
df_reviews['season'] = (df_reviews['month'] % 12 + 3) // 3
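As a quick sanity check of that formula (not from the original answer), it maps Dec/Jan/Feb to 1, Mar-May to 2, Jun-Aug to 3 and Sep-Nov to 4:
for month in range(1, 13):
    print(month, (month % 12 + 3) // 3)
# 1 -> 1, 2 -> 1, 3 -> 2, 4 -> 2, ..., 11 -> 4, 12 -> 1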
I need a function that performs the action described below.
The dataset is:
and the expected output is the value in the 'Difference' column; the remaining columns are input.
Please note that within each group we first need to identify the maximum 'Closing_time', and the corresponding Amount will be the maximum value for that period; then each row's value is subtracted from the maximum value detected for the previous period, and the result is the difference for that cell.
Also, if a record does not have a previous period, then the max value will be NA and the difference calculation will be NA for all records in that period.
Additional points: within each group (Cost_centre, Account, Year, Month), Closing_time values are ordered such that D-0 00 CST is the minimum and D-0 18 CST the maximum; similarly, among D-0, D+1, D+3, etc., D+3 is the maximum.
I tried first to find whether a previous value exists for each group, then to find the maximum time within each period and the Amount value corresponding to it.
Further, using that maximum value, I tried to subtract the record's Amount from it,
but I can't figure out how to implement this; kindly help.
After posting the question above, I came up with this solution.
I split it into 3 parts:
a) First find the previous year and month for each cost_centre and account.
b) Find the maximum Closing_time within each group of cost_centre, account, year and month, then pick the corresponding Amount value as the amount.
c) Using the amount from b, subtract the current amount from it to get the difference.
import numpy as np
import pandas as pd

def prevPeriod(df):
    # For each row, compute the (year, month) of the previous period
    period = []
    for i in range(df.shape[0]):
        if df['Month'][i] == 1:
            # January rolls back to December of the previous year
            val_year = df['Year'][i] - 1
            val_month = 12
            new_val = (val_year, val_month)
            period.append(new_val)
        else:
            val_year = df['Year'][i]
            val_month = df['Month'][i] - 1
            new_val = (val_year, val_month)
            period.append(new_val)
    print(period)
    df['Previous_period'] = period
    return df
def max_closing_time(group_list):
    # Strip the 'CST' suffix and the 'D' prefix, e.g. 'D+3 18 CST' -> '+3 18'
    group_list = [item.replace('CST', '') for item in group_list]
    group_list = [item.replace('D', '') for item in group_list]
    group_list = [item.split()[:len(item)] for item in group_list]
    l3 = []
    l4 = []
    for item in group_list:
        l3.append(item[0])
        l4.append(item[1])
    # l3 holds the day offsets, l4 the hour values
    l3 = [int(item) for item in l3]
    l4 = [int(item) for item in l4]
    max_datevalue = max(l3)
    max_datevalue_index = l3.index(max(l3))
    max_time_value = max(l4[max_datevalue_index:])
    maximum_period = 'D+' + str(max_datevalue) + ' ' + str(max_time_value) + ' ' + 'CST'
    return maximum_period
def calculate_difference(df):
    diff = []
    for i in range(df.shape[0]):
        prev_year = df['Previous_period'][i][0]
        print('prev_year is', prev_year)
        prev_month = df['Previous_period'][i][1]
        print('prev_month is', prev_month)
        max_closing_time = df[(df['Year'] == prev_year) & (df['Month'] == prev_month)]['Max_Closing_time']
        print('max_closing_time is', max_closing_time)
        if max_closing_time.empty:
            # No previous period found, so the difference is NA
            found_diff = np.nan
            diff.append(found_diff)
        else:
            max_closing_time_value = list(df[(df['Year'] == prev_year) & (df['Month'] == prev_month)]['Max_Closing_time'])[0]
            # Amount of the row that matches the previous period's maximum Closing_time
            max_amount_consider = df[(df['Cost_centre'] == df['Cost_centre'][i]) & (df['Account'] == df['Account'][i]) & (df['Year'] == prev_year) & (df['Month'] == prev_month) & (df['Closing_time'] == str(max_closing_time_value))]['Amount']
            print('max_amount_consider is', max_amount_consider)
            found_diff = int(max_amount_consider) - df['Amount'][i]
            diff.append(found_diff)
    df['Variance'] = diff
    return df
def calculate_variance(df):
    '''
    The input data frame comes from the query used above to fetch the data
    '''
    try:
        df = prevPeriod(df)
    except:
        print('Error occurred in prevPeriod function')
    # Prerequisite for the max time period: maximum Closing_time per group
    df2 = pd.DataFrame(df.groupby(['Cost_centre', 'Account', 'Year', 'Month'])['Closing_time'].apply(max_closing_time).reset_index())
    # Added fix: rename so calculate_difference can find 'Max_Closing_time'
    # (after reset_index the aggregated column is still called 'Closing_time')
    df2 = df2.rename(columns={'Closing_time': 'Max_Closing_time'})
    df = pd.merge(df, df2, on=['Cost_centre', 'Account', 'Year', 'Month'])
    # Final calculation
    try:
        final_result = calculate_difference(df)
    except:
        print('Error in calculate_difference')
    return final_result
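For reference, a minimal, hypothetical usage sketch (the values and the 'D+…' Closing_time format are made up for illustration, not taken from the question):
sample = pd.DataFrame({
    'Cost_centre': [1200, 1200, 1200, 1200],
    'Account': [5000, 5000, 5000, 5000],
    'Year': [2021, 2021, 2021, 2021],
    'Month': [1, 1, 2, 2],
    'Closing_time': ['D+0 00 CST', 'D+0 18 CST', 'D+0 00 CST', 'D+1 10 CST'],
    'Amount': [100, 120, 90, 130],
})
result = calculate_variance(sample)
# Month 1 has no previous period, so its Variance is NaN;
# Month 2 rows are subtracted from 120 (the Amount at month 1's max Closing_time).
print(result[['Year', 'Month', 'Closing_time', 'Amount', 'Variance']])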
Sample DataFrame:
id date price
93 6021501535 2014-07-25 430000
93 6021501535 2014-12-23 700000
313 4139480200 2014-06-18 1384000
313 4139480200 2014-12-09 1400000
first_list = []
second_list = []
I need to add the first price that corresponds to a specific ID to the first list and the second price for that same ID to the second list.
Example:
first_list = [430000, 1384000]
second_list = [700000, 1400000]
After which, I'm going to plot the values from both lists on a lineplot to compare the difference in price between the first and second list.
I've tried doing this with groupby and loc and I kept running into errors. I then tried iterating over each row using a simple for loop but ran into more problems...
I would appreciate some help.
Based on your question, I think it's not necessary to save them into lists, because you could also store them somewhere else (e.g. another DataFrame) and plot them from there. The functions below should help with filling whatever structure you want to store your data in.
def date(your_id):
    first_date = df.loc[(df['id'] == your_id)].iloc[0, 1]
    second_date = df.loc[(df['id'] == your_id)].iloc[1, 1]
    return first_date, second_date

def price(your_id):
    first_date, second_date = date(your_id)
    # use your_id here as well (the original had the id hard-coded)
    price_first_date = df.loc[(df['id'] == your_id) & (df['date'] == first_date)].iloc[0, 2]
    price_second_date = df.loc[(df['id'] == your_id) & (df['date'] == second_date)].iloc[0, 2]
    return price_first_date, price_second_date

price_first_date, price_second_date = price(6021501535)
If now for example you want to store your data in a new df you could do something like:
selected_ids = [6021501535, 4139480200]
new_df = pd.DataFrame(index=np.arange(1, len(selected_ids) + 1), columns=['price_first_date', 'price_second_date'])
for i in range(len(selected_ids)):
    your_id = selected_ids[i]
    new_df.iloc[i, 0], new_df.iloc[i, 1] = price(your_id)
new_df then contains all 'first date prices' in the first column and all 'second date prices' in the second column. Plotting should work out.
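As an aside (not from the original answer), a groupby-based sketch can build both lists directly, assuming each id has exactly two rows as in the sample; the lists come out ordered by id rather than by original row order, but the first/second values stay paired per id:
prices = df.sort_values('date').groupby('id')['price'].agg(['first', 'last'])
first_list = prices['first'].tolist()
second_list = prices['last'].tolist()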
I'm trying to get the profitability of every project by dividing profit by revenue.
The code is working, I get the values back.
I just need help with the last part (the dividing part). That is where I'm having some issues.
Here is my code.
The outcome I get is
AttributeError: 'str' object has no attribute 'append'
from observations.constants import PROJECTS_DB_ID
from datetime import datetime
from dateutil.relativedelta import relativedelta

def get(gs_client):
    #Sheet access
    sheet = gs_client.open_by_key(
        PROJECTS_DB_ID).worksheet('Finance')
    #Columns necessary
    projects = sheet.col_values(1)[2:]
    months = sheet.col_values(2)[2:]
    profit = sheet.col_values(11)[2:]
    revenue = sheet.col_values(6)[2:]
    last_modified = sheet.col_values(13)[2:]
    #Lists
    list_projects = []
    list_months = []
    list_profit = []
    list_revenue = []
    list_last_modified = []
    value = []
    #Gets each project
    for project in projects:
        list_projects.append(project)
    #Gets each month
    for month in months:
        list_months.append(month)
    #Gets each value of profit column
    for val in profit:
        list_profit.append(val.strip('$').replace(',',''))
    #Gets each value in revenue column
    for value in revenue:
        list_revenue.append(value.strip('$').replace(',',''))
    #Gets each date in last modified column
    for update in last_modified:
        list_last_modified.append(update)
    #Get profitability per project (profit divided by revenue)
    for x in range(len(projects)):
        value1 = float(list_profit[x])/float(list_revenue[x])
        value.append(value1)
    print(value)
Any help would be greatly appreciated!
Your error is due to the variable value: you have used it both as a list and as a string.
#Lists
list_projects = []
list_months = []
list_profit = []
list_revenue = []
list_last_modified = []
value = []
#Gets each project
for project in projects:
    list_projects.append(project)
#Gets each month
for month in months:
    list_months.append(month)
#Gets each value of profit column
for val in profit:
    list_profit.append(val.strip('$').replace(',',''))
#Gets each value in revenue column
for val in revenue: # here, changed value to val
    list_revenue.append(val.strip('$').replace(',',''))
#Gets each date in last modified column
for update in last_modified:
    list_last_modified.append(update)
#Get profitability per project (profit divided by revenue)
for x in range(len(projects)):
    value1 = float(list_profit[x])/float(list_revenue[x])
    value.append(value1)
Whenever you use for i in something in Python, i is not a variable local to the for loop as in some other languages; after the loop ends, i keeps the last value it was assigned inside the loop and can still be accessed. You have to be very careful about the use of variable names in Python.
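A tiny sketch of the same pitfall in isolation:
value = []                          # value starts out as a list
for value in ['$1,000', '$2,000']:
    pass                            # the loop rebinds the name to each string
print(type(value))                  # <class 'str'>, the last string from the loop
value.append(1.5)                   # AttributeError: 'str' object has no attribute 'append'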
So, I asked this question yesterday but I think I can word it a lot better. I have a csv file with 4 columns, 1 of which contains the day that a ticket has been purchased for (Wed, Thur and Fri), and another containing how many tickets each customer has bought. Wed & Thur tickets are a different price from Fri tickets. I need to get the code to loop through the tickets bought column and only take the data from the rows containing 'W' or 'T' in the day of purchase column so I can calculate how much money was made from Wed & Thur sales, and then the same for the Fri sales. I hope I've explained it well. If it helps, here is my code so far:
wedThur = int(5.00)
friday = int(10.00)

def readFile():
    ticketid = []
    ticketsBought = []
    method = []
    f = open("ticketdata.csv")
    csvFile = csv.reader(f)
    for row in csvFile:
        ticketid.append(row[1])
        ticketsBought.append(int(row[2]))
        method.append(row[3])
    f.close()
    return ticketid, ticketsBought, method

def calculatePurchases(ticketid, ticketsBought):
    price = 0
    amount = len(ticketid)
    if 'W' or 'T' in ticketid:
        price = wedThur * amount
        print(price)
    else:
        price = friday * amount
        print(price)
    return price
Python has many amazing features to work with such data.
First of all, I would change your read-file function to return a more suitable data structure. Instead of returning a tuple of lists, I would return a list of tuples.
import csv

def read_file():
    data = []
    f = open("ticketdata.csv")
    csvFile = csv.reader(f)
    for row in csvFile:
        data.append(row)
    f.close()
    return data
Python has the built-in function sum, which sums all elements in a sequence: sum([1, 2, 3]) returns 6. All that is needed is to compose the right sequence for it.
def iterate_by_day(days, data):
    for d in data:
        if d[0] in days:
            yield int(d[1])  # csv values are strings, so convert before summing
This creates a special object called a generator. Visit the Python tutorial and make yourself familiar with it.
This should print the expected result.
data = read_file()
wed_thur = 5
print(sum(iterate_by_day("WT", data)) * wed_thur)
# This works the same
print(sum(iterate_by_day(["W", "T"], data)) * wed_thur)
I'm presenting a data frame in Jupyter Notebook. The initial data type of the data frame is float. I want to present rows 1 & 3 of the printed table as integers and rows 2 & 4 as percentages. How do I do that? (I've spent numerous hours looking for a solution, with no success.)
Here's the code I'm using:
#Creating the table
clms = sales.columns
indx = ['# of Poeple','% of Poeple','# Purchased per Activity','% Purchased per Activity']
basic_stats = pd.DataFrame(data=np.nan,index=indx,columns=clms)
basic_stats.head()

#Calculating the # of people who took part in each activity
for clm in sales.columns:
    basic_stats.iloc[0][clm] = int(round(sales[sales[clm]>0][clm].count(),0))
#Calculating the % of people who took part in each activity from the total email list
for clm in sales.columns:
    basic_stats.iloc[1][clm] = round((basic_stats.iloc[0][clm] / sales['Sales'].count())*100,2)
#Calculating the # of people who took part in each activity AND that bought the product
for clm in sales.columns:
    basic_stats.iloc[2][clm] = int(round(sales[(sales[clm] >0) & (sales['Sales']>0)][clm].count()))
#Calculating the % of people who took part in each activity AND that bought the product
for clm in sales.columns:
    basic_stats.iloc[3][clm] = round((basic_stats.iloc[2][clm] / basic_stats.iloc[0][clm])*100,2)

#Present the table
basic_stats
Here's the printed table:
[Output table of the 'basic_stats' data frame in Jupyter Notebook]
Integer representation
You already assign integers to the cells of rows 1 and 3. The reason these integers are printed as floats is that all columns have the data type float64, which is caused by the way you initially create the DataFrame. You can view the data types by printing the .dtypes attribute:
basic_stats = pd.DataFrame(data=np.nan,index=indx,columns=clms)
print(basic_stats.dtypes)
# Prints:
# column1 float64
# column2 float64
# ...
# dtype: object
If you don't provide the data keyword argument in the constructor of the DataFrame, the data type of each column will be object, which can hold any Python object:
basic_stats = pd.DataFrame(index=indx,columns=clms)
print(basic_stats.dtypes)
# Prints:
# column1 object
# column2 object
# ...
# dtype: object
When the data type of a column is object, the content is formatted using its built-in methods, which leads to integers being formatted properly.
Percentage representation
In order to display percentages, you can use a custom class that prints a float number the way you want:
class PercentRepr(object):
    """Represents a floating point number as percent"""
    def __init__(self, float_value):
        self.value = float_value
    def __str__(self):
        return "{:.2f}%".format(self.value*100)
Then just use this class for the values of the percentage rows (index positions 1 and 3):
#Creating the table
clms = sales.columns
indx = ['# of Poeple','% of Poeple','# Purchased per Activity','% Purchased per Activity']
basic_stats = pd.DataFrame(index=indx,columns=clms)
basic_stats.head()

#Calculating the # of people who took part in each activity
for clm in sales.columns:
    basic_stats.iloc[0][clm] = int(round(sales[sales[clm]>0][clm].count(),0))
#Calculating the % of people who took part in each activity from the total email list
for clm in sales.columns:
    basic_stats.iloc[1][clm] = PercentRepr(basic_stats.iloc[0][clm] / sales['Sales'].count())
#Calculating the # of people who took part in each activity AND that bought the product
for clm in sales.columns:
    basic_stats.iloc[2][clm] = int(round(sales[(sales[clm] >0) & (sales['Sales']>0)][clm].count()))
#Calculating the % of people who took part in each activity AND that bought the product
for clm in sales.columns:
    basic_stats.iloc[3][clm] = PercentRepr(basic_stats.iloc[2][clm] / basic_stats.iloc[0][clm])

#Present the table
basic_stats
Note: This actually changes the data in your dataframe! If you want to do further processing with the data of rows 1 and 3, you should be aware that these rows don't contain float objects anymore.
Here's one way, kind of a hack, but if it's simply for pretty printing, it'll work.
df = pd.DataFrame(np.random.random(20).reshape(4,5))
# first and third rows display as integers
df.loc[0,] = df.loc[0,]*100
df.loc[2,] = df.loc[2,]*100
df.loc[0,:] = df.loc[0,:].astype(int).astype(str)
df.loc[2,:] = df.loc[2,:].astype(int).astype(str)
# second and fourth rows display as percents (with 2 decimals)
df.loc[1,:] = np.round(df.loc[1,:].values.astype(float),4).astype(float)*100
df.loc[3,:] = np.round(df.loc[3,:].values.astype(float),4).astype(float)*100
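As an aside (not part of either answer), pandas' Styler can produce a similar display without changing the underlying data. A sketch, assuming the row labels from the question and the values as computed there (the percentage rows are already multiplied by 100):
styled = (
    basic_stats.style
    .format("{:,.0f}", subset=pd.IndexSlice[['# of Poeple', '# Purchased per Activity'], :])
    .format("{:.2f}%", subset=pd.IndexSlice[['% of Poeple', '% Purchased per Activity'], :])
)
styled  # display this in a Jupyter cell; basic_stats itself keeps its float values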