Pandas pandarallel parallel_aply - python-3.x

Here is a simple program that works in parallel. But in has an issue when I want to use a previous result to apply.
import pandas as pd
import numpy as np
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=8) # nb_workers=NUMBER_OF_CPU_CORES
def dummy_fit(x, y_hint=0.5):
# Imagine quite a complicated code here
# y_hint is a previous fit. When it is not given, use default
y = (x.mean() + y_hint) / 2
return y
df = pd.DataFrame(np.random.random((10, 3)), columns=list("ABC"))
print("data:\n", df)
result = df.parallel_apply(dummy_fit, axis=1)
print(result)
We can use a global variable, but it is only one (we have more threads)
How to make it work in parallel?

Related

How do i write this piece of code most efficiently using pandas and numpy

I have written a piece of code using pandas and numpy on my dataframe. It works but i am wondering how i can define a function to do the same thing and apply it to my dataframe.
import pandas
def f(row):
if row['time_download'] == 0:
val = 0
else:
val = (row['volume_download'] - row['volume_last_second'])/(row['time_download']/1000)
return val
data['throughput'] = data.apply(f, axis=1)
this is extremely slow as my dataset is pretty large in the order of millions of rows
import pandas as pd
import numpy as np
from numpy import where
no_download = data["time_download"] == 0
data["throughput"] = where(no_download, 0, \
((data["volume_download"].where(data["time_download"] != 0) - \
data["volume_last_second"].where(data["time_download"] != 0))/\
(data["time_download"].where(data["time_download"] != 0)/1000)))
i am receiving the expected results but i am a python beginner and am interested in understanding how to write this into a function since this is applicable to several columns grouped similarly,
You can use the built in pandas.DataFrame.div function then convert the Inf values to 0.
import numpy as np
import pandas as pd
# This will divide normally if time_download is not 0, otherwise will return Inf
df['throughput'] = (df.volume_download - df.volume_last_second).div(df.time_download)
# Locate all cells where throughput value is not finite (i.e. Inf) and set to 0
df.loc[~np.isfinite(df['throughput']), 'throughput'] = 0

While creating dummy variables getting memory error

I am working on a project and While getting the dummies value i was getting memory exception
I have tried using .astype(np.int8) and i have also tried writing exception handling code by importing psutil
i am using below code
dummy_cols = ['emp_title','grade','home_ownership','verification_status','addr_state','pub_rec','application_type']
df_dummies = pd.get_dummies(df[dummy_cols], drop_first = True)
It's not working and throwing an error
pandas.get_dummies creates a dense representation of dummy variables, which may request lots of memory depending on the number of levels in the categorical features.
I would prefer scikit-learn.preprocessing.OneHotEncoder that outputs sparse matrices:
The code would look like this :
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Create a fake dataframe
df = pd.DataFrame(
{
"df1": np.random.choice(["a", "b"], 100),
"df2": np.random.choice(["c", "d"], 100)
}
)
dummy_cols = ["df1", "df2"]
# LabelEncode categoricals
for f in dummy_cols:
df[f] = LabelEncoder().fit_transform(df[f])
# Transform to dummies in sparse representation (csr_matrix)
df_dummies = OneHotEncoder().fit_transform(df[dummy_cols])

negative forcasts using facebook prophet

I have a daily time series data for almost 2 years for cluster available space (in GB). I am trying to to use facebook's prophet to do future forecasts. Some forecasts have negative values. Since negative values does not make sense I saw that using carrying capacity for logistic growth model helps in eliminating negative forecasts with cap values. I am not sure if this is applicable for this case and how to get the cap value for my time series. Please help as I am new to this and confused. I am using Python 3.6
import numpy as np
import pandas as pd
import xlrd
import openpyxl
from pandas import datetime
import csv
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from fbprophet import Prophet
import os
import sys
import signal
df = pd.read_excel("Data_Per_day.xlsx")
df1=df.filter(['cluster_guid','date','avail_capacity'],axis=1)
uniquevalues = np.unique(df1[['cluster_guid']].values)
for id in uniquevalues:
newdf = df1[df1['cluster_guid'] == id]
newdf1=newdf.groupby(['cluster_guid','date'],as_index=False['avail_capacity'].sum()
#newdf11=newdf.groupby(['cluster_guid','date'],as_index=False)['total_capacity'].sum()
#cap[id]=newdf11['total_capacity'].max()
#print(cap[id])
newdf1.set_index('cluster_guid', inplace=True)
newdf1.to_csv('my_csv.csv', mode='a',header=None)
with open('my_csv.csv',newline='') as f:
r = csv.reader(f)
data = [line for line in r]
with open('my_csv.csv','w',newline='') as f:
w = csv.writer(f)
w.writerow(['cluster_guid','DATE_TAKEN','avail_capacity'])
w.writerows(data)
in_df = pd.read_csv('my_csv.csv', parse_dates=True, index_col='DATE_TAKEN' )
in_df.to_csv('my_csv.csv')
dfs= pd.read_csv('my_csv.csv')
uni=dfs.cluster_guid.unique()
while True:
try:
print(" Press Ctrl +C to exit or enter the cluster guid to be forcasted")
i=input('Please enter the cluster guid')
if i not in uni:
print( 'Please enter a valid cluster guid')
continue
else:
dfs1=dfs.loc[df['cluster_guid'] == i]
dfs1.drop('cluster_guid', axis=1, inplace=True)
dfs1.to_csv('dataframe'+i+'.csv', index=False)
dfs2=pd.read_csv('dataframe'+i+'.csv')
dfs2['DATE_TAKEN'] = pd.DatetimeIndex(dfs2['DATE_TAKEN'])
dfs2 = dfs2.rename(columns={'DATE_TAKEN': 'ds','avail_capacity': 'y'})
my_model = Prophet(interval_width=0.99)
my_model.fit(dfs2)
future_dates = my_model.make_future_dataframe(periods=30, freq='D')
forecast = my_model.predict(future_dates)
print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']])
my_model.plot(forecast,uncertainty=True)
my_model.plot_components(forecast)
plt.show()
os.remove('dataframe'+i+'.csv')
os.remove('my_csv.csv')
except KeyboardInterrupt:
try:
os.remove('my_csv.csv')
except OSError:
pass
sys.exit(0)
Box-Cox transform of order 0 get the trick done. Here are the steps:
1. Add 1 to each values (so as to avoid log(0))
2. Take natural log of each value
3. Make forecasts
4. Take exponent and subtract 1
This way you will not get negative forecasts. Also log have a nice property of converting multiplicative seasonality to additive form.

How do I map df column values to hex color in one go?

I have a pandas dataframe with two columns. One of the columns values needs to be mapped to colors in hex. Another graphing process takes over from there.
This is what I have tried so far. Part of the toy code is taken from here.
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Create dataframe
df = pd.DataFrame(np.random.randint(0,21,size=(7, 2)), columns=['some_value', 'another_value'])
# Add a nan to handle realworld
df.iloc[-1] = np.nan
# Try to map values to colors in hex
# # Taken from here
norm = matplotlib.colors.Normalize(vmin=0, vmax=21, clip=True)
mapper = plt.cm.ScalarMappable(norm=norm, cmap=plt.cm.viridis)
df['some_value_color'] = df['some_value'].apply(lambda x: mapper.to_rgba(x))
df
Which outputs:
How do I convert 'some_value' df column values to hex in one go?
Ideally using the sns.cubehelix_palette(light=1)
I am not opposed to using something other than matplotlib
Thanks in advance.
You may use matplotlib.colors.to_hex() to convert a color to hexadecimal representation.
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
# Create dataframe
df = pd.DataFrame(np.random.randint(0,21,size=(7, 2)), columns=['some_value', 'another_value'])
# Add a nan to handle realworld
df.iloc[-1] = np.nan
# Try to map values to colors in hex
# # Taken from here
norm = matplotlib.colors.Normalize(vmin=0, vmax=21, clip=True)
mapper = plt.cm.ScalarMappable(norm=norm, cmap=plt.cm.viridis)
df['some_value_color'] = df['some_value'].apply(lambda x: mcolors.to_hex(mapper.to_rgba(x)))
df
Efficiency
The above method it easy to use, but may not be very efficient. In the folling let's compare some alternatives.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
def create_df(n=10):
# Create dataframe
df = pd.DataFrame(np.random.randint(0,21,size=(n, 2)),
columns=['some_value', 'another_value'])
# Add a nan to handle realworld
df.iloc[-1] = np.nan
return df
The following is the solution from above. It applies the conversion to the dataframe row by row. This quite inefficient.
def apply1(df):
# map values to colors in hex via
# matplotlib to_hex by pandas apply
norm = mcolors.Normalize(vmin=np.nanmin(df['some_value'].values),
vmax=np.nanmax(df['some_value'].values), clip=True)
mapper = plt.cm.ScalarMappable(norm=norm, cmap=plt.cm.viridis)
df['some_value_color'] = df['some_value'].apply(lambda x: mcolors.to_hex(mapper.to_rgba(x)))
return df
That's why we might choose to calculate the values into a numpy array first and just assign this array as the newly created column.
def apply2(df):
# map values to colors in hex via
# matplotlib to_hex by assigning numpy array as column
norm = mcolors.Normalize(vmin=np.nanmin(df['some_value'].values),
vmax=np.nanmax(df['some_value'].values), clip=True)
mapper = plt.cm.ScalarMappable(norm=norm, cmap=plt.cm.viridis)
a = mapper.to_rgba(df['some_value'])
df['some_value_color'] = np.apply_along_axis(mcolors.to_hex, 1, a)
return df
Finally we may use a look up table (LUT) which is created from the matplotlib colormap, and index the LUT by the normalized data. Because this solution needs to create the LUT first, it is rather ineffienct for dataframes with less entries than the LUT has colors, but will pay off for large dataframes.
def apply3(df):
# map values to colors in hex via
# creating a hex Look up table table and apply the normalized data to it
norm = mcolors.Normalize(vmin=np.nanmin(df['some_value'].values),
vmax=np.nanmax(df['some_value'].values), clip=True)
lut = plt.cm.viridis(np.linspace(0,1,256))
lut = np.apply_along_axis(mcolors.to_hex, 1, lut)
a = (norm(df['some_value'].values)*255).astype(np.int16)
df['some_value_color'] = lut[a]
return df
Compare the timings
Let's take a dataframe with 10000 rows.
df = create_df(10000)
Original solution (apply1)
%timeit apply1(df)
2.66 s per loop
Array solution (apply2)
%timeit apply2(df)
240 ms per loop
LUT solution (apply3)
%timeit apply1(df)
7.64 ms per loop
In this case the LUT solution gives almost a factor 400 of improvement.

Dask: How would I parallelize my code with dask delayed?

This is my first venture into parallel processing and I have been looking into Dask but I am having trouble actually coding it.
I have had a look at their examples and documentation and I think dask.delayed will work best. I attempted to wrap my functions with the delayed(function_name), or add an #delayed decorator, but I can't seem to get it working properly. I preferred Dask over other methods since it is made in python and for its (supposed) simplicity. I know dask doesn't work on the for loop, but they say it can work inside a loop.
My code passes files through a function that contains inputs to other functions and looks like this:
from dask import delayed
filenames = ['1.csv', '2.csv', '3.csv', etc. etc. ]
for count, name in enumerate(filenames)"
name = name.split('.')[0]
....
then do some pre-processing ex:
preprocess1, preprocess2 = delayed(read_files_and_do_some_stuff)(name)
then I call a constructor and pass the pre_results in to the function calls:
fc = FunctionCalls()
Daily = delayed(fc.function_runs)(filename=name, stringinput='Daily',
input_data=pre_result1, model1=pre_result2)
What i do here is I pass the file into the for loop, do some pre-processing and then pass the file into two models.
Thoughts or tips on how to do parallelize this? I began getting odd errors and I had no idea how to fix the code. The code does work as is. I use a bunch of pandas dataframes, series, and numpy arrays, and I would prefer not to go back and change everything to work with dask.dataframes etc.
The code in my comment may be difficult to read. Here it is in a more formatted way.
In the code below, when I type print(mean_squared_error) I just get: Delayed('mean_squared_error-3009ec00-7ff5-4865-8338-1fec3f9ed138')
from dask import delayed
import pandas as pd
from sklearn.metrics import mean_squared_error as mse
filenames = ['file1.csv']
for count, name in enumerate(filenames):
file1 = pd.read_csv(name)
df = pd.DataFrame(file1)
prediction = df['Close'][:-1]
observed = df['Close'][1:]
mean_squared_error = delayed(mse)(observed, prediction)
You need to call dask.compute to eventually compute the result. See dask.delayed documentation.
Sequential code
import pandas as pd
from sklearn.metrics import mean_squared_error as mse
filenames = [...]
results = []
for count, name in enumerate(filenames):
file1 = pd.read_csv(name)
df = pd.DataFrame(file1) # isn't this already a dataframe?
prediction = df['Close'][:-1]
observed = df['Close'][1:]
mean_squared_error = mse(observed, prediction)
results.append(mean_squared_error)
Parallel code
import dask
import pandas as pd
from sklearn.metrics import mean_squared_error as mse
filenames = [...]
delayed_results = []
for count, name in enumerate(filenames):
df = dask.delayed(pd.read_csv)(name)
prediction = df['Close'][:-1]
observed = df['Close'][1:]
mean_squared_error = dask.delayed(mse)(observed, prediction)
delayed_results.append(mean_squared_error)
results = dask.compute(*delayed_results)
A much clearer solution, IMO, than the accepted answer is this snippet.
from dask import compute, delayed
import pandas as pd
from sklearn.metrics import mean_squared_error as mse
filenames = [...]
def compute_mse(file_name):
df = pd.read_csv(file_name)
prediction = df['Close'][:-1]
observed = df['Close'][1:]
return mse(observed, prediction)
delayed_results = [delayed(compute_mse)(file_name) for file_name in filenames]
mean_squared_errors = compute(*delayed_results, scheduler="processes")

Resources