The Jena Climate code is as follows:
import numpy as np
import os
from matplotlib import pyplot as plt
from numba import vectorize
f=open('jena.csv')
data=f.read()
f.close()
lines=data.split('\n')
header=lines[0].split(',')
lines=lines[1:]
print(header)
N=len(lines)
print(N)
float_data=np.zeros((len(lines),len(header)-1))
for i, line in enumerate(lines):
values=[float(x) for x in line.split(',')[1:]]
float_data[i,:]=values
mean=float_data[:200000].mean(axis=0)
float_data -=mean
std=float_data[:200000].std(axis=0)
float_data/=std
def generator(data,lookback,delay,min_index,max_index,shuffle=False,batch_size=128,step=6):
if max_index is None:
max_index=len(data)-delay-1
i=min_index+lookback
while 1:
if shuffle:
rows=np.random.randint(
min_index+lookback,max_index,size=batch_size)
else:
if i + batch_size>=max_index:
i=min_index+lookback
rows=np.arange(i,min(i+batch_size,max_index))
i+=len(rows)
samples=np.zeros((len(rows),lookback//step,data.shape[-1]))
targets=np.zeros((len(rows),))
for j, row in enumerate(rows):
indices=range(rows[j]-lookback,rows[j],step)
samples[j]=data[indices]
targets[j]=data[rows[j]+delay][1]
yield samples, targets
lookback=1440
step=6
delay=144
batch_size=128
train_gen=generator(float_data,lookback=lookback,delay=delay,min_index=0,max_index=200000,shuffle=True,step=step,batch_size=batch_size)
val_gen=generator(float_data,lookback=lookback,delay=delay,min_index=200001,max_index=300000,step=step,batch_size=batch_size)
test_gen=generator(float_data,lookback=lookback,delay=delay,min_index=300001,max_index=None,step=step,batch_size=batch_size)
val_steps=(300000-200001-lookback)
test_steps=(len(float_data)-300001-lookback)
def evaluate_naive_method():
    batch_maes=[]
    for step in range(val_steps):
        samples,targets=next(val_gen)
        # naive baseline: predict that the temperature 24 hours from now
        # equals the last temperature in the input window
        preds=samples[:,-1,1]
        mae=np.mean(np.abs(preds-targets))
        batch_maes.append(mae)
    print(np.mean(batch_maes))
evaluate_naive_method()
When I execute the code, it uses the CPU and takes approximately 14 minutes to produce the MAE.
I want to use TensorFlow with the GPU in this section so that the output is produced faster:
for step in range(val_steps):
    samples,targets=next(val_gen)
    preds=samples[:,-1,1]
    mae=np.mean(np.abs(preds-targets))
    batch_maes.append(mae)
Should I convert the variables "samples" and "targets" to TensorFlow so that I can get the output faster? If so, how can I convert them to TensorFlow?
TensorFlow does what you want; please have a look at the guide below on using a GPU:
https://www.tensorflow.org/guide/using_gpu
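As a minimal sketch of that idea (assuming TensorFlow 2.x with eager execution, and the same naive baseline as above), you can convert each NumPy batch with tf.convert_to_tensor and compute the MAE with TensorFlow ops, which are placed on the GPU when one is visible:

import tensorflow as tf

def evaluate_naive_method_tf():
    batch_maes = []
    for step in range(val_steps):
        samples, targets = next(val_gen)
        # convert the NumPy batches to tensors; TensorFlow places
        # these ops on the GPU automatically when one is available
        samples_t = tf.convert_to_tensor(samples, dtype=tf.float32)
        targets_t = tf.convert_to_tensor(targets, dtype=tf.float32)
        preds_t = samples_t[:, -1, 1]  # naive prediction, as above
        mae = tf.reduce_mean(tf.abs(preds_t - targets_t))
        batch_maes.append(float(mae))
    print(np.mean(batch_maes))

Note that much of the time here is spent in the Python generator itself, so moving only the MAE arithmetic to the GPU may give a modest speedup.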
Related
I am trying to train an ML model using Dask. I am training on my local machine with 1 GPU. My GPU has 24 GiB of memory.
from dask_cuda import LocalCUDACluster
from dask.distributed import Client, LocalCluster
import dask.dataframe as dd
import pandas as pd
import numpy as np
import os
import xgboost as xgb
np.random.seed(42)
def get_columns(filename):
return pd.read_csv(filename, nrows=10).iloc[:, :NUM_FEATURES].columns
def get_data(filename, target):
import dask_cudf
X = dask_cudf.read_csv(filename)
# X = dd.read_csv(filename, assume_missing=True)
y = X[[target]]
X = X.iloc[:, :NUM_FEATURES]
return X, y
def main(client: Client) -> None:
X, y = get_data(FILENAME, TARGET)
model = xgb.dask.DaskXGBRegressor(
tree_method="gpu_hist",
objective="reg:squarederror",
seed=42,
max_depth=5,
eta=0.01,
n_estimators=10)
model.client = client
model.fit(X, y, eval_set=[(X, y)])
print("Saving the model..")
model.get_booster().save_model("xgboost.model")
print("Doing model importance..")
columns = get_columns(FILENAME)
pd.Series(model.feature_importances_, index=columns).sort_values(ascending=False).to_pickle("~/yolo.pkl")
if __name__ == "__main__":
os.environ["MALLOC_TRIM_THRESHOLD_"]="65536"
with LocalCUDACluster(device_memory_limit="15 GiB", rmm_pool_size="20 GiB") as cluster:
# with LocalCluster() as cluster:
with Client(cluster) as client:
print(client)
main(client)
The error is as follows.
MemoryError: std::bad_alloc: out_of_memory: RMM failure at:/workspace/.conda-bld/work/include/rmm/mr/device/pool_memory_resource.hpp:192: Maximum pool size exceeded
Basically my GPU runs out of memory when I call model.fit. It works when I use a csv with 64100 rows and fails when I use a csv with 128198 rows (2x rows). These aren't large files so I assume I am doing something wrong.
I have tried fiddling around with
LocalCUDACluster: device_memory_limit and rmm_pool_size
dask_cudf.read_csv: chunksize
Nothing has worked.
I have been stuck on this all day so any help would be much appreciated.
You cannot train an XGBoost model when the model grows larger than the remaining GPU memory. You can scale out with dask_xgboost, but you need to ensure that the total GPU memory is sufficient.
Here is a great blog on this by Coiled: https://coiled.io/blog/dask-xgboost-python-example/
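For example (a sketch with hypothetical device IDs), dask_cuda's LocalCUDACluster can spread training across several GPUs so that the total GPU memory is larger:

from dask_cuda import LocalCUDACluster
from dask.distributed import Client

# use two GPUs instead of one; the device IDs are hypothetical
with LocalCUDACluster(CUDA_VISIBLE_DEVICES="0,1") as cluster:
    with Client(cluster) as client:
        main(client)  # the main() from the question above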
This is my Python code for model prediction.
import csv
import numpy as np
np.random.seed(1)
from keras.models import load_model
import tensorflow as tf
import pandas as pd
import time
output_location='Desktop/result/'
#load model
global graph
graph = tf.get_default_graph()
model = load_model("newmodel.h5")
def Myfun():
ecg = pd.read_csv('/Downloads/model.csv')
X = ecg.iloc[:,1:42].values
y = ecg.iloc[:,42].values
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y1 = encoder.fit_transform(y)
Y = pd.get_dummies(y1).values
from sklearn.model_selection import train_test_split
X_train,X_test, y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=0)
    t1 = time.time()  # time the prediction using the time module imported above
    with graph.as_default():
        prediction = model.predict(X_test[0:1])
    diff = time.time() - t1
class_labels_predicted = np.argmax(prediction)
filename1=str(i)+"output.txt"
newfile=output_location+filename1
with open(str(newfile),'w',encoding = 'utf-8') as file:
file.write(" takes %f seconds time. predictedclass is %s \n" %(diff,class_labels_predicted))
return class_labels_predicted
for i in range(1,100):
Myfun()
My system's GPU is 2 GB in size. While running this code, nvidia-smi -l 2 shows it consumes 1.8 GB of GPU memory, and 100 files are produced as the result. Soon after the task completes, GPU utilisation drops back to 500 MB. I have the GPU versions of TensorFlow and Keras installed on my system. My question is:
Why does this code run on the GPU? Does the complete code use the GPU, or only the imported libraries such as keras-gpu and tensorflow-gpu?
As I can see from your code, you are using Keras with TensorFlow. From the Keras F.A.Q.:
If you are running on the TensorFlow or CNTK backends, your code will automatically run on GPU if any available GPU is detected.
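To verify that a GPU is actually detected (a quick check, assuming the TensorFlow 1.x API that the question's code already uses):

from tensorflow.python.client import device_lib

# lists the CPU and GPU devices visible to TensorFlow
print(device_lib.list_local_devices())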
You can force Keras to run on CPU only by setting these environment variables before importing TensorFlow or Keras:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = ""
I have daily time series data covering almost 2 years of cluster available space (in GB). I am trying to use Facebook's Prophet to make future forecasts, but some forecasts come out negative. Since negative values make no sense here, I read that specifying a carrying capacity with the logistic growth model helps eliminate negative forecasts via cap values. I am not sure whether this is applicable to my case, nor how to get the cap value for my time series. Please help, as I am new to this and confused. I am using Python 3.6.
import numpy as np
import pandas as pd
import xlrd
import openpyxl
from pandas import datetime
import csv
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from fbprophet import Prophet
import os
import sys
import signal
df = pd.read_excel("Data_Per_day.xlsx")
df1=df.filter(['cluster_guid','date','avail_capacity'],axis=1)
uniquevalues = np.unique(df1[['cluster_guid']].values)
for id in uniquevalues:
newdf = df1[df1['cluster_guid'] == id]
    newdf1=newdf.groupby(['cluster_guid','date'],as_index=False)['avail_capacity'].sum()
#newdf11=newdf.groupby(['cluster_guid','date'],as_index=False)['total_capacity'].sum()
#cap[id]=newdf11['total_capacity'].max()
#print(cap[id])
newdf1.set_index('cluster_guid', inplace=True)
newdf1.to_csv('my_csv.csv', mode='a',header=None)
with open('my_csv.csv',newline='') as f:
r = csv.reader(f)
data = [line for line in r]
with open('my_csv.csv','w',newline='') as f:
w = csv.writer(f)
w.writerow(['cluster_guid','DATE_TAKEN','avail_capacity'])
w.writerows(data)
in_df = pd.read_csv('my_csv.csv', parse_dates=True, index_col='DATE_TAKEN' )
in_df.to_csv('my_csv.csv')
dfs= pd.read_csv('my_csv.csv')
uni=dfs.cluster_guid.unique()
while True:
try:
print(" Press Ctrl +C to exit or enter the cluster guid to be forcasted")
i=input('Please enter the cluster guid')
if i not in uni:
print( 'Please enter a valid cluster guid')
continue
else:
dfs1=dfs.loc[df['cluster_guid'] == i]
dfs1.drop('cluster_guid', axis=1, inplace=True)
dfs1.to_csv('dataframe'+i+'.csv', index=False)
dfs2=pd.read_csv('dataframe'+i+'.csv')
dfs2['DATE_TAKEN'] = pd.DatetimeIndex(dfs2['DATE_TAKEN'])
dfs2 = dfs2.rename(columns={'DATE_TAKEN': 'ds','avail_capacity': 'y'})
my_model = Prophet(interval_width=0.99)
my_model.fit(dfs2)
future_dates = my_model.make_future_dataframe(periods=30, freq='D')
forecast = my_model.predict(future_dates)
print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']])
my_model.plot(forecast,uncertainty=True)
my_model.plot_components(forecast)
plt.show()
os.remove('dataframe'+i+'.csv')
os.remove('my_csv.csv')
except KeyboardInterrupt:
try:
os.remove('my_csv.csv')
except OSError:
pass
sys.exit(0)
A Box-Cox transform of order 0 (i.e., a log transform) does the trick. Here are the steps:
1. Add 1 to each value (to avoid log(0))
2. Take the natural log of each value
3. Make forecasts
4. Take the exponent and subtract 1
This way you will not get negative forecasts. The log also has the nice property of converting multiplicative seasonality to an additive form.
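A minimal sketch of these steps around Prophet (assuming, as in the question, a DataFrame dfs2 with columns ds and y holding the dates and available capacity):

import numpy as np
from fbprophet import Prophet

# dfs2 has columns 'ds' (dates) and 'y' (available capacity)
dfs2['y'] = np.log1p(dfs2['y'])              # steps 1-2: log(1 + y)

my_model = Prophet(interval_width=0.99)
my_model.fit(dfs2)
future_dates = my_model.make_future_dataframe(periods=30, freq='D')
forecast = my_model.predict(future_dates)    # step 3: forecast in log space

# step 4: exponentiate and subtract 1 to return to the original scale
for col in ['yhat', 'yhat_lower', 'yhat_upper']:
    forecast[col] = np.expm1(forecast[col])

With this approach there is no need to choose a carrying capacity, since forecasting in log space already rules out negative values.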
I am interested in applying Henze-Zirkler's Multivariate Normality Test in Python 3.x, and I was wondering whether I can do so in Python in a Jupyter notebook.
I have fitted a VAR model to my data, and I would now like to test whether the residuals from this fitted VAR model are normally distributed.
How can I do this in a Jupyter notebook using Python?
This is another answer, since I discovered this method later. If you do not want to import an R library into Python, you can call R from Python instead, i.e. you can invoke an R function from Python as follows:
import rpy2.robjects as robjects
from rpy2.robjects import r
from rpy2.robjects.numpy2ri import numpy2ri
from rpy2.robjects.packages import importr
import numpy as np
import pandas as pd  # needed for the DataFrame below
Suppose that resi is a DataFrame in Python, say:
# Create data
resi = pd.DataFrame(np.random.random((108, 2)), columns=['Number1','Number2'])
Then the code is as follows:
# Converting the dataframe from Python to R
# first take the values of the dataframe into numpy
resi1=np.array(resi, dtype=float)
# Taking the variable from Python to R
r_resi = numpy2ri(resi1)
# Creating this variable in R (from python)
r.assign("resi", r_resi)
# Calling libraries in R
r('library("MVN")')
# Calling a function in R (from python)
r("res <- hzTest(resi, qqplot = F)")
# Retrieving information from R to Python
r_result = r("res")
# Printing the output in python
print(r_result)
This will generate the output:
Henze-Zirkler's Multivariate Normality Test
---------------------------------------------
data : resi
HZ : 2.841424
p-value : 1.032563e-06
Result : Data are not multivariate normal.
---------------------------------------------
Update per 2021-08-25
There have been some API changes, both to the MVN package and to rpy2. The following works with MVN version 5.9 and rpy2 version 3.4.
"""Interface file to access the R MVN package"""
import numpy as np
import rpy2.robjects.packages as rpackages
from rpy2.robjects import numpy2ri
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import StrVector
# Install packages, if they are not already installed
packages_to_install_if_needed = ("MVN",)
utils = rpackages.importr("utils")
utils.chooseCRANmirror(ind=1) # select the first mirror in the list
names_to_install = [x for x in packages_to_install_if_needed if not rpackages.isinstalled(x)]
if len(names_to_install) > 0:
utils.install_packages(StrVector(names_to_install))
# load the package
mvn = importr("MVN")
# Generate data
np_arr = np.random.multivariate_normal(np.ones(2), np.eye(2), size=100)
# activate automatic conversion from numpy to rpy2 interface objects
numpy2ri.activate()
# perform the work
res = mvn.mvn(np_arr)
print(res)
outputting
$multivariateNormality
Test HZ p value MVN
1 Henze-Zirkler 0.3885607 0.8343017 YES
$univariateNormality
Test Variable Statistic p value Normality
1 Anderson-Darling Column1 0.2443 0.7569 YES
2 Anderson-Darling Column2 0.3935 0.3692 YES
$Descriptives
n Mean Std.Dev Median Min Max 25th 75th
1 100 0.9619135 1.0353688 1.0222279 -1.994833 3.679615 0.2696537 1.758255
2 100 0.7664778 0.9134449 0.8121996 -1.568635 2.648268 0.2068718 1.418113
Skew Kurtosis
1 -0.2123274 -0.16171832
2 -0.3718904 -0.05279222
There is an open-source Python package called Pingouin that provides the Henze-Zirkler multivariate normality test and is tested against R's MVN.
https://pingouin-stats.org/generated/pingouin.multivariate_normality.html
Example extracted from the docs:
import pingouin as pg
data = pg.read_dataset('multivariate')
X = data[['Fever', 'Pressure', 'Aches']]
pg.multivariate_normality(X, alpha=.05)
>>> HZResults(hz=0.5400861018514641, pval=0.7173686509624891, normal=True)
There is a package in R that already does this test, and it is called MVN.
The first thing you have to do is import MVN into Python, as described here.
Then go to your Jupyter notebook and fit the VAR(1) model to your data, like so:
# Fit VAR(1) Model
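# 'Model' is assumed to be a statsmodels VAR instance built from your data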
results = Model.fit(1)
results.summary()
Store the residuals as resi
resi=results.resid
Then
# Call function from R
import os
os.environ['R_USER'] = '...\Lib\site-packages\rpy2'
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()
from rpy2.robjects.packages import importr
MVN = importr("MVN", lib_loc = "C:/.../R/win-library/3.3")
After importing MVN, you can simply run the normality test like so:
MVNresult =MVN.hzTest(resi, qqplot = 0)
If you run
type(MVNresult)
you will find that it is an
rpy2.robjects.methods.RS4
Therefore, in this case, you will find this link very helpful for explaining the details.
Then run
tuple(MVNresult.slotnames())
This will show you the slot names:
('HZ', 'p.value', 'dname', 'dataframe')
Then you can get the values like so:
np.array(MVNresult.slots[tuple(MVNresult.slotnames())[i]])[0]
where i is 0, 1, 2, or 3, standing for 'HZ', 'p.value', and so on.
So if the p-value (i.e. i=1) is less than 0.05, then the residuals (resi) are not multivariate normal at the 5% significance level.
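For example (a small sketch using the slot names above):

import numpy as np

slots = tuple(MVNresult.slotnames())              # ('HZ', 'p.value', 'dname', 'dataframe')
hz = np.array(MVNresult.slots[slots[0]])[0]       # test statistic
p_value = np.array(MVNresult.slots[slots[1]])[0]  # p-value
if p_value < 0.05:
    print("Residuals are not multivariate normal at the 5% level")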
I am currently trying to use librosa to perform an STFT such that the parameters resemble the STFT process of a different framework (Kaldi).
The audio file is fash-b-an251
Kaldi does it with a sample frequency of 16 kHz, window_size = 400 (25 ms), and hop_length = 160 (10 ms).
The spectrogram extracted from this looks like this:
I then tried to do the same using librosa:
import numpy as np
import sys
import librosa
import os
import scipy
import matplotlib.pyplot as plt
from matplotlib import cm
# Input parameter
# relative_path_to_file
if len(sys.argv) < 2:  # the script itself is sys.argv[0]
print "Missing Arguments!"
print "python spectogram_librosa.py path_to_audio_file"
sys.exit()
path = sys.argv[1]
abs_path = os.path.abspath(path)
spectogram_dnn = "/home/user/dnn/spectogram"
if not os.path.exists(spectogram_dnn):
print "spectogram_dnn folder didn't exist!"
os.makedirs(spectogram_dnn)
print "Created!"
y,sr = librosa.load(abs_path,sr=16000)
D = librosa.logamplitude(np.abs(librosa.core.stft(y, win_length=400, hop_length=160, window=scipy.signal.hanning,center=False)), ref_power=np.max)
librosa.display.specshow(D,sr=16000,hop_length=160, x_axis='time', y_axis='log', cmap=cm.jet)
plt.colorbar(format='%+2.0f dB')
plt.title('Log power spectrogram')
plt.show()
raw_input()
sys.exit()
This is basically taken from here:
I've modified the STFT call so that it fits my parameters.
The problem is that it creates an entirely different plot.
So, what am I doing wrong in librosa? Why is this plot so different from the one created in Kaldi? Am I missing something?
It has to do with the Hz scale. The one in the first image is linear, while the one in the second image is logarithmic. You can fix it by changing the scale in either image to match the other.
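For example (a minimal sketch reusing the question's D, sample rate, hop length, and colormap), you could switch the librosa plot to a linear frequency axis:

import librosa.display  # specshow lives in the display submodule

# plot with a linear frequency axis to match the Kaldi spectrogram
librosa.display.specshow(D, sr=16000, hop_length=160,
                         x_axis='time', y_axis='linear', cmap=cm.jet)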