Fastai for time series regression - pytorch

So I have been using the fastai library for a couple of years now. Recently, I came across its extension library dedicated to time series analysis - tsai.
I am trying to perform a simple regression task on the famous AirPassengers dataset.
I have no idea what I am doing wrong:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as np # linear algebra
import torch
import random
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
# fastai
from fastai import *
from fastai.text import *
from fastai.text.all import *
from tsai.all import *
flight_data = sns.load_dataset("flights")
flight_data.head(20)
scaler = MinMaxScaler(feature_range=(-1, 1))
# flight_data['passengers'] = scaler.fit_transform(flight_data['passengers'].values.reshape(-1, 1)).flatten()
plt.figure(figsize=(10, 4))
plt.plot(flight_data['passengers'])
def create_inout_sequences(input_data, tw):
    inout_seq = []
    label_seq = []
    L = len(input_data)
    for i in range(L-tw):
        train_seq = input_data[i:i+tw]
        train_label = input_data[i+tw:i+tw+1]
        inout_seq.append(train_seq)
        label_seq.append(train_label)
    return np.array(inout_seq), np.array(label_seq)
data = flight_data['passengers'].values
x, y = create_inout_sequences(data, 15)
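# flights has 144 monthly totals (1949-1960), so a window of 15 yields
# x with shape (129, 15) and y with shape (129, 1)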
src = itemify(x, y)
yy = y.reshape(-1)
xx = x.reshape(-1)
tfms = [None, [TSRegression()]]
batch_tfms = TSStandardize(by_sample=True, by_var=True)
dls = get_ts_dls(x, yy, tfms=tfms, bs=64)
dls.show_batch()
dls.one_batch()
dls.c
learn = ts_learner(dls, InceptionTime, metrics=[mae, rmse], cbs=ShowGraph())
learn.lr_find()
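For reference, here is a minimal sketch of how the same pipeline is usually wired up in tsai examples, assuming X is expected as [samples, variables, timesteps]; this is only a sketch, not a verified fix for the code above:
# add a singleton "variables" axis and cast to float32 before building the dataloaders
x3d = x.reshape(x.shape[0], 1, x.shape[1]).astype(np.float32)
yy = y.reshape(-1).astype(np.float32)
tfms = [None, [TSRegression()]]
batch_tfms = TSStandardize(by_sample=True)
dls = get_ts_dls(x3d, yy, tfms=tfms, batch_tfms=batch_tfms, bs=64)
learn = ts_learner(dls, InceptionTime, metrics=[mae, rmse])
learn.fit_one_cycle(10, 1e-3)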

Related

Annotating clustering from DBSCAN to original Pandas DataFrame

I have working code that uses DBSCAN to find tight groups in sparse spatial data imported with pd.read_csv.
I am keeping the original spatial data locations and would like to annotate each data point with the label returned by DBSCAN in the original dataframe, and then write a csv with the same information.
The code below is doing exactly what I would expect it to at this point; I would just like to extend it so the label for each row is added to the original dataframe.
import argparse
import string
import os, subprocess
import pathlib
import glob
import gzip
import re
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from sklearn.cluster import DBSCAN
X = pd.read_csv(tmp_csv_name)
X = X.drop('Name', axis = 1)
X = X.drop('Type', axis = 1)
X = X.drop('SomeValue', axis = 1)
# only columns 'x' and 'y' now remain
db=DBSCAN(eps=EPS, min_samples=minSamples, metric='euclidean', algorithm='auto', leaf_size=30).fit(X)
labels = db.labels_
unique_labels = set(labels)
# maxX , maxY are manual inputs temporarily
while sizeX > 16 or sizeY > 16:
    sizeX = sizeX*0.8; sizeY = sizeY*0.8
fig, ax = plt.subplots(figsize=(sizeX,sizeY))
plt.xlim(0,maxX)
plt.ylim(0,maxY)
plt.scatter(X['x'], X['y'], c=colors, marker="o", picker=True)
# hackX , hackY are manual inputs temporarily
# which represent the boundaries defined in the original dataset
poly = patches.Polygon(xy=list(zip(hackX,hackY)), fill=False)
ax.add_patch(poly)
plt.show()
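One straightforward way to do the annotation (a sketch, assuming no rows are dropped or reordered between reading the csv and fitting DBSCAN; 'clustered.csv' is a hypothetical output name): labels_ is aligned row-for-row with the data passed to fit, so it can be attached directly.
orig = pd.read_csv(tmp_csv_name)           # the full original frame, including the dropped columns
orig['cluster'] = db.labels_               # -1 marks noise points
orig.to_csv('clustered.csv', index=False)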

Drop the features that have low correlation with the target variable

I have loaded a dataset and tried to find the correlation coefficient of each feature with respect to the target variable.
Below is the code:
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
#Loading the dataset
x = load_boston()
df = pd.DataFrame(x.data, columns = x.feature_names)
df["MEDV"] = x.target
X = df.drop("MEDV",1) #Feature Matrix
y = df["MEDV"] #Target Variable
df.head()
#Using Pearson Correlation
plt.figure(figsize=(12,10))
cor = df.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()
#Correlation with output variable
cor_target = abs(cor["MEDV"])
#Selecting highly correlated features
relevant_features = cor_target[cor_target>0.4]
print(relevant_features)
How do I drop the features that have correlation coefficient < 0.4?
Try this:
#Selecting least correlated features
irelevant_features = cor_target[cor_target<0.4]
# list of irelevant_features
cols = list([i for i in irelevant_features.index])
#Dropping irelevant_features
df = df.drop(cols, axis=1)
Another way is to list the weakly correlated features and drop them (plus the target) by name:
irelevant_features = cor_target[cor_target < 0.4]
print(irelevant_features)
X = df.drop(['MEDV', 'CRIM', 'ZN', 'CHAS', 'AGE', 'DIS', 'RAD', 'B'], axis=1)
or simply iterate over irelevant_features.index, as written above, to build the list of columns to drop.
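For completeness, the same selection can be written compactly (a sketch that assumes df is still the original, unmodified frame):
low_corr_cols = cor_target[cor_target < 0.4].index     # features weakly correlated with MEDV
X = df.drop(columns=low_corr_cols).drop(columns='MEDV')
y = df['MEDV']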

Recovering features names of StandardScaler().fit_transform() with sklearn

Adapted from a Kaggle tutorial, I am trying to run the code below on data (available to download from here):
Code:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for plotting facilities
from datetime import datetime, date
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math
from sklearn.preprocessing import StandardScaler
df = pd.read_csv("./data/Aquifer_Petrignano.csv")
df['Date'] = pd.to_datetime(df.Date, format = '%d/%m/%Y')
df = df[df.Rainfall_Bastia_Umbra.notna()].reset_index(drop=True)
df = df.interpolate(method ='ffill')
df = df[['Date', 'Rainfall_Bastia_Umbra', 'Depth_to_Groundwater_P24', 'Depth_to_Groundwater_P25', 'Temperature_Bastia_Umbra', 'Temperature_Petrignano', 'Volume_C10_Petrignano', 'Hydrometry_Fiume_Chiascio_Petrignano']].resample('7D', on='Date').mean().reset_index(drop=False)
X = df.drop(['Depth_to_Groundwater_P24','Depth_to_Groundwater_P25','Date'], axis=1)
y1 = df.Depth_to_Groundwater_P24
y2 = df.Depth_to_Groundwater_P25
scaler = StandardScaler()
X = scaler.fit_transform(X)
model = xgb.XGBRegressor()
param_search = {'max_depth': range(1, 2, 2),
                'min_child_weight': range(1, 2, 2),
                'n_estimators': [1000],
                'learning_rate': [0.1]}
tscv = TimeSeriesSplit(n_splits=2)
gsearch = GridSearchCV(estimator=model, cv=tscv,
                       param_grid=param_search)
gsearch.fit(X, y1)
xgb_grid = xgb.XGBRegressor(**gsearch.best_params_)
xgb_grid.fit(X, y1)
ax = xgb.plot_importance(xgb_grid)
ax.figure.tight_layout()
ax.figure.savefig('test.png')
y_val = y1[-80:]
X_val = X[-80:]
y_pred = xgb_grid.predict(X_val)
print(mean_absolute_error(y_val, y_pred))
print(math.sqrt(mean_squared_error(y_val, y_pred)))
I plotted a feature importance figure, but the original feature names are hidden:
If I comment out these two lines:
scaler = StandardScaler()
X = scaler.fit_transform(X)
I get the output:
How could I use scaler.fit_transform() for X and get a feature importance plot with the original feature names?
The reason behind this is that StandardScaler returns a numpy.ndarray of your feature values (same shape as pandas.DataFrame.values, but now standardized and without the column names), so you need to convert it back to a pandas.DataFrame with the same column names.
Here's the part of your code that needs changing.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
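On recent scikit-learn versions (1.2 or later, an assumption about your setup), another option is to ask the transformer to return a DataFrame directly, which also preserves the column names:
scaler = StandardScaler().set_output(transform="pandas")
X = scaler.fit_transform(X)   # X stays a DataFrame with the original columns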

No output for seaborn distplot

I was trying to plot a seaborn distplot.
sample code:
import pandas as pd
import seaborn as sns
import numpy as np
import scipy
import matplotlib.pyplot as plt
# data
np.random.seed(365)
x1 = np.random.normal(10, 3.4, size=1000) # mean of 10
df = pd.DataFrame({'x1': x1})
def map_pdf(x, **kwargs):
    mu, std = scipy.stats.norm.fit(x)
    x0, x1 = p1.axes[0][0].get_xlim()  # axes for p1 is required to determine x_pdf
    x_pdf = np.linspace(x0, x1, 100)
    y_pdf = scipy.stats.norm.pdf(x_pdf, mu, std)
    plt.plot(x_pdf, y_pdf, c='r')

p1 = sns.displot(data=df, x='x1', kind='hist', bins=40, stat='density')
p1.map(map_pdf, 'x1')
Not sure why I am not getting any output after executing the above code.
Upon executing it, all I get is this:
<seaborn.axisgrid.FacetGrid at 0x7f6a6fa0f820>
Any help on this will be highly appreciated.
Thank you in advance for the support!
Use plt.show() to display your plot. The same code is recreated below with the fix added.
import pandas as pd
import seaborn as sns
import numpy as np
import scipy
import matplotlib.pyplot as plt
# data
np.random.seed(365)
x1 = np.random.normal(10, 3.4, size=1000) # mean of 10
df = pd.DataFrame({'x1': x1})
def map_pdf(x, **kwargs):
    mu, std = scipy.stats.norm.fit(x)
    x0, x1 = p1.axes[0][0].get_xlim()  # axes for p1 is required to determine x_pdf
    x_pdf = np.linspace(x0, x1, 100)
    y_pdf = scipy.stats.norm.pdf(x_pdf, mu, std)
    plt.plot(x_pdf, y_pdf, c='r')

p1 = sns.displot(data=df, x='x1', kind='hist', bins=40, stat='density')
p1.map(map_pdf, 'x1')
plt.show()
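If you are running the script outside a notebook or without a GUI backend (an assumption about your environment), saving the figure also works:
p1.savefig('displot.png')   # FacetGrid exposes savefig directly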

When I applied RandomForest in Python, ValueError: Found input variables with inconsistent numbers of samples: [2883, 1236]

File "D:\Users\Watson Rockstar\Anaconda3\lib\site-packages\sklearn\utils\validation.py", line 205, in check_consistent_length
" samples: %r" % [int(l) for l in lengths])
ValueError:
Found input variables with inconsistent numbers of samples: [2883, 1236]
The dataset has 4119 rows in total; Xtrain has shape (2883, 18) and Xtest has shape (1236, 18).
I have tried to use LabelEncoder and OneHotEncoder to solve the problem, but it did not help:
# Ignore the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import missingno as msno
#configure
# sets matplotlib to inline and displays graphs below the corressponding cell.
#import the necessary modelling algos.
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
#preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler
#model selection and metrics (used further down)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
telebanking = pd.read_csv('bank-additional.csv')
telebank = telebanking.drop(['duration','default'],axis =1)
def transform(feature):
    le = LabelEncoder()
    telebank[feature] = le.fit_transform(telebank[feature])
    print(le.classes_)
cat_telebank=telebank.select_dtypes(include='object')
cat_telebank.columns
for col in cat_telebank.columns:
    transform(col)
scaler=StandardScaler()
scaled_telebank=scaler.fit_transform(telebank.drop('y',axis=1))
X=scaled_telebank
Y = telebank['y'].values
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,Y,test_size=0.3)
def compare(model):
    clf = model
    clf.fit(Xtrain, Ytrain)
    pred = clf.predict(Xtrain)
    acc.append(accuracy_score(pred, Ytest))
    prec.append(precision_score(pred, Ytest))
    rec.append(recall_score(pred, Ytest))
    auroc.append(roc_auc_score(pred, Ytest))
acc=[]
prec=[]
rec=[]
auroc=[]
models=[RandomForestClassifier(),DecisionTreeClassifier()]
model_names=['RandomForestClassifier','DecisionTreeClassifier']
for model in range(len(models)):
    compare(models[model])
d={'Modelling Algo':model_names,'Accuracy':acc,'Precision':prec,'Recall':rec,'Area Under ROC Curve':auroc}
met_telebank=pd.DataFrame(d)
met_telebank
The traceback at the top is the first error's detail.
The unpacking Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3) is actually correct (that is the order train_test_split returns). The mismatch comes from the compare function:
pred = clf.predict(Xtrain)
should be
pred = clf.predict(Xtest)
because the metrics compare pred against Ytest: predicting on Xtrain gives 2883 predictions while Ytest has only 1236 values, which is exactly the pair of sizes reported in the ValueError.
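A sketch of the corrected evaluation loop; note that scikit-learn's metric functions expect (y_true, y_pred), so the argument order is also flipped here:
def compare(model):
    clf = model
    clf.fit(Xtrain, Ytrain)
    pred = clf.predict(Xtest)                 # predict on the held-out split
    acc.append(accuracy_score(Ytest, pred))
    prec.append(precision_score(Ytest, pred))
    rec.append(recall_score(Ytest, pred))
    auroc.append(roc_auc_score(Ytest, pred))  # with hard predictions; probabilities give a better AUC estimate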
