How to pass a user-defined function inside TfidfVectorizer.fit_transform() - python-3.x

I have a function for text preprocessing which simply removes stopwords:
def text_preprocessing():
    df['text'] = df['text'].apply(word_tokenize)
    df['text'] = df['text'].apply(lambda x: [item for item in x if item not in stopwords])
    new_array = []
    for keywords in df['text']:  # converts each list of words back into a string
        P = " ".join(str(x) for x in keywords)
        new_array.append(P)
    df['text'] = new_array
    return df['text']
I want to pass text_preprocessing() into another function, tf_idf(), which gives the feature matrix. This is essentially what I did:
def tf_idf():
    tfidf = TfidfVectorizer()
    feature_array = tfidf.fit_transform(text_preprocessing)
    keywords_data = pd.DataFrame(feature_array.toarray(), columns=tfidf.get_feature_names())
    return keywords_data
I got an error: TypeError: 'function' object is not iterable

Rather than building additional functions for stop-word removal, you can simply pass a custom list of stop words to TfidfVectorizer. As you can see in the example below, "test" is successfully excluded from the Tfidf vocabulary.
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# Setting up
numbers = np.random.randint(1, 5, 3)
text = ['This is a test.', 'Is this working?', "Let's see."]
df = pd.DataFrame({'text': text, 'numbers': numbers})
# Define custom stop words and instantiate TfidfVectorizer with them
my_stopwords = ['test'] # the list can be longer
tfidf = TfidfVectorizer(stop_words=my_stopwords)
text_tfidf = tfidf.fit_transform(df['text'])
# Optional - concatenating tfidf with df
df_tfidf = pd.DataFrame(text_tfidf.toarray(), columns=tfidf.get_feature_names())
df = pd.concat([df, df_tfidf], axis=1)
# Initial df
df
Out[133]:
   numbers              text
0        2   This is a test.
1        4  Is this working?
2        3        Let's see.

tfidf.vocabulary_
Out[134]: {'this': 3, 'is': 0, 'working': 4, 'let': 1, 'see': 2}

# Final df
df
Out[136]:
   numbers              text        is       let       see      this   working
0        2   This is a test.  0.707107  0.000000  0.000000  0.707107  0.000000
1        4  Is this working?  0.517856  0.000000  0.000000  0.517856  0.680919
2        3        Let's see.  0.000000  0.707107  0.707107  0.000000  0.000000
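As for the TypeError itself: fit_transform was given the function object rather than its return value. A minimal sketch of the direct fix (assuming df and stopwords are defined as in the question) is simply to call the function:
def tf_idf():
    tfidf = TfidfVectorizer()
    # note the parentheses: pass the returned Series of documents, not the function object
    feature_array = tfidf.fit_transform(text_preprocessing())
    keywords_data = pd.DataFrame(feature_array.toarray(), columns=tfidf.get_feature_names())
    return keywords_data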

Related

Fill missing values in a dataset with information from another column

I have a dataset in Python pandas with missing values for the variable Engine_model, but other rows contain the same information for the same car model. For example:
Car_model Engine_Model
BMW 5 type A
Renault 21 type B
BMW 5 NaN
Hyunday Santro type C
For example, here I have a NaN that should be filled with 'type A', as that information is in the first row.
How can I do that? How can I find the information to fill the NaN, knowing that the engine model is the same for all cars of the same model?
I have obtained the indices of the missing values and the car model names of those missing values:
Engine_model_missing_indices = data[data['Engine_model'].isnull()].index
Carmodel_missing = data.loc[Engine_model_missing_indices, 'Car_model']
I've found a similar solution that refers to calculating a mean to impute missing values; based on that, a working solution would be something like this:
import numpy as np
import pandas as pd
import scipy.stats
from sklearn.base import BaseEstimator, TransformerMixin

example_df = pd.DataFrame({
    'Car_model': ['BMW 5', 'Renault 21', 'BMW 5', 'Hyunday Santro'],
    'Engine_Model': ['type A', 'type B', np.nan, 'type C']
})

class WithinGroupModeImputer(BaseEstimator, TransformerMixin):
    def __init__(self, group_var):
        self.group_var = group_var

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # the copy leaves the original dataframe intact
        X_ = X.copy()
        for col in X_.columns:
            if X_[col].dtypes == 'object':
                # fill NaNs with the most frequent value (mode) of the column within each group
                X_.loc[(X[col].isna()) & X_[self.group_var].notna(), col] = X_[self.group_var].map(
                    X_.groupby(self.group_var)[col].agg(lambda x: scipy.stats.mode(x, keepdims=False)[0]))
                # fall back to the overall column mode for anything still missing
                X_[col] = X_[col].fillna(X_[col].agg(
                    lambda x: scipy.stats.mode(x, keepdims=False)[0]))
        return X_

imp = WithinGroupModeImputer(group_var='Car_model')
imp.fit(example_df)
imp.transform(example_df)
And the output would be:
        Car_model Engine_Model
0           BMW 5       type A
1      Renault 21       type B
2           BMW 5       type A
3  Hyunday Santro       type C
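For this specific case a shorter alternative is possible. The following is my own sketch (assuming only Engine_Model needs filling and that every car model has at least one non-missing engine entry); it fills each NaN with the most frequent value within its Car_model group:
example_df['Engine_Model'] = example_df.groupby('Car_model')['Engine_Model'].transform(
    lambda s: s.fillna(s.mode().iloc[0]) if not s.mode().empty else s)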

Encoding in Python such that numbering starts with 1

I have a dataframe in which the column 'team' needs to be encoded.
This is my code:
# Load the required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Create dictionary
data = {'team': ['A', 'A', 'B', 'B', 'C'],
        'Income': [5849, 4583, 3000, 2583, 6000],
        'Coapplicant Income': [0, 1508, 0, 2358, 0],
        'LoanAmount': [123, 128, 66, 120, 141]}

# Convert dictionary to dataframe
df = pd.DataFrame(data)
print("\n df", df)

# Initiate label encoder
le = LabelEncoder()
# return encoded label
label = le.fit_transform(df['team'])
# printing label
print("\n label =", label)

# removing the column 'team' from df
df.drop("team", axis=1, inplace=True)
# Appending the array to our dataFrame
df["team"] = label
# printing Dataframe
print("\n df", df)
I am getting the below result after encoding:
However, I wish to ensure the following two things:
Encoding starts with 1 and not 0.
The location of the column 'team' remains the same as in the original.
i.e. I wish to have the following result:
Can somebody please help me out with how to do this?
Do not drop the column; just increment the label on assignment:
le = LabelEncoder()
# return encoded label
label = le.fit_transform(df['team'])
# Replacing the column
df["team"] = label + 1
Output:
 df
   team  Income  Coapplicant Income  LoanAmount
0     1    5849                   0         123
1     1    4583                1508         128
2     2    3000                   0          66
3     2    2583                2358         120
4     3    6000                   0         141
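If you prefer to stay in plain pandas, a hedged alternative (my own sketch, not from the original answer) keeps the column in place and produces 1-based codes directly:
df["team"] = df["team"].astype("category").cat.codes + 1
cat.codes assigns 0-based integers in sorted category order (A, B, C), so adding 1 gives the same mapping as LabelEncoder shifted to start at 1.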

How to replace an element with its probability of belonging to a class label if the label is in a different DataFrame

I have a pandas DataFrame X_train containing a column 'state' with different (repeated) state names. In another DataFrame, Y_train, I have the class values 0 and 1.
In a dictionary variable Temp I have the probability of each (unique) state belonging to class 0 and class 1.
Now I want to replace all the state names in X_train with the probability score corresponding to the class label in Y_train.
How can I do it?
Solution:
The data as you described:
import pandas as pd
X_train = pd.DataFrame([{'state': 'A'}, {'state': 'B'}, {'state': 'A'},{'state': 'A'}])
Y_train = pd.DataFrame([{'class': 1}, {'class': 0}, {'class': 1}, {'class': 1}])
Temp = {'A': {0: 0.75, 1: 0.25}, 'B': {0: 0.20, 1:0.8}}
Combine the two dataframes column-wise using concat, like so:
combined = pd.concat([X_train, Y_train], axis=1)
where axis=1 means you want to combine by column
Now run a double loop to assign the new values; the mask has to match on both the state and the class so each row gets the probability for its own state:
for classname in combined['class'].unique():
    for state in combined['state'].unique():
        mask = (combined['class'] == classname) & (combined['state'] == state)
        combined.loc[mask, 'class'] = Temp[state][classname]
You'll end up with a combined looking like this:
  state  class
0     A   0.25
1     B   0.20
2     A   0.25
3     A   0.25
Then just split up your data frames again:
X_train = combined[['state']]
Y_train = combined[['class']]
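A hedged one-line alternative to the double loop (my own sketch, assuming Temp has an entry for every state/class pair in the data, and applied before the class column is overwritten):
combined['class'] = combined.apply(lambda row: Temp[row['state']][row['class']], axis=1)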

Random selection of sample from CSV file with dask different than with pandas

I have 3 big CSV files. I am trying to randomly extract some samples from the files without loading them into memory. I am doing this:
SITS = dd.read_csv("sits_train_0.csv", blocksize="512MB",
                   usecols=band_blue + ["samplefid"]).set_index("samplefid")
MASK = dd.read_csv("mask_train_0.csv", blocksize="512MB",
                   usecols=band_mask + ["samplefid"]).set_index("samplefid")
GP = dd.read_csv("sits_gp_train_0.csv", blocksize="512MB",
                 usecols=band_blue_gp + ["samplefid"]).set_index("samplefid")
# SITS = pd.read_csv("sits_train_0.csv",
#                    usecols=band_blue + ["samplefid"]).set_index("samplefid")
# MASK = pd.read_csv("mask_train_0.csv",
#                    usecols=band_mask + ["samplefid"]).set_index("samplefid")
# GP = pd.read_csv("sits_gp_train_0.csv",
#                  usecols=band_blue_gp + ["samplefid"]).set_index("samplefid")

np.random.seed(0)
NSAMPLES = 100
samples = np.random.choice(MASK.index, size=NSAMPLES, replace=False)

s = SITS.loc[samples][band_blue].compute().values
m = MASK.loc[samples][band_mask].compute().values
sg = GP.loc[samples][band_blue_gp].compute().values
# s = SITS.loc[samples][band_blue].values
# m = MASK.loc[samples][band_mask].values
# sg = GP.loc[samples][band_blue_gp].values
I got strange results, so I compared against pandas on smaller files (see the commented code above), for which I get correct results.
If I set blocksize to None, the results are fine, but everything is loaded into memory, so using dask is not useful in that case, and my CSVs are too big to fit in memory. My CSVs are written in random order, so I need to use the index to recover the same samples from the 3 CSVs.
I feel I am missing something about dask, but I don't see what.
I'd recommend using sample
In [16]: import pandas as pd

In [17]: import dask.dataframe as dd

In [18]: df = pd.DataFrame({'num_legs': [2, 4, 8, 0],
    ...:                    'num_wings': [2, 0, 0, 0],
    ...:                    'num_specimen_seen': [10, 2, 1, 8]},
    ...:                   index=['falcon', 'dog', 'spider', 'fish'])

In [19]: ddf = dd.from_pandas(df, npartitions=2)

In [20]: ddf.sample??

In [21]: df.sample(frac=0.5, replace=True, random_state=1)
Out[21]:
        num_legs  num_wings  num_specimen_seen
dog            4          0                  2
fish           0          0                  8

In [22]: ddf.sample(frac=0.5, replace=True, random_state=1)
Out[22]:
Dask DataFrame Structure:
               num_legs num_wings num_specimen_seen
npartitions=2
dog               int64     int64             int64
fish                ...       ...               ...
spider              ...       ...               ...
Dask Name: sample, 4 tasks

In [23]: ddf.sample(frac=0.5, replace=True, random_state=1).compute()
Out[23]:
        num_legs  num_wings  num_specimen_seen
falcon         2          2                 10
fish           0          0                  8

cannot convert string into float

Sales Discount Profit Product ID
0 0.050090 0.000000 0.262335 FUR-ADV-10000002
1 0.110793 0.000000 0.260662 FUR-ADV-10000108
2 0.309561 0.864121 0.241432 FUR-ADV-10000183
3 0.039217 0.591474 0.260687 FUR-ADV-10000188
4 0.070205 0.000000 0.263628 FUR-ADV-10000190
5 0.697873 0.000000 0.281162 FUR-ADV-10000571
6 0.064918 0.000000 0.261285 FUR-ADV-10000600
7 0.091950 0.000000 0.262946 FUR-ADV-10000847
8 0.056013 0.318384 0.257952 FUR-ADV-10001283
9 0.304472 0.318384 0.265739 FUR-ADV-10001440
10 0.046234 0.318384 0.261058 FUR-ADV-10001659
I am using the K elbow method to find the optimal number of clusters.
import matplotlib.pyplot as plt
def kelbow(final_df, k):
    from sklearn.cluster import KMeans
    x = []
    for i in range(1, k):
        kmeans = KMeans(n_clusters=i)
        kmeans.fit(final_df)
        x.append(kmeans.inertia_)
    plt.plot(range(1,k), 30)
    plt.title('The elbow method')
    plt.xlabel('The number of clusters')
    plt.ylabel('WCSS')
    plt.show()
    return x
When I call the function,
kelbow(final_df, 30)
the code throws this error:
ValueError: could not convert string to float: 'TEC-STA-10004927'
How can I find the clusters?
Make dummy variables. Note that get_dummies already removes the original column, so there is no need to drop it separately:
final_df = pd.get_dummies(final_df, columns=['ProductID'], dtype='int64')
This should work for you:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans

def kelbow(df, k):
    x = []
    # one-hot encode any string (object) columns so KMeans gets numeric input
    final_df = pd.get_dummies(df, columns=df.select_dtypes(['object']).columns)
    for i in range(1, k):
        kmeans = KMeans(n_clusters=i)
        kmeans.fit(final_df)
        x.append(kmeans.inertia_)
    # plot the within-cluster sum of squares (inertia) for each number of clusters
    plt.plot(range(1, k), x)
    plt.title('The elbow method')
    plt.xlabel('The number of clusters')
    plt.ylabel('WCSS')
    plt.show()
    return x
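As a side note (my own suggestion, not part of the original answer): a unique identifier such as the product ID is usually not a meaningful clustering feature, and one-hot encoding it creates one column per product. A simpler sketch is to cluster on the numeric columns only:
numeric_df = final_df.select_dtypes(include='number')  # keep Sales, Discount, Profit
wcss = kelbow(numeric_df, 30)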
