AttributeError: 'CountVectorizer' object has no attribute 'fit_transfrom' - python-3.x

1 from sklearn.feature_extraction.text import CountVectorizer
2 cv = CountVectorizer()
----> 3 X = cv.fit_transfrom(df['transformed_text']).toarray()
I do not see any error in this line.
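The error in the title is just a misspelling: the CountVectorizer method is fit_transform, not fit_transfrom. A minimal corrected sketch, assuming df['transformed_text'] holds the preprocessed text as in the snippet above:

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
# note the spelling: fit_transform, not fit_transfrom
X = cv.fit_transform(df['transformed_text']).toarray()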

Related

Not able to access data even though it exists, using Pandas dataframe, when training a deep learning model using Python3

I am trying to do k-fold cross validation with sklearn on a pandas DataFrame, but it is not working as expected: the data apparently cannot be accessed even though it exists. The code runs for a while but never completes a full epoch.
Here is the error:
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
94773248/94765736 [==============================] - 1s 0us/step
94781440/94765736 [==============================] - 1s 0us/step
458/610 [=====================>........] - ETA: 21s - loss: 0.1640 - accuracy: 0.1621
---------------------------------------------------------------------------
UnknownError Traceback (most recent call last)
<ipython-input-7-28b7c7367434> in <module>()
60 validation_data=valid_gen,
61 validation_steps=len(test_index)//valid_batch_size,
---> 62 verbose=1)
...
UnknownError: Graph execution error:
2 root error(s) found.
(0) UNKNOWN: IndexError: single positional indexer is out-of-bounds
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/script_ops.py", line 271, in __call__
ret = func(*args)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 642, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 1004, in generator_py_func
values = next(generator_state.get_iterator(iterator_id))
File "/usr/local/lib/python3.7/dist-packages/keras/engine/data_adapter.py", line 830, in wrapped_generator
for data in generator_fn():
File "<ipython-input-4-8914ea8c1843>", line 6, in get_data_generator
r = df.iloc[i]
File "/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py", line 931, in __getitem__
return self._getitem_axis(maybe_callable, axis=axis)
File "/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py", line 1566, in _getitem_axis
self._validate_integer(key, axis)
File "/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py", line 1500, in _validate_integer
raise IndexError("single positional indexer is out-of-bounds")
IndexError: single positional indexer is out-of-bounds
[[{{node PyFunc}}]]
[[IteratorGetNext]]
[[IteratorGetNext/_2]]
(1) UNKNOWN: IndexError: single positional indexer is out-of-bounds
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/script_ops.py", line 271, in __call__
ret = func(*args)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/autograph/impl/api.py", line 642, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 1004, in generator_py_func
values = next(generator_state.get_iterator(iterator_id))
File "/usr/local/lib/python3.7/dist-packages/keras/engine/data_adapter.py", line 830, in wrapped_generator
for data in generator_fn():
File "<ipython-input-4-8914ea8c1843>", line 6, in get_data_generator
r = df.iloc[i]
File "/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py", line 931, in __getitem__
return self._getitem_axis(maybe_callable, axis=axis)
File "/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py", line 1566, in _getitem_axis
self._validate_integer(key, axis)
File "/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py", line 1500, in _validate_integer
raise IndexError("single positional indexer is out-of-bounds")
IndexError: single positional indexer is out-of-bounds
[[{{node PyFunc}}]]
[[IteratorGetNext]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_13498]
Here is the code I am running:
# using google colab
! pip install --upgrade --no-cache-dir gdown
! gdown 1_DgB2a2Q7eYJpXtKWfl4XPUgTIW1sXw1
! unzip -qq Train.zip

import matplotlib.pyplot as plt
import numpy as np
import cv2
import glob
import csv
import pandas as pd
# create a pandas data frame of images, age, gender and race
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input
from keras.models import Model
from tensorflow.keras.optimizers import Adam
from keras.layers import Dense, Flatten, GlobalAveragePooling2D, Multiply, Dropout
from tensorflow.keras import regularizers
from tensorflow.keras.utils import to_categorical
from PIL import Image
from keras.preprocessing.image import ImageDataGenerator
from keras import backend as K
from sklearn.model_selection import KFold
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow import keras
from datetime import datetime
from tqdm import tqdm

def get_data_generator(df, indices, batch_size=16):
    images, labels = [], []
    while True:
        for i in indices:
            # print(i," - ",end="")
            r = df.iloc[i]
            file_, label = r['file'], r['label']
            im_gray = Image.open(file_).convert('L')
            im_gray = im_gray.resize((360, 360))
            im = np.zeros(shape=(360, 360, 3))
            im[:, :, 0] = im_gray
            im[:, :, 1] = im_gray
            im[:, :, 2] = im_gray
            im = np.array(im) / 255.0
            images.append(im)
            new_label = label / 100.0
            labels.append(new_label)
            if len(images) >= batch_size:
                yield np.array(images), np.array(labels)
                images, labels = [], []

np.random.seed(42)
EPOCHS = 1
MODEL_NAME = 'ResNet50'
IMG_SIZE = '360x360'
all_train_imgs = glob.glob('Train/*')
# print("Length of all training images = ", len(all_train_imgs))
all_training_files_name = []
all_training_perc = []
with open('Train.csv') as f:
    contents = f.readlines()
    for item in contents:
        # make the changes in the folder here
        img_name = "Train/" + item.split(',')[0]
        perc_infc = float(item.split(',')[1])
        num_pat = item.split(',')[2]
        # print(img_name," - ",perc_infc," - ",num_pat)
        all_training_files_name.append(img_name)
        all_training_perc.append(perc_infc)

attributes = {'label': all_training_perc, 'file': all_training_files_name}
df_all = pd.DataFrame(attributes)
df_all = df_all.dropna()
print(df_all.head())

kf = KFold(n_splits=5)
kf.get_n_splits(all_training_files_name)
fold_no = 0
for train_index, test_index in kf.split(all_training_files_name):
    fold_no += 1
    #########################################################################################
    OUTPUT = 1
    frozen = ResNet50(weights="imagenet", input_shape=(360, 360, 3), include_top=False)
    trainable = frozen.output
    trainable = Dropout(0.5)(GlobalAveragePooling2D()(trainable))
    trainable = Dense(2048, activation="relu")(trainable)
    trainable = Dense(1024, activation="relu")(trainable)
    trainable = Dense(OUTPUT, activation="sigmoid")(trainable)
    model = Model(inputs=frozen.input, outputs=trainable)
    opt = Adam(learning_rate=1e-5)
    model.compile(optimizer=opt,
                  loss=tf.keras.losses.MeanAbsoluteError(),  # loss='binary_crossentropy',
                  # experimental_run_tf_function=False,
                  metrics=['accuracy'])
    #########################################################################################
    batch_size = 4
    valid_batch_size = 4
    df_train = df_all.loc[train_index.astype(int)]
    df_val = df_all.loc[test_index.astype(int)]
    train_gen = get_data_generator(df_train, train_index, batch_size=batch_size)
    valid_gen = get_data_generator(df_val, test_index, batch_size=valid_batch_size)
    callbacks = [
        ModelCheckpoint("./model_checkpoint", monitor='val_loss'),
        # ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=4)
    ]
    # for storing logs into tensorboard
    logdir = "logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)
    history = model.fit(train_gen,
                        steps_per_epoch=len(train_index)//batch_size,
                        epochs=EPOCHS,
                        callbacks=[tensorboard_callback, callbacks],
                        validation_data=valid_gen,
                        validation_steps=len(test_index)//valid_batch_size,
                        verbose=1)
Here is a Google Colab notebook for reproducing the issue: https://colab.research.google.com/drive/11C-GP6xCB3CCwvz6gj8gy6mTOJIc3Zld?usp=sharing
I figured it out: there was an error in how the per-fold dataframes were created. For this problem, the following change works, passing the full dataframe to the generators instead of the fold subsets:
#df_train = df_all.loc[train_index.astype(int)]
#df_val = df_all.loc[test_index.astype(int)]
train_gen = get_data_generator(df_all, train_index, batch_size=batch_size)
valid_gen = get_data_generator(df_all, test_index, batch_size=valid_batch_size)
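For context on why this works: get_data_generator indexes with df.iloc[i] using positional indices produced by KFold over the full dataset, so when a fold-sized subset (df_train or df_val) is passed in, any index beyond the subset's length is out of bounds, which is exactly the IndexError in the traceback. An alternative sketch (an assumption, not part of the original post) keeps the per-fold subsets but iterates over their own positions:

# hypothetical alternative: reset each subset's positional index and iterate over it directly
df_train = df_all.iloc[train_index].reset_index(drop=True)
df_val = df_all.iloc[test_index].reset_index(drop=True)
train_gen = get_data_generator(df_train, range(len(df_train)), batch_size=batch_size)
valid_gen = get_data_generator(df_val, range(len(df_val)), batch_size=valid_batch_size)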

Model is unable to predict due to a type error when applying a model to a UTF-8 encoded Urdu dataset

I am trying to run an algorithm on an Urdu dataset. I have a logistic regression model that works for English, but it runs into errors when I use it on a UTF-8 encoded Urdu dataset.
The code I have applied to the problem is given below.
import pandas as pd
import re
from nltk.corpus import stopwords
import nltk
import logging
from gensim.models import word2vec
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from scipy.sparse import coo_matrix
from tqdm import tqdm
from scipy import sparse
import numpy
#import score
import re, nltk, scipy
#import gensim
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import KFold

dataset1 = pd.read_csv("fakenews.csv", encoding='UTF-8')
ds = dataset1.copy()
print(ds.shape)

df = ds.copy()
df.drop('FileName', axis=1, inplace=True)
df.drop('label', axis=1, inplace=True)

da = ds.copy()
da.drop('FileName', axis=1, inplace=True)
da.drop('title', axis=1, inplace=True)
da.drop('text', axis=1, inplace=True)

def extract_word_overlap(title, text):
    word_overlap = []
    for i, (title, body) in tqdm(enumerate(zip(title, text))):
        #preprocess_headline = preprocess(headline)
        #preprocess_body = preprocess(body)
        features = len(set(title).intersection(text)) / float(len(set(title).union(text)))
        word_overlap.append(features)
    word_overlap_sparse = scipy.sparse.coo_matrix(numpy.array(word_overlap))
    return word_overlap_sparse

def combine_features(tfidf_vectors, word_overlap):
    combined_features = sparse.bmat([[tfidf_vectors, word_overlap.T]])
    return combined_features

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df, da, test_size=0.3, random_state=42)
training_bodies = x_train['text']
training_headlines = x_train['title']
test_bodies = x_test['text']
test_headlines = x_test['title']

print("\t-Extracting tfidf vectors..")
body_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
bodies_tfidf = body_vectorizer.fit_transform(training_bodies)
headline_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
headlines_tfidf = headline_vectorizer.fit_transform(training_headlines)
bodies_tfidf_test = body_vectorizer.transform(test_bodies)
headlines_tfidf_test = headline_vectorizer.transform(test_headlines)
training_tfidf = scipy.sparse.hstack([bodies_tfidf, headlines_tfidf])
test_tfidf = scipy.sparse.hstack([bodies_tfidf_test, headlines_tfidf_test])
training_overlap = extract_word_overlap(training_headlines, training_bodies)
test_overlap = extract_word_overlap(test_headlines, test_bodies)
training_features = combine_features(training_tfidf, training_overlap)
test_features = combine_features(test_tfidf, test_overlap)

print("[3] Fitting model..")
print("\t-Logistic Regression")
lr = LogisticRegression(C=1.0, class_weight='balanced', solver="lbfgs", max_iter=150)
y_pred = lr.fit(training_features, y_train).predict(test_features)

from sklearn import metrics
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix
print('classification report:')
print(classification_report(y_test, y_pred))
score = metrics.accuracy_score(y_test, y_pred)
print("Accuracy of Regression: %0.3f" % score)
print('Macro f1:', f1_score(y_test, y_pred, average='macro'))
print("[4] Evaluating model..")
score.report_score(y_test, y_pred)
print(confusion_matrix(y_test, y_pred))
I expected the model to predict, but it gives me a type error: TypeError: '<' not supported between instances of 'float' and 'str'. Here is the output:
761: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
Traceback (most recent call last):
File "<ipython-input-1-800030f783cf>", line 1, in <module>
runfile('C:/Users/door/Desktop/af/pycode.py', wdir='C:/Users/door/Desktop/af')
File "C:\Users\door\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 704, in runfile
execfile(filename, namespace)
File "C:\Users\door\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 108, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/door/Desktop/af/pycode.py", line 101, in <module>
y_pred = lr.fit(training_features, y_train).predict(test_features)
File "C:\Users\door\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py", line 1286, in fit
check_classification_targets(y)
File "C:\Users\door\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py", line 168, in check_classification_targets
y_type = type_of_target(y)
File "C:\Users\door\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py", line 287, in type_of_target
if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1):
File "C:\Users\door\Anaconda3\lib\site-packages\numpy\lib\arraysetops.py", line 233, in unique
ret = _unique1d(ar, return_index, return_inverse, return_counts)
File "C:\Users\door\Anaconda3\lib\site-packages\numpy\lib\arraysetops.py", line 281, in _unique1d
ar.sort()
TypeError: '<' not supported between instances of 'float' and 'str'
The dataset I am using looks like this:
FileName title text label
0001a میں استعفیٰ نہیں دے رہا فیاض الحسن چوہان "صوبائی وزیر اطلاعات فیاض الحسن چوہان نے کہا ہے کہ میں استعفیٰ نہیں دے رہا اور نہ مجھ سے استعفیٰ مانگا گیا ہے مجھے محتاط بیان دینے کا کہا گیا ہے
اور میں نے اپنی بیان میں ہندو مذہب یا ہندو برادری کو نہیں بلکہ بھارتی فوج اور بھارتی میڈیا کو مخاطب کیا تھا" Agree
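There is no accepted answer in this excerpt, but the traceback points at the target values: check_classification_targets calls np.unique on y, and sorting fails because y mixes strings (such as 'Agree') with floats (typically NaN from missing labels). A minimal, hedged sketch of a fix, assuming the column names from the sample above:

# assumption: 'label' contains NaN (float) alongside string labels such as 'Agree'
ds = ds.dropna(subset=['label'])             # drop rows whose label is missing
df = ds.drop(['FileName', 'label'], axis=1)  # keep title and text as features
y = ds['label'].astype(str)                  # a 1-D Series, which also avoids the DataConversionWarning
x_train, x_test, y_train, y_test = train_test_split(df, y, test_size=0.3, random_state=42)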

'XGBModel' object has no attribute 'evals_result_'

I am trying to use xgboost on a dataset. I have seen the same syntax in various blogs, but I am getting an error when calling clf.evals_result().
Here is my code:
from xgboost import XGBRegressor as xgb
from sklearn.metrics import mean_absolute_error as mae
evals_result ={}
eval_s = [(x, y),(xval,yval)]
clf = xgb(n_estimators=100,learning_rate=0.03,tree_method='gpu_hist',lamda=0.1,eval_metric='mae',eval_set=eval_s,early_stopping_rounds=0,evals_result=evals_result)
clf.fit(x,y)
r = clf.evals_result()
Here is the error I am receiving:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-138-2d6867968043> in <module>
1
----> 2 r = clf.evals_result()
3
4 p = clf.predict(xval)
/opt/conda/lib/python3.6/site-packages/xgboost/sklearn.py in evals_result(self)
399 'validation_1': {'logloss': ['0.41965', '0.17686']}}
400 """
--> 401 if self.evals_result_:
402 evals_result = self.evals_result_
403 else:
AttributeError: 'XGBRegressor' object has no attribute 'evals_result_'
I got exactly the same error. The solution is to pass eval_set to the fit function, not to the classifier constructor:
clf.fit(x, y, eval_set=eval_s)
Then you can run clf.evals_result().
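For completeness, a minimal corrected sketch (reg_lambda is assumed to be the intended spelling of the lamda keyword above, and keyword placement can vary between xgboost versions):

from xgboost import XGBRegressor

clf = XGBRegressor(n_estimators=100, learning_rate=0.03,
                   tree_method='gpu_hist', reg_lambda=0.1)
# eval_set (and, in the 1.x releases shown in the traceback, eval_metric) belongs to fit(),
# not to the constructor; fit() then populates clf.evals_result_
clf.fit(x, y, eval_set=[(x, y), (xval, yval)], eval_metric='mae')
r = clf.evals_result()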

IndexError: index 2 is out of bounds for axis 1 with size 2

I am receiving an "index is out of bounds" error on the line doctopic = clf.fit_transform(dtm). My Data folder contains two CSV files. Could someone explain how to fix this IndexError?
import os
print (os.getcwd())
import numpy as np
import langdetect
from stop_words import get_stop_words
CORPUS_PATH = os.path.join('C:\\Users\\mike120\\Downloads\\TM 09-25\\Data')
filenames = sorted([os.path.join(CORPUS_PATH, fn) for fn in os.listdir(CORPUS_PATH)])
len(filenames)
filenames[:5]
import sklearn.feature_extraction.text as text
#lang = langdetect.detect(CORPUS_PATH)
lang_stop = get_stop_words('en')
vectorizer = text.CountVectorizer(input='filename', stop_words=lang_stop, min_df=2)
dtm = vectorizer.fit_transform(filenames).toarray()
vocab = np.array(vectorizer.get_feature_names())
dtm.shape
from sklearn import decomposition
num_topics = 20
num_top_words = 20
clf = decomposition.NMF(n_components=num_topics, random_state=1)
doctopic = clf.fit_transform(dtm)
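This question has no answer in the excerpt. A plausible reading of the error (an inference, not a confirmed fix) is that with only two input files and min_df=2 the document-term matrix ends up very small, and NMF cannot initialise 20 components when the smaller matrix dimension is 2. Capping n_components at the matrix shape avoids the IndexError:

# hedged sketch: never ask NMF for more components than the smaller dimension of dtm
num_topics = min(20, min(dtm.shape))
clf = decomposition.NMF(n_components=num_topics, random_state=1)
doctopic = clf.fit_transform(dtm)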

Function is not defined: "NameError: name 'train' is not defined" in Jupyter Python 3

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def warn(*args, **kwargs): pass
import warnings
warnings.warn = warn

from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import StratifiedShuffleSplit

train = pd.read_csv('..\input\train.csv')
test = pd.read_csv('..\input\test.csv')

def encode(train, test):
    le = LabelEncoder().fit(train.species)
    labels = le.transform(train.species)
    classes = list(le.classes_)
    test_ids = test.id
    train = train.drop(['species', 'id'], axis=1)
    test = test.drop(['id'], axis=1)
    return train, labels, test, test_ids, classes

train, labels, test, test_ids, classes = encode(train, test)
train.head(5)
NameError                                 Traceback (most recent call last)
<ipython-input-10-08166fb1df95> in <module>()
10 return train, labels, test, test_ids, classes
11
---> 12 train, labels, test, test_ids, classes = encode(train, test)
13 train.head(5)
NameError: name 'train' is not defined
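A hedged guess at the cause, since this question has no answer in the excerpt: if any earlier line in the cell fails, for example the sklearn.cross_validation import (removed in modern scikit-learn) or the read_csv calls with unescaped Windows paths, then train is never assigned and the later call to encode(train, test) raises the NameError. A minimal sketch of that fix, under those assumptions:

# assumption: sklearn.cross_validation has been replaced by sklearn.model_selection
from sklearn.model_selection import StratifiedShuffleSplit

# raw strings so that \t in '\train.csv' and '\test.csv' is not read as a tab escape
train = pd.read_csv(r'..\input\train.csv')
test = pd.read_csv(r'..\input\test.csv')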
