word2vec, how to classify text with SVM? - text

I have a csv file, it has 2 columns: class and text_data. I first extract biGram and TriGrams, then tried to use SVM on my data for classification. But it shows "TypeError: sequence item 0: expected a bytes-like object, str found". I have used Gensim=4.0.0. Your help highly appreciated. Code:
# all packages imported
df_covid = pd.read_csv('allCurated853_4.csv', encoding="utf8")
df_covid['label'] = df_covid['class'].map({
'covidshutdown': 0,
'manufactoring':1,
'corporate':2,
'environmental':3,
'infrastructure':4,
'other':5
})
X = df_covid['text_data']
y = df_covid['label']
corpus=X
lst_corpus = []
for string in corpus:
lst_words = string.split()
lst_grams = [" ".join(lst_words[i:i+1])
for i in range(0, len(lst_words), 1)]
lst_corpus.append(lst_grams)
bigrams_detector = gensim.models.phrases.Phrases(lst_corpus,
delimiter=" ".encode(), min_count=5, threshold=10)
bigrams_detector = gensim.models.phrases.Phraser(bigrams_detector)
trigrams_detector = gensim.models.phrases.Phrases(bigrams_detector[lst_corpus],
delimiter=" ".encode(), min_count=5, threshold=10)
trigrams_detector = gensim.models.phrases.Phraser(trigrams_detector)
cv = gensim.models.word2vec.Word2Vec(lst_corpus, size=300, window=8,
min_count=1, sg=1, iter=30)
X_trainCv, X_testCv, y_trainCv, y_testCv = train_test_split(cv, y,
test_size=0.20, random_state=42)
clf = svm.SVC(kernel='linear').fit(X_trainCv,y_trainCv)
y_pred = clf.predict(X_testCv)
print(classification_report(X_testCv, y_pred))
Complete Error message:
This is a dataset click here to download
if, I do not use biGrams and triGrams and change word2Vec model to this
cv= gensim.models.Word2Vec(lst_corpus, vector_size=100, window=5, min_count=5, workers=4)
New error message appears:
Traceback (most recent call last):
File "D:\Dropbox\AAA\50-2\word2Vec_trails\word2Vec_triGrams_34445.py", line 65, in <module>
X_trainCv, X_testCv, y_trainCv, y_testCv = train_test_split(cv, y, test_size=0.20, random_state=42)
File "C:\Users\makta\anaconda3\lib\site-packages\sklearn\model_selection\_split.py", line 2172, in train_test_split
arrays = indexable(*arrays)
File "C:\Users\makta\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 299, in indexable
check_consistent_length(*result)
File "C:\Users\makta\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 259, in check_consistent_length
lengths = [_num_samples(X) for X in arrays if X is not None]
File "C:\Users\makta\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 259, in <listcomp>
lengths = [_num_samples(X) for X in arrays if X is not None]
File "C:\Users\makta\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 202, in _num_samples
raise TypeError("Singleton array %r cannot be considered"
TypeError: Singleton array array(<gensim.models.word2vec.Word2Vec object at 0x000001A41DF59820>,
dtype=object) cannot be considered a valid collection.
I made this attempt (below code) to use word2vec vectors like count vectorizer or TFIDF. It generates output,but not corrent one. I think I should make list of lists of vectors. Help appreciated. this is Code:
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
df_covid = pd.read_csv('allCurated853_4.csv', encoding="utf8")
df['label'] = df['class'].map({'covidshutdowncsv': 0, 'manufactoringcsv':1, 'corporatecsv':2, 'environmentalcsv':3,'infrastructurecsv':4, 'other':5})
X = df['message']
y = df['label']
X=X.to_string ()
ls = []
rows = X.splitlines(True)
print('size of rows:', len(rows)) # size of rows: 852
for i in rows:
ls.append(i.split(' '))
print('total words:', len(ls)) # total words: 852
model = Word2Vec(ls, min_count=1, size = 4)
words = list(model.wv.vocab)
print('words in vocabolary :',len(words)) # words in vocabolary:3110
print(words)
words=words[0:852] # problem
vectors = []
for word in words:
vectors.append(model[word].tolist())
data = np.array(vectors)
print('vectors of words:', len(data)) # vectors of words: 852
X_trainCv, X_testCv, y_trainCv, y_testCv = train_test_split(data, y, test_size=0.20, random_state=42)
clf_covid = svm.SVC(kernel='linear').fit(X_trainCv,y_trainCv)
clf_covid.score(X_testCv,y_testCv)
# score: 0.50299

Related

using a list as a value in pandas.DataFeame and tensorFlow

Im tring to use list as a value in pandas.DataFrame
but Im getting Exception when trying to use use the adapt function in on the Normalization layer with the NumPy array
this is the error:
ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).
and this is the code:
import pandas as pd
import numpy as np
# Make NumPy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)
import tensorflow as tf
from tensorflow.keras import layers
data = [[45.975, 45.81, 45.715, 45.52, 45.62, 45.65, 4],
[55.67, 55.975, 55.97, 56.27, 56.23, 56.275, 5],
[86.87, 86.925, 86.85, 85.78, 86.165, 86.165, 3],
[64.3, 64.27, 64.285, 64.29, 64.325, 64.245, 6],
[35.655, 35.735, 35.66, 35.69, 35.665, 35.63, 5]
]
lables = [0, 1, 0, 1, 1]
def do():
d_1 = None
for l, d in zip(lables, data):
if d_1 is None:
d_1 = pd.DataFrame({'lable': l, 'close_price': [d]})
else:
d_1 = d_1.append({'lable': l, 'close_price': d}, ignore_index=True)
dataset = d_1.copy()
print(dataset.isna().sum())
dataset = dataset.dropna()
print(dataset.keys())
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)
print(train_dataset.describe().transpose())
train_features = train_dataset.copy()
test_features = test_dataset.copy()
train_labels = train_features.pop('lable')
test_labels = test_features.pop('lable')
print(train_dataset.describe().transpose()[['mean', 'std']])
normalizer = tf.keras.layers.Normalization(axis=-1)
ar = np.array(train_features)
normalizer.adapt(ar)
print(normalizer.mean.numpy())
first = np.array(train_features[:1])
with np.printoptions(precision=2, suppress=True):
print('First example:', first)
print()
print('Normalized:', normalizer(first).numpy())
diraction = np.array(train_features)
diraction_normalizer = layers.Normalization(input_shape=[1, ], axis=None)
diraction_normalizer.adapt(diraction)
diraction_model = tf.keras.Sequential([
diraction_normalizer,
layers.Dense(units=1)
])
print(diraction_model.summary())
print(diraction_model.predict(diraction[:10]))
diraction_model.compile(
optimizer=tf.optimizers.Adam(learning_rate=0.1),
loss='mean_absolute_error')
print(train_features['close_price'])
history = diraction_model.fit(
train_features['close_price'],
train_labels,
epochs=100,
# Suppress logging.
verbose=0,
# Calculate validation results on 20% of the training data.
validation_split=0.2)
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
print(hist.tail())
test_results = {}
test_results['diraction_model'] = diraction_model.evaluate(
test_features,
test_labels, verbose=0)
x = tf.linspace(0.0, 250, 251)
y = diraction_model.predict(x)
print("end")
def main():
do()
if __name__ == "__main__":
main()
I think it is not the usual practice to shrink your features into one column.
Quick-fix is you may put the following line
train_features = np.array(train_features['close_price'].to_list())
before
normalizer = tf.keras.layers.Normalization(axis=-1)
to get rid of the error, but now because your train_features has changed from a DataFrame into a np.array, your subsequent code may suffer, so you need to take care of that too.
If I were you, however, I would have constructed the DataFrame this way
df = pd.DataFrame(data)
df['label'] = lables
Please consider.

Is it possible to use a custom generator to train multi input architecture with keras tensorflow 2.0.0?

With TF 2.0.0, I can train an architecture with one input, I can train an architecture with one input using a custom generator, and I can train an architecture with two inputs. But I can't train an architecture with two inputs using a custom generator.
To keep it minimalist, here's a simple example, with no generator and no multiple inputs to start with:
from tensorflow.keras import layers, models, Model, Input, losses
from numpy import random, array, zeros
input1 = Input(shape=2)
dense1 = layers.Dense(5)(input1)
fullModel = Model(inputs=input1, outputs=dense1)
fullModel.summary()
# Generate random examples:
nbSamples = 21
X_train = random.rand(nbSamples, 2)
Y_train = random.rand(nbSamples, 5)
batchSize = 4
fullModel.compile(loss=losses.LogCosh())
fullModel.fit(X_train, Y_train, epochs=10, batch_size=batchSize)
It's a simple dense layer which takes in input vectors of size 2. The randomly generated dataset contains 21 examples and the batch size is 4. Instead of loading all the data and giving them to model.fit(), we can also give a custom generator in input. The main advantage (for RAM consumption) of this is to load only batch by batch rather that the whole dataset. Here is a simple example with the previous architecture and a custom generator:
import json
# Save the last dataset in a file:
with open("./dataset1input.txt", 'w') as file:
for i in range(nbSamples):
example = {"x": X_train[i].tolist(), "y": Y_train[i].tolist()}
file.write(json.dumps(example) + "\n")
def generator1input(datasetPath, batch_size, inputSize, outputSize):
X_batch = zeros((batch_size, inputSize))
Y_batch = zeros((batch_size, outputSize))
i=0
while True:
with open(datasetPath, 'r') as file:
for line in file:
example = json.loads(line)
X_batch[i] = array(example["x"])
Y_batch[i] = array(example["y"])
i+=1
if i % batch_size == 0:
yield (X_batch, Y_batch)
i=0
fullModel.compile(loss=losses.LogCosh())
my_generator = generator1input("./dataset1input.txt", batchSize, 2, 5)
fullModel.fit(my_generator, epochs=10, steps_per_epoch=int(nbSamples/batchSize))
Here, the generator opens the dataset file, but loads only batch_size examples (not nbSamples examples) each time it is called and slides into the file while looping.
Now, I can build a simple functional architecture with 2 inputs, and no generator:
input1 = Input(shape=2)
dense1 = layers.Dense(5)(input1)
subModel1 = Model(inputs=input1, outputs=dense1)
input2 = Input(shape=3)
dense2 = layers.Dense(5)(input2)
subModel2 = Model(inputs=input2, outputs=dense2)
averageLayer = layers.average([subModel1.output, subModel2.output])
fullModel = Model(inputs=[input1, input2], outputs=averageLayer)
fullModel.summary()
# Generate random examples:
nbSamples = 21
X1 = random.rand(nbSamples, 2)
X2 = random.rand(nbSamples, 3)
Y = random.rand(nbSamples, 5)
fullModel.compile(loss=losses.LogCosh())
fullModel.fit([X1, X2], Y, epochs=10, batch_size=batchSize)
Until here, all models compile and run, but I'm not able to use a generator with the last architecture and its 2 inputs... By trying the following code (which should logically work in my opinion):
# Save data in a file:
with open("./dataset.txt", 'w') as file:
for i in range(nbSamples):
example = {"x1": X1[i].tolist(), "x2": X2[i].tolist(), "y": Y[i].tolist()}
file.write(json.dumps(example) + "\n")
def generator(datasetPath, batch_size, inputSize1, inputSize2, outputSize):
X1_batch = zeros((batch_size, inputSize1))
X2_batch = zeros((batch_size, inputSize2))
Y_batch = zeros((batch_size, outputSize))
i=0
while True:
with open(datasetPath, 'r') as file:
for line in file:
example = json.loads(line)
X1_batch[i] = array(example["x1"])
X2_batch[i] = array(example["x2"])
Y_batch[i] = array(example["y"])
i+=1
if i % batch_size == 0:
yield ([X1_batch, X2_batch], Y_batch)
i=0
fullModel.compile(loss=losses.LogCosh())
my_generator = generator("./dataset.txt", batchSize, 2, 3, 5)
fullModel.fit(my_generator, epochs=10, steps_per_epoch=(nbSamples//batchSize))
I obtain the following error:
File "C:\Anaconda\lib\site-packages\tensorflow_core\python\keras\engine\training.py", line 729, in fit
use_multiprocessing=use_multiprocessing)
File "C:\Anaconda\lib\site-packages\tensorflow_core\python\keras\engine\training_v2.py", line 224, in fit
distribution_strategy=strategy)
File "C:\Anaconda\lib\site-packages\tensorflow_core\python\keras\engine\training_v2.py", line 547, in _process_training_inputs
use_multiprocessing=use_multiprocessing)
File "C:\Anaconda\lib\site-packages\tensorflow_core\python\keras\engine\training_v2.py", line 606, in _process_inputs
use_multiprocessing=use_multiprocessing)
File "C:\Anaconda\lib\site-packages\tensorflow_core\python\keras\engine\data_adapter.py", line 566, in __init__
reassemble, nested_dtypes, output_shapes=nested_shape)
File "C:\Anaconda\lib\site-packages\tensorflow_core\python\data\ops\dataset_ops.py", line 540, in from_generator
output_types, tensor_shape.as_shape, output_shapes)
File "C:\Anaconda\lib\site-packages\tensorflow_core\python\data\util\nest.py", line 471, in map_structure_up_to
results = [func(*tensors) for tensors in zip(*all_flattened_up_to)]
File "C:\Anaconda\lib\site-packages\tensorflow_core\python\data\util\nest.py", line 471, in <listcomp>
results = [func(*tensors) for tensors in zip(*all_flattened_up_to)]
File "C:\Anaconda\lib\site-packages\tensorflow_core\python\framework\tensor_shape.py", line 1216, in as_shape
return TensorShape(shape)
File "C:\Anaconda\lib\site-packages\tensorflow_core\python\framework\tensor_shape.py", line 776, in __init__
self._dims = [as_dimension(d) for d in dims_iter]
File "C:\Anaconda\lib\site-packages\tensorflow_core\python\framework\tensor_shape.py", line 776, in <listcomp>
self._dims = [as_dimension(d) for d in dims_iter]
File "C:\Anaconda\lib\site-packages\tensorflow_core\python\framework\tensor_shape.py", line 718, in as_dimension
return Dimension(value)
File "C:\Anaconda\lib\site-packages\tensorflow_core\python\framework\tensor_shape.py", line 193, in __init__
self._value = int(value)
TypeError: int() argument must be a string, a bytes-like object or a number, not 'tuple'
As explain in the doc, x argument of model.fit() can be A generator or keras.utils.Sequence returning (inputs, targets), and The iterator should return a tuple of length 1, 2, or 3, where the optional second and third elements will be used for y and sample_weight respectively. Thus, I think that it can not take in input more than one generator. Perhaps multiple inputs are not possible with custom generator. Please, would you have an explanation? A solution?
(otherwise, it seems possible to go through tf.data.Dataset.from_generator() with a less custom approach, but I have difficulties to understand what to indicate in the output_signature argument)
[EDIT] Thank you for your response #Francis Tang. In fact, it's possible to use a custom generator, but it allowed me to understand that I just had to change the line:
yield ([X1_batch, X2_batch], Y_batch)
To:
yield (X1_batch, X2_batch), Y_batch
Nevertheless, it is indeed perhaps better to use tf.keras.utils.Sequence. But I find it a bit restrictive.
In particular, I understand in the example given (as well as in most of the examples I could find about Sequence) that __init__() is first used to load the full dataset, which is against the interest of the generator.
But maybe it was a particular example about Sequence(), and there is no need to use __init__() like that: you can directly read a file and load the desired batch into the __getitem__().
In this case, it seems to push to browse each time the data file, or else it is necessary to create a file per batch beforehand (not really optimal).
from tensorflow.python.keras.utils.data_utils import Sequence
class generator(Sequence):
def __init__(self,filename,batch_size):
data = pickle.load(open(filename,'rb'))
self.X1 = data['X1']
self.X2 = data['X2']
self.y = data['y']
self.bs = batch_size
def __len__(self):
return (len(self.y) - 1) // self.bs + 1
def __getitem__(self,idx):
start, end = idx * self.bs, (idx+1) * self.bs
return (self.X1[start:end], self.X2[start:end]), self.y[start:end]
You need to write a class using Sequence: https://www.tensorflow.org/api_docs/python/tf/keras/utils/Sequence

RuntimeError in Pytorch when increasing batch size to more than 1

This code for my custom data loader runs smoothly with batch_size=1, but when I increase batch size I get the following Error:
RuntimeError: Expected object of scalar type Double but got scalar type Long for sequence element 1 in sequence argument at position #1 'tensors'
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use("TkAgg")
import os, h5py
import PIL
#------------------------------
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
#------------------------------
from data_augmentation import *
#------------------------------
dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
class NiftiDataset(Dataset):
def __init__(self,transformation_params,data_path, mode='train',transforms=None ):
"""
Parameters:
data_path (string): Root directory of the preprocessed dataset.
mode (string, optional): Select the image_set to use, ``train``, ``valid``
transforms (callable, optional): Optional transform to be applied
on a sample.
"""
self.data_path = data_path
self.mode = mode
self.images = []
self.labels = []
self.W_maps = []
self.centers = []
self.radiuss = []
self.pixel_spacings = []
self.transformation_params = transformation_params
self.transforms = transforms
#-------------------------------------------------------------------------------------
if self.mode == 'train':
self.data_path = os.path.join(self.data_path,'train_set')
elif self.mode == 'valid':
self.data_path = os.path.join(self.data_path,'validation_set')
#-------------------------------------------------------------------------------------
for _, _, f in os.walk(self.data_path):
for file in f:
hdf_file = os.path.join(self.data_path,file)
data = h5py.File(hdf_file,'r') # Dictionary
# Preprocessing of Input Image and Label
patch_img, patch_gt, patch_wmap = PreProcessData(file, data, self.mode, self.transformation_params)
#print(type(data))
self.images.append(patch_img) # 2D image
#print('image shape is : ',patch_img.shape)
self.labels.append(patch_gt) # 2D label
#print('label shape is : ',patch_img.shape)
self.W_maps.append(patch_wmap) # Weight_Map
# self.centers.append(data['roi_center'][:]) # [x,y]
# self.radiuss.append(data['roi_radii'][:]) # [R_min,R_max]
# self.pixel_spacings.append(data['pixel_spacing'][:]) # [x , y , z]
def __len__(self):
return len(self.images)
def __getitem__(self, index):
image = self.images[index]
label = self.labels[index]
W_map = self.W_maps[index]
if self.transforms is not None:
image, label, W_maps = self.transforms(image, label, W_map)
return image, label, W_map
#=================================================================================================
if __name__ == '__main__':
# Test Routinue to check your threaded dataloader
# ACDC dataset has 4 labels
n_labels = 4
path = './hdf5_files'
batch_size = 1
# Data Augmentation Parameters
# Set patch extraction parameters
size1 = (128, 128)
patch_size = size1
mm_patch_size = size1
max_size = size1
train_transformation_params = {
'patch_size': patch_size,
'mm_patch_size': mm_patch_size,
'add_noise': ['gauss', 'none1', 'none2'],
'rotation_range': (-5, 5),
'translation_range_x': (-5, 5),
'translation_range_y': (-5, 5),
'zoom_range': (0.8, 1.2),
'do_flip': (False, False),
}
valid_transformation_params = {
'patch_size': patch_size,
'mm_patch_size': mm_patch_size}
transformation_params = { 'train': train_transformation_params,
'valid': valid_transformation_params,
'n_labels': 4,
'data_augmentation': True,
'full_image': False,
'data_deformation': False,
'data_crop_pad': max_size}
#====================================================================
dataset = NiftiDataset(transformation_params=transformation_params,data_path=path,mode='train')
dataloader = DataLoader(dataset=dataset,batch_size=2,shuffle=True,num_workers=0)
dataiter = iter(dataloader)
data = dataiter.next()
images, labels,W_map = data
#===============================================================================
# Data Visualization
#===============================================================================
print('image: ',images.shape,images.type(),'label: ',labels.shape,labels.type(),
'W_map: ',W_map.shape,W_map.type())
img = transforms.ToPILImage()(images[0,0,:,:,0].float())
lbl = transforms.ToPILImage()(labels[0,0,:,:].float())
W_mp = transforms.ToPILImage()(W_map [0,0,:,:].float())
plt.subplot(1,3,1)
plt.imshow(img,cmap='gray',interpolation=None)
plt.title('image')
plt.subplot(1,3,2)
plt.imshow(lbl,cmap='gray',interpolation=None)
plt.title('label')
plt.subplot(1,3,3)
plt.imshow(W_mp,cmap='gray',interpolation=None)
plt.title('Weight Map')
plt.show()
I have noticed some strange things such as Tensor types are different even though images and labels and weight maps are images with same type and size.
The Error Traceback:
Traceback (most recent call last):
File "D:\Saudi_CV\Vibot\Smester_2\2_Medical Image analysis\Project_2020\OUR_Project\data_loader.py", line 118, in <module>
data = dataiter.next()
File "F:\Download_2019\Anaconda3\lib\site-packages\torch\utils\data\dataloader.py", line 345, in __next__
data = self._next_data()
File "F:\Download_2019\Anaconda3\lib\site-packages\torch\utils\data\dataloader.py", line 385, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "F:\Download_2019\Anaconda3\lib\site-packages\torch\utils\data\_utils\fetch.py", line 47, in fetch
return self.collate_fn(data)
File "F:\Download_2019\Anaconda3\lib\site-packages\torch\utils\data\_utils\collate.py", line 79, in default_collate
return [default_collate(samples) for samples in transposed]
File "F:\Download_2019\Anaconda3\lib\site-packages\torch\utils\data\_utils\collate.py", line 79, in <listcomp>
return [default_collate(samples) for samples in transposed]
File "F:\Download_2019\Anaconda3\lib\site-packages\torch\utils\data\_utils\collate.py", line 64, in default_collate
return default_collate([torch.as_tensor(b) for b in batch])
File "F:\Download_2019\Anaconda3\lib\site-packages\torch\utils\data\_utils\collate.py", line 55, in default_collate
return torch.stack(batch, 0, out=out)
RuntimeError: Expected object of scalar type Double but got scalar type Long for sequence element 1 in sequence argument at position #1 'tensors'
[Finished in 19.9s with exit code 1]
The problem was solved through this solution explained on this page link
image = torch.from_numpy(self.images[index]).type(torch.FloatTensor)
label = torch.from_numpy(self.labels[index]).type(torch.FloatTensor)
W_map = torch.from_numpy(self.W_maps[index]).type(torch.FloatTensor)

Keras fit_generator() & Input array should have the same as target samples

So far I'm trying to implement the fit-generator for sentiment analysis as I only have a small PGU and big dataset. But I keep getting this error
Using Theano backend.
Can not use cuDNN on context None: cannot compile with cuDNN. We got this error:
b'In file included from C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\include/driver_types.h:53:0,\r\n from C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\include/cudnn.h:63,\r\n from C:\\Users\\Def\\AppData\\Local\\Temp\\try_flags_p2iwer2o.c:4:\r\nC:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\include/host_defines.h:84:0: warning: "__cdecl" redefined\r\n #define __cdecl\r\n ^\r\n<built-in>: note: this is the location of the previous definition\r\nd000029.o:(.idata$5+0x0): multiple definition of `__imp___C_specific_handler\'\r\nd000026.o:(.idata$5+0x0): first defined here\r\nC:/Users/Def/Anaconda3/envs/Final/Library/mingw-w64/bin/../lib/gcc/x86_64-w64-mingw32/5.3.0/../../../../x86_64-w64-mingw32/lib/../lib/crt2.o: In function `__tmainCRTStartup\':\r\nC:/repo/mingw-w64-crt-git/src/mingw-w64/mingw-w64-crt/crt/crtexe.c:285: undefined reference to `_set_invalid_parameter_handler\'\r\ncollect2.exe: error: ld returned 1 exit status\r\n'
Mapped name None to device cuda: GeForce GTX 960M (0000:01:00.0)
Epoch 1/10
Traceback (most recent call last):
File "C:/Users/Def/PycharmProjects/KerasUkExpenditure/TweetParsing.py", line 136, in <module>
epochs=10)
File "C:\Users\Def\Anaconda3\envs\Final\lib\site-packages\keras\legacy\interfaces.py", line 88, in wrapper
return func(*args, **kwargs)
File "C:\Users\Def\Anaconda3\envs\Final\lib\site-packages\keras\models.py", line 1097, in fit_generator
initial_epoch=initial_epoch)
File "C:\Users\Def\Anaconda3\envs\Final\lib\site-packages\keras\legacy\interfaces.py", line 88, in wrapper
return func(*args, **kwargs)
File "C:\Users\Def\Anaconda3\envs\Final\lib\site-packages\keras\engine\training.py", line 1876, in fit_generator
class_weight=class_weight)
File "C:\Users\Def\Anaconda3\envs\Final\lib\site-packages\keras\engine\training.py", line 1614, in train_on_batch
check_batch_axis=True)
File "C:\Users\Def\Anaconda3\envs\Final\lib\site-packages\keras\engine\training.py", line 1307, in _standardize_user_data
_check_array_lengths(x, y, sample_weights)
File "C:\Users\Def\Anaconda3\envs\Final\lib\site-packages\keras\engine\training.py", line 229, in _check_array_lengths
'and ' + str(list(set_y)[0]) + ' target samples.')
ValueError: Input arrays should have the same number of samples as target arrays. Found 1000 input samples and 1 target samples.
I have a matrix that is 1000 elements long since I only have a maximum corpus of 1000 words which is specified in the Tokenizer().
I then have the sentiment which is either a 0 for negative or a 1 for positive.
My question is why do I receive the error? I have tried to use the transform on both the data and labels and I still receive the same error. here is my code.
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.preprocessing.text import Tokenizer
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import re
"""
the amount of samples out to the 1 million to use, my 960m 2GB can only handle
about 30,000ish at the moment depending on a number of neurons in the
deep layer and a number of layers.
"""
maxSamples = 3000
#Load the CSV and get the correct columns
data = pd.read_csv("C:\\Users\\Def\\Desktop\\Sentiment Analysis Dataset1.csv")
dx = pd.DataFrame()
dy = pd.DataFrame()
dy[['Sentiment']] = data[['Sentiment']]
dx[['SentimentText']] = data[['SentimentText']]
dataY = dy.iloc[0:maxSamples]
dataX = dx.iloc[0:maxSamples]
testY = dy.iloc[maxSamples: maxSamples + 1000]
testX = dx.iloc[maxSamples: maxSamples + 1000]
"""
here I filter the data and clean it up by removing # tags, hyperlinks and
also any characters that are not alpha-numeric.
"""
def removeTagsAndLinks(dataframe):
for x in dataframe.iterrows():
#Removes Hyperlinks
x[1].values[0] = re.sub("(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,#?^=%&:/~+#-]*[\w#?^=%&/~+#-])?", "", str(x[1].values[0]))
#Removes # tags
x[1].values[0] = re.sub("#\\w+", '', str(x[1].values[0]))
#keeps only alpha-numeric chars
x[1].values[0] = re.sub("\W+", ' ', str(x[1].values[0]))
return dataframe
xData = removeTagsAndLinks(dataX)
xTest = removeTagsAndLinks(testX)
"""
This loop looks for any Tweets with characters shorter than 2 and once found write the
index of that Tweet to an array so I can remove from the Dataframe of sentiment and the
list of Tweets later
"""
indexOfBlankStrings = []
for index, string in enumerate(xData):
if len(string) < 2:
indexOfBlankStrings.append(index)
for row in indexOfBlankStrings:
dataY.drop(row, axis=0, inplace=True)
"""
This makes a BOW model out of all the tweets then creates a
vector for each of the tweets containing all the words from
the BOW model, each vector is the same size becuase the
network expects it
"""
def vectorise(tokenizer, list):
return tokenizer.fit_on_texts(list)
#Make BOW model and vectorise it
t = Tokenizer(lower=False, num_words=1000)
t.fit_on_texts(dataX.iloc[:,0].tolist())
t.fit_on_texts(dataX.iloc[:,0].tolist())
"""
Here im experimenting with multiple layers of the total
amount of words in the syllabus divided by ^2 - This
has given me quite accurate results compared to random guess's
of amount of neron's.
"""
l1 = int(xData.shape[0] / 4) #To big for my GPU
l2 = int(xData.shape[0] / 8) #To big for my GPU
l3 = int(xData.shape[0] / 16)
l4 = int(xData.shape[0] / 32)
l5 = int(xData.shape[0] / 64)
l6 = int(xData.shape[0] / 128)
#Make the model
model = Sequential()
model.add(Dense(l1, input_dim=xData.shape[1]))
model.add(Dropout(0.15))
model.add(Dense(l2))
model.add(Dropout(0.2))
model.add(Dense(l3))
model.add(Dropout(0.2))
model.add(Dense(l4))
model.add(Dense(1, activation='relu'))
#Compile the model
model.compile(optimizer='RMSProp', loss='binary_crossentropy', metrics=['acc'])
"""
This here will use multiple batches to train the model.
startIndex:
This is the starting index of the array for which you want to
start training the network from.
dataRange:
The number of elements use to train the network in each batch so
since dataRange = 1000 this mean it goes from
startIndex...dataRange OR 0...1000
amountOfEpochs:
This is kinda self explanitory, the more Epochs the more it
is supposed to learn AKA updates the optimisation algo numbers
"""
amountOfEpochs = 1
dataRange = 1000
startIndex = 0
def generator(tokenizer, data, labels, totalSize=maxSamples, startIndex=0):
l = labels.as_matrix()
while True:
for i in range(startIndex, totalSize):
batch_features = tokenizer.texts_to_matrix(xData.iloc[i])
batch_labels = l[i]
yield batch_features, batch_labels
derp = generator(t, data=xData, labels=dataY)
##This runs the model for batch AKA load a little them process then load a little more
for amountOfData in range(1000, maxSamples, 1000):
#(loss, acc) = model.train_on_batch(x=dim[startIndex:amountOfData], y=np.asarray(dataY.iloc[startIndex:amountOfData]))
history = model.fit_generator(generator=generator(tokenizer=t,
data=xData,
labels=dataY),
steps_per_epoch=1,
epochs=10)
Thanks
The problem you are having is that the number of samples in your input array, do not equal the number of samples in your target array. This means the number of rows in you matrices do not match. The problems stems from your generator function. You index the data as
batch_labels = l[i]
which is only returning one sample (row of matrix). When instead it should be something like...
batch_labels = l[i:i+1000]
However there are other problems with your use of the fit_generator. You should not be using this within a loop. I don't see how it is benefiting the program, and calling the fit_generator in a loop defeats the purpose of using a generator. The function you would use to train an an individual batch of data would be
train_on_batch()
as seen in the docs

test and train dataset has different number of features

I am trying to train svm model on some training and test data. Program works well if I combine the test and training data, but If I divide them and test the model accuracy it says
Traceback (most recent call last):
File "/home/PycharmProjects/analysis.py", line 160, in <module>
main()
File "/home/PycharmProjects/analysis.py", line 156, in main
learn_model(tf_idf_train,target,tf_idf_test)
File "/home/PycharmProjects/analysis.py", line 113, in learn_model
predicted = classifier.predict(data_test)
File "/home/.local/lib/python3.4/site-packages/sklearn/svm/base.py", line 573, in predict
y = super(BaseSVC, self).predict(X)
File "/home/.local/lib/python3.4/site-packages/sklearn/svm/base.py", line 310, in predict
X = self._validate_for_predict(X)
File "/home/.local/lib/python3.4/site-packages/sklearn/svm/base.py", line 479, in _validate_for_predict
(n_features, self.shape_fit_[1]))
ValueError: X.shape[1] = 19137 should be equal to 4888, the number of features at training time
Here the test set is larger than trainset. so test set naturally has more number of the feature than trainset.so its giving value error.
here is my code:
def load_train_file():
with open('~1k comments.csv',encoding='ISO-8859-1',) as csv_file:
reader = csv.reader(csv_file,delimiter=",",quotechar='"')
reader.__next__()
data =[]
target = []
for row in reader:
if row[0] and row[1]:
data.append(row[0])
target.append(row[1])
return data,target
def load_file():
with open('comments.csv',encoding='ISO-8859-1',) as csv_file:
reader = csv.reader(csv_file,delimiter=",",quotechar='"')
reader.__next__()
data =[]
target = []
for row in reader:
if row[0] and row[1]:
data.append(row[0])
target.append(row[1])
print(len(data))
return data
# preprocess creates the term frequency matrix for the review data set
def preprocess():
dataTrain,targetTrain = load_train_file()
testData=load_file()
count_vectorizer = CountVectorizer(binary='true')
dataTrain = count_vectorizer.fit_transform(dataTrain)
tfidf_train_data = TfidfTransformer(use_idf=True).fit_transform(dataTrain)
count_vectorizer = CountVectorizer()
testData = count_vectorizer.fit_transform(testData)
tfidf_test_data = TfidfTransformer(use_idf=True).fit_transform(testData)
return tfidf_train_data,tfidf_test_data
def learn_model(data,target,testData):
data_train,data_test,target_train,target_test = cross_validation.train_test_split(data,target,test_size=0.001,random_state=43)
e = np.zeros(testData.shape[0])
data_train1, data_test, target_train1, target_test = cross_validation.train_test_split(testData, e,test_size=.9,random_state=43)
classifier = SVC(gamma=.01, C=100.)
classifier.fit(data_train, target_train)
predicted = classifier.predict(data_test)
for x in range(0,50):
print(testData[x]+str(predicted[x]))
def evaluate_model(target_true,target_predicted):
print (classification_report(target_true,target_predicted))
print ("The accuracy score is {:.2%}".format(accuracy_score(target_true,target_predicted)))
def main():
data,target = load_train_file()
datatest=load_file()
tf_idf_train,tf_idf_test = preprocess()
# print(tf_idf_train.shape())
# print(tf_idf_test.shape())
learn_model(tf_idf_train,target,tf_idf_test)
# learn_model(data,target,datatest)
main()
how can solve this?
The same vectorizer and transformer must be used both for train and for test parts; also, vectorizers shouldn't be fit on test data. So instead of
count_vectorizer = CountVectorizer(binary='true')
dataTrain = count_vectorizer.fit_transform(dataTrain)
tfidf_train_data = TfidfTransformer(use_idf=True).fit_transform(dataTrain)
count_vectorizer = CountVectorizer()
testData = count_vectorizer.fit_transform(testData)
tfidf_test_data = TfidfTransformer(use_idf=True).fit_transform(testData)
use something like this:
count_vectorizer = CountVectorizer(binary=True)
tfidf_transformer = TfidfTransformer(use_idf=True)
dataTrain = count_vectorizer.fit_transform(dataTrain)
tfidf_train_data = transformer.fit_transform(dataTrain)
testData = count_vectorizer.transform(testData)
tfidf_test_data = tfidf_transformer.transform(testData)
You can also use Pipeline to make it nicer:
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(
CountVectorizer(binary=True),
TfidfTransformer(use_idf=True),
)
tfidf_train_data = pipe.fit_transform(dataTrain)
tfidf_test_data = pipe.transform(testData)
Or even use TfidfVectorizer which combines CountVectorizer and TfidfTransformer in a single vectorizer object:
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer(binary=True, use_idf=True)
tfidf_train_data = vec.fit_transform(dataTrain)
tfidf_test_data = vec.transform(testData)

Resources