object has no attribute error with python3 - python-3.x

I have a error when trying to call calculate_similarity2 function which in in DocSim.py file from my notebook.
The error message is : 'DocSim' object has no attribute 'calculate_similarity2'
Here the content of my docsim File :
import numpy as np
class DocSim(object):
def __init__(self, w2v_model , stopwords=[]):
self.w2v_model = w2v_model
self.stopwords = stopwords
def vectorize(self, doc):
"""Identify the vector values for each word in the given document"""
doc = doc.lower()
words = [w for w in doc.split(" ") if w not in self.stopwords]
word_vecs = []
for word in words:
try:
vec = self.w2v_model[word]
word_vecs.append(vec)
except KeyError:
# Ignore, if the word doesn't exist in the vocabulary
pass
# Assuming that document vector is the mean of all the word vectors
# PS: There are other & better ways to do it.
vector = np.mean(word_vecs, axis=0)
return vector
def _cosine_sim(self, vecA, vecB):
"""Find the cosine similarity distance between two vectors."""
csim = np.dot(vecA, vecB) / (np.linalg.norm(vecA) * np.linalg.norm(vecB))
if np.isnan(np.sum(csim)):
return 0
return csim
def calculate_similarity(self, source_doc, target_docs=[], threshold=0):
"""Calculates & returns similarity scores between given source document & all
the target documents."""
if isinstance(target_docs, str):
target_docs = [target_docs]
source_vec = self.vectorize(source_doc)
results = []
for doc in target_docs:
target_vec = self.vectorize(doc)
sim_score = self._cosine_sim(source_vec, target_vec)
if sim_score > threshold:
results.append({
'score' : sim_score,
'sentence' : doc
})
# Sort results by score in desc order
results.sort(key=lambda k : k['score'] , reverse=True)
return results
def calculate_similarity2(self, source_doc=[], target_docs=[], threshold=0):
"""Calculates & returns similarity scores between given source document & all the target documents."""
if isinstance(source_doc, str):
target_docs = [source_doc]
if isinstance(target_docs, str):
target_docs = [target_docs]
#source_vec = self.vectorize(source_doc)
results = []
for doc in source_doc:
source_vec = self.vectorize(doc)
for doc1 in target_docs:
target_vec = self.vectorize(doc)
sim_score = self._cosine_sim(source_vec, target_vec)
if sim_score > threshold:
results.append({
'score' : sim_score,
'source sentence' : doc,
'target sentence' : doc1
})
# Sort results by score in desc order
results.sort(key=lambda k : k['score'] , reverse=True)
return results
here in instruction code when i try to call the fucntion :
To create DocSim Object
ds = DocSim(word2vec_model,stopwords=stopwords)
sim_scores = ds.calculate_similarity2(source_doc, target_docs)
the error message is :
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-54-bb0bd1e0e0ad> in <module>()
----> 1 sim_scores = ds.calculate_similarity2(source_doc, target_docs)
AttributeError: 'DocSim' object has no attribute 'calculate_similarity2'
i don't undersantand how to resolve this problem.
I can access to all function except calculate_similarity2
Can you help me please?
thanks

You have defined the calculate_similarity2 function inside the __init__ scope. Try getting it out of there

Related

Сompare two faces using python3 module face_recognition?

sorry for my bad english.
I am trying to compare two faces using python3 module 'face_recognition'
here is an example of calculating euclidean distance in python
pdist([vector1, vector2], 'euclidean')
I want to calculate euclidean distance only in SQL query, because all faces(theirs vectors) will be stored in my database, but I do not know how to do this with a SQL query.
Information:
MariaDB version: 10.5.11
Python: 3.9.2
#!/usr/bin/env python3
import cv2
import face_recognition
import mysql.connector as mysql
def get_image_hash(image):
# Open image
img = face_recognition.load_image_file(image)
# Save as black
#img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Get vector
vector = face_recognition.face_encodings(img)[0]
vector = (str(vector),)
return vector
# Open DB
conn = mysql.connect(
host = '127.0.0.1',
user = 'user',
passwd = 'password'
)
cur = conn.cursor()
cur.execute("SHOW DATABASES")
# Check if db 'test' already exist
db_found = False
for db in cur:
if 'test' in db:
db_found = True
if not db_found:
cur.execute("CREATE DATABASE IF NOT EXISTS test;")
conn.commit()
cur.execute("USE test;")
cur.execute("""CREATE TABLE IF NOT EXISTS faces(id_face BIGINT PRIMARY KEY NOT NULL AUTO_INCREMENT, face_hash TEXT)""")
new_image = get_image_hash('test.jpg')
# Add face(array) in DB
cur.execute('''INSERT INTO faces (face_hash) VALUES(%s)''', new_image)
conn.commit()
# Upload a picture for search
find_me_image = get_image_hash('findme.jpg')
#print('d: ', find_me_image[0])
# How should i compare these two arrays in my SQL query to find a similar face?
cur.execute("SELECT * FROM faces WHERE ..... ;")
cur.close()
print('find_me_image: ', str(find_me_image))
print('new_image: ', str(new_image))
Result:
Find_me_image: ('[-0.04221933 0.04460172 0.10287622 -0.14319997 -0.13808066 0.00552465\n -0.04414323 -0.07157505 0.23200855 -0.12091423 0.16892464 -0.16992114\n -0.2487883 0.09141497 -0.14198568 0.1824664 -0.11484738 -0.1130986\n -0.14396232 -0.06075872 -0.00201617 0.07473749 -0.01706937 0.05610432\n -0.11021845 -0.30173326 -0.02712429 -0.10394925 -0.05155517 -0.21909578\n 0.03083897 0.16680503 -0.09715255 -0.0407755 -0.01714687 0.08432341\n -0.01913652 -0.13662203 0.21924476 0.04394831 -0.20848413 -0.03259828\n 0.04784738 0.30321479 0.22730266 -0.02372641 -0.01165112 -0.12765107\n 0.13877977 -0.3403039 0.0424962 0.10813272 0.0511388 0.12078771\n 0.04942191 -0.13038178 0.02736722 0.15339687 -0.24367541 0.10453884\n 0.13450858 -0.09997959 0.01744595 -0.10602434 0.2614505 0.10681546\n -0.12075276 -0.12065229 0.195976 -0.11606392 -0.0447496 0.08198876\n -0.13573587 -0.18409243 -0.19127932 0.01680213 0.35644779 0.16652581\n -0.12988403 -0.00341757 -0.15569599 -0.09128557 -0.03799717 0.09235845\n 0.06296059 -0.07972728 0.00744779 0.07452074 0.23394027 -0.0726112\n -0.00072305 0.2978259 -0.01452125 -0.06529554 -0.08694689 0.01903715\n -0.14941891 0.10714116 -0.1096215 0.00143995 0.00146057 0.00348109\n 0.06795555 0.10826397 -0.18627991 0.21965174 -0.04136307 -0.01491791\n 0.03774849 -0.07495191 -0.03808937 -0.02331351 0.29242265 -0.23740929\n 0.13265632 0.1274993 0.17672779 0.11845816 0.01477844 0.07670261\n 0.11437597 -0.03779818 -0.21296507 0.03480547 0.06180557 -0.01749492\n -0.023851 0.11586148]',)
New_image: ('[-0.04221933 0.04460172 0.10287622 -0.14319997 -0.13808066 0.00552465\n -0.04414323 -0.07157505 0.23200855 -0.12091423 0.16892464 -0.16992114\n -0.2487883 0.09141497 -0.14198568 0.1824664 -0.11484738 -0.1130986\n -0.14396232 -0.06075872 -0.00201617 0.07473749 -0.01706937 0.05610432\n -0.11021845 -0.30173326 -0.02712429 -0.10394925 -0.05155517 -0.21909578\n 0.03083897 0.16680503 -0.09715255 -0.0407755 -0.01714687 0.08432341\n -0.01913652 -0.13662203 0.21924476 0.04394831 -0.20848413 -0.03259828\n 0.04784738 0.30321479 0.22730266 -0.02372641 -0.01165112 -0.12765107\n 0.13877977 -0.3403039 0.0424962 0.10813272 0.0511388 0.12078771\n 0.04942191 -0.13038178 0.02736722 0.15339687 -0.24367541 0.10453884\n 0.13450858 -0.09997959 0.01744595 -0.10602434 0.2614505 0.10681546\n -0.12075276 -0.12065229 0.195976 -0.11606392 -0.0447496 0.08198876\n -0.13573587 -0.18409243 -0.19127932 0.01680213 0.35644779 0.16652581\n -0.12988403 -0.00341757 -0.15569599 -0.09128557 -0.03799717 0.09235845\n 0.06296059 -0.07972728 0.00744779 0.07452074 0.23394027 -0.0726112\n -0.00072305 0.2978259 -0.01452125 -0.06529554 -0.08694689 0.01903715\n -0.14941891 0.10714116 -0.1096215 0.00143995 0.00146057 0.00348109\n 0.06795555 0.10826397 -0.18627991 0.21965174 -0.04136307 -0.01491791\n 0.03774849 -0.07495191 -0.03808937 -0.02331351 0.29242265 -0.23740929\n 0.13265632 0.1274993 0.17672779 0.11845816 0.01477844 0.07670261\n 0.11437597 -0.03779818 -0.21296507 0.03480547 0.06180557 -0.01749492\n -0.023851 0.11586148]',)
New:
#!/usr/bin/env python3
import cv2
import json
import face_recognition
import mysql.connector as mysql
# DB
conn = mysql.connect(
host = 'localhost',
user = '',
passwd = ''
)
def load(str_data):
str_data = str_data.replace("[", "").replace("]", "")
result = []
for i, line in enumerate(str_data.split("\n")):
result.append([])
for element in line.replace(" ", " ").split(" "):
try:
result[i].append(float(element))
except ValueError:
pass
return result
def distance(model, test):
distance = 0
for i, line in enumerate(model):
dist_line = 0
for j, element in enumerate(line):
dist_line += (element - test[i][j]) ** 2
distance += dist_line ** 0.5
return distance
def get_image_hash(image):
# Open image
img = face_recognition.load_image_file(image)
# Save as black
#img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Get vector
vector = face_recognition.face_encodings(img)[0]
# We can save only iterable object
vector = (str(vector),)
return vector
cur = conn.cursor(buffered=True)
cur.execute("SHOW DATABASES")
# Check if db 'test' already exist
db_found = False
for db in cur:
if 'test' in db:
db_found = True
if not db_found:
cur.execute("CREATE DATABASE IF NOT EXISTS test;")
conn.commit()
cur.execute("USE test;")
cur.execute("""CREATE TABLE IF NOT EXISTS faces(id_face BIGINT PRIMARY KEY NOT NULL AUTO_INCREMENT, face_hash TEXT)""")
# Add face in DB
new_image = get_image_hash('test.jpg')
print('new_image debug: ', new_image)
cur.execute('''INSERT INTO faces (face_hash) VALUES(%s)''', new_image)
conn.commit()
# Find added face
find_me_image = get_image_hash('findme.jpg')
print('debug find_me_image: ', find_me_image)
# Get data from DB
cur.execute("SELECT * FROM faces;")
face_data = cur.fetchall()
# Check
for x in face_data:
print('1: ', load(find_me_image[0]))
print('2: ', load(x[1]))
# x[1] == row face_hash
compare_result = distance(load(find_me_image[0]), load(x[1]))
#print('Result: ', compare_result)
# Got error
'''
Traceback (most recent call last):
File "/home/user/Desktop/parser_steam/image_recognition/test/./test.py", line 102, in <module>
compare_result = distance(load(find_me_image[0]), load(x[1]))
File "/home/user/Desktop/parser_steam/image_recognition/test/./test.py", line 35, in distance
dist_line += (element - test[i][j]) ** 2
IndexError: list index out of range
'''
cur.close()
Error:
Here is what you need!
import json
def load(str_data):
str_data = str_data.replace("[", "").replace("]", "")
result = []
for i, line in enumerate(str_data.split("\n")):
result.append([])
for element in line.replace(" ", " ").split(" "):
try:
result[i].append(float(element))
except ValueError:
pass
return result
def distance(model, test):
distance = 0
for i, line in enumerate(model):
dist_line = 0
for j, element in enumerate(line):
dist_line += (element - test[i][j]) ** 2
distance += dist_line ** 0.5
return distance
if __name__ == "__main__":
Find_me_image = '[-0.04221933 0.04460172 0.10287622 -0.14319997 -0.13808066 0.00552465\n -0.04414323 -0.07157505 0.23200855 -0.12091423 0.16892464 -0.16992114\n -0.2487883 0.09141497 -0.14198568 0.1824664 -0.11484738 -0.1130986\n -0.14396232 -0.06075872 -0.00201617 0.07473749 -0.01706937 0.05610432\n -0.11021845 -0.30173326 -0.02712429 -0.10394925 -0.05155517 -0.21909578\n 0.03083897 0.16680503 -0.09715255 -0.0407755 -0.01714687 0.08432341\n -0.01913652 -0.13662203 0.21924476 0.04394831 -0.20848413 -0.03259828\n 0.04784738 0.30321479 0.22730266 -0.02372641 -0.01165112 -0.12765107\n 0.13877977 -0.3403039 0.0424962 0.10813272 0.0511388 0.12078771\n 0.04942191 -0.13038178 0.02736722 0.15339687 -0.24367541 0.10453884\n 0.13450858 -0.09997959 0.01744595 -0.10602434 0.2614505 0.10681546\n -0.12075276 -0.12065229 0.195976 -0.11606392 -0.0447496 0.08198876\n -0.13573587 -0.18409243 -0.19127932 0.01680213 0.35644779 0.16652581\n -0.12988403 -0.00341757 -0.15569599 -0.09128557 -0.03799717 0.09235845\n 0.06296059 -0.07972728 0.00744779 0.07452074 0.23394027 -0.0726112\n -0.00072305 0.2978259 -0.01452125 -0.06529554 -0.08694689 0.01903715\n -0.14941891 0.10714116 -0.1096215 0.00143995 0.00146057 0.00348109\n 0.06795555 0.10826397 -0.18627991 0.21965174 -0.04136307 -0.01491791\n 0.03774849 -0.07495191 -0.03808937 -0.02331351 0.29242265 -0.23740929\n 0.13265632 0.1274993 0.17672779 0.11845816 0.01477844 0.07670261\n 0.11437597 -0.03779818 -0.21296507 0.03480547 0.06180557 -0.01749492\n -0.023851 0.11586148]'
New_image = '[-0.04221933 0.04460172 0.10287622 -0.14319997 -0.13808064 0.00552465\n -0.04414323 -0.07157505 0.23200855 -0.12091423 0.16892464 -0.16992114\n -0.2487883 0.09141497 -0.14198568 0.18246 -0.11484738 -0.1130986\n -0.14396232 -0.06075872 -0.0020117 0.07473749 -0.01706937 0.05610432\n -0.11021845 -0.30173326 -0.02712429 -0.10394925 -0.05155517 -0.21909578\n 0.03083897 0.16680503 -0.09715255 -0.0407755 -0.01714687 0.08432341\n -0.01913652 -0.13662203 0.21924476 0.04394831 -0.20848413 -0.03259828\n 0.04784738 0.30321479 0.22730266 -0.02372641 -0.0116112 -0.12765107\n 0.13877977 -0.3403039 0.0424962 0.10813272 0.0511388 0.12078771\n 0.04942191 -0.13038178 0.02736722 0.15339687 -0.24367541 0.10453884\n 0.13450858 -0.09997959 0.01744595 -0.10602434 0.2614505 0.10681546\n -0.12075276 -0.12065229 0.195976 -0.11606392 -0.0447496 0.08198876\n -0.1357387 -0.18409243 -0.19127932 0.01680213 0.35644779 0.16652581\n -0.12988403 -0.00341757 -0.15569599 -0.09128557 -0.03799717 0.09235845\n 0.06296059 -0.07972728 0.00744779 0.07452074 0.23394027 -0.0726112\n -0.00072305 0.2978259 -0.01452125 -0.06529554 -0.08694689 0.0193715\n -0.14941891 0.10714116 -0.1096215 0.00143995 0.00146057 0.00348109\n 0.06795555 0.10826397 -0.18627991 0.21965174 -0.04136307 -0.01491791\n 0.03774849 -0.07495191 -0.03808937 -0.02331351 0.29242265 -0.23740929\n 0.13265632 0.1274993 0.1762779 0.11845816 0.01477844 0.07670261\n 0.11437597 -0.03779818 -0.21296507 0.03480547 0.0618057 -0.01749492\n -0.023851 0.1158648]'
print(distance(
load(Find_me_image),
load(New_image)
))
You need first to convert your datas using load function. Then calculate distance using distance function.
As your datas are the sames, I modify New_image datas to test the function.

Error message on certain text inputs using Python

I'm pretty new to python and I'm currently working on an assignment to implement a movie recommendation system. I have a .csv file that contains various descriptions of a given movie's attribute. I ask the user for a movie title and then the system returns similar movies.
The dataset is named movie_dataset.csv from this folder on GitHub: https://github.com/codeheroku/Introduction-to-Machine-Learning/tree/master/Building%20a%20Movie%20Recommendation%20Engine
The problem I am encountering is that when I ask the user to enter a movie title, the program only works for certain titles.
The code:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#helper functions#
def get_title_from_index(index):
return df[df.index == index]["title"].values[0]
def get_index_from_title(title):
return df[df.title == title]["index"].values[0]
df = pd.read_csv("movie_dataset.csv")
#print (df.columns)
features = ['keywords','cast','genres','director']
for feature in features:
df[feature] = df[feature].fillna('')
def combine_features(row):
return row['keywords'] +" "+ row['cast'] +" "+ row['genres'] +" "+ row['director']
df["combine_features"] = df.apply(combine_features, axis=1)
#print (df["combine_features"].head())
cv = CountVectorizer()
count_matrix = cv.fit_transform(df["combine_features"])
#MTitle = input("Type in a movie title: ")
cosine_sim = cosine_similarity(count_matrix)
movie_user_likes = 'Avatar'#MTitle
movie_index = get_index_from_title(movie_user_likes)
similar_movies = list(enumerate(cosine_sim[movie_index]))
sorted_similar_movies = sorted(similar_movies, key= lambda x:x[1], reverse=True)
i = 0
for movie in sorted_similar_movies:
print (get_title_from_index(movie[0]))
i=i+1
if i>10:
break
When I enter "Batman" the program runs fine. But when I run "Harry Potter" I get:
IndexError Traceback (most recent call last)
<ipython-input-51-687ddb420709> in <module>
30 movie_user_likes = MTitle
31
---> 32 movie_index = get_index_from_title(movie_user_likes)
33
34 similar_movies = list(enumerate(cosine_sim[movie_index]))
<ipython-input-51-687ddb420709> in get_index_from_title(title)
8
9 def get_index_from_title(title):
---> 10 return df[df.title == title]["index"].values[0]
11
12 df = pd.read_csv("movie_dataset.csv")
IndexError: index 0 is out of bounds for axis 0 with size 0
There's simply no entry in the data base for the movie "Harry Potter"
You should add some testing for these cases such as:
def get_index_from_title(title):
try:
return df[df.title == title]["index"].values[0]
except IndexError:
return None
Then of course in the calling code you'll have to test if you got a None from the function and act accordingly.

keras IndexError: list index out of range

I'm encoding a keras layer named A where I need Two tensor a,b . How should I do to fix the inputs [a,b] to get a correct return (the return of A are also tow tesor)?
def get_model(latent_dim):
# Input variables
u = Input(shape=(k,), dtype='float32', name = 'u]')
i = Input(shape=(k,), dtype='float32', name = 'i]')
LA=A(latent_dim)
list = A([u ,i])
u_return = list_co[-2]
i_return = list_co[-1]
encoding environment : For simply , I just set A as flows:
class A(latent_dim):
def call(inputs):
m = inputs[0]
n = inputs[1]
return [m,n]
but it still wrong
File "C:\ProgramData\Anaconda3\envs\network\lib\site-packages\keras\engine\topology.py", line 703, in _add_inbound_node
output_tensors[i]._keras_shape = output_shapes[i]
IndexError: list index out of range

PdfMiner: Erro processing the page literal required: /b'begin'

I am trying to read .pdf file using python3 with package called pdfminer which I have done successfully but for some of the page in .pdf file while reading the page using interpreter.process_page in getAllPages() of the following code I am getting an errors as follows:
error processing the page literal required: /b'begin'.
error processing the page Unknown operator: 'Qq'.
This is happening only for few docs but not able to find out what is the problem , in which case this could happen?
Code:-
class PDFDoc():
def __init__(self):
self.rsrcmgr = PDFResourceManager()
self.laparams = LAParams()
self.device = PDFPageDetailedAggregator(self.rsrcmgr, laparams=self.laparams)
self.interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
self.doc_values = []
self.total_no_of_pages = 0
self.doc_page_dict = collections.OrderedDict()
# self.doc = None
"""
Read PDF Document
"""
def readDoc(self, doc_name):
fp = open(doc_name, 'rb')
self.parser = PDFParser(fp)
self.doc = PDFDocument(self.parser)
"""
Read all pages in the document and saved in List of tuples format.
It contains the text and their coordinate info along with page number
"""
def getAllPages(self):
for page in PDFPage.create_pages(self.doc):
self.interpreter.process_page(page)
# receive the LTPage object for this page
self.device.get_result()
self.doc_values = self.device.rows
"""
Get the total number of pages
"""
def getTotalPages(self):
self.total_no_of_pages = max(self.doc_page_dict)+1
"""
Convert the document info into Page-wise dict. {Key:Value}-->{Page no:[Page text, coordinates]}
"""
def getPageDict(self):
for i in range(len(self.doc_values)):
left = self.doc_values[i][1]
bottom = self.doc_values[i][2]
content = self.doc_values[i][-1]
if self.doc_page_dict.get(self.doc_values[i][0]):
self.doc_page_dict[self.doc_values[i][0]].append({'left':left, 'bottom':bottom, 'content':content})
else:
self.doc_page_dict[self.doc_values[i][0]]=[{'left':left, 'bottom':bottom, 'content':content}]
"""
Align the page text in case they are misaligned
"""
def create_page_table_modified(self, pagedict_list):
# ##print(pagedict_list)
page_dict = collections.OrderedDict()
page_table_1 = []
page_table = []
exc_arr = []
count = 0
for line in pagedict_list:
row = []
temp_key = float(line['bottom'])
if not line in exc_arr and line["content"]:
row.append(line)
exc_arr.append(line)
for line_1 in pagedict_list:
if not line_1 in exc_arr and line_1["content"]:
# #print('last_top:', last_top, each_dict_adjusted['bottom'])
if abs(int(line["bottom"]) - int(line_1["bottom"])) <= 6:
row.append(line_1)
exc_arr.append(line_1)
if row:
page_dict[temp_key] = row
page_table.append(row)
count += 1
# ##print("\n\nPage:",page_table)
page_dict_keys = sorted(page_dict, reverse=True)
for i in page_dict_keys:
# i = sorted(i, key=lambda k: k['left'])
page_table_1.append(page_dict[i])
return page_table_1
"""
Sort the line elements based on its position coordinates
"""
def sortRowElements(self,row_list):
return sorted(row_list, key=lambda k:k['left'])
"""
Combine line elements to form the line text
"""
def combineText(self, row):
temp_ = []
# for i in range(len(row)):
text = [k['content'] for k in row]
temp_.append(' '.join(text))
return ' '.join(temp_)
"""
To call aligning and sorting functions
"""
def sortText(self):
for page in self.doc_page_dict:
self.doc_page_dict[page] = self.create_page_table_modified(self.doc_page_dict[page])
self.doc_page_dict[page] = [self.sortRowElements(line) for line in self.doc_page_dict[page]]
"""
To get text from particular page of the document --> List of line text
"""
def pageText(self, page_no):
page_text = [self.combineText(line) for line in self.doc_page_dict[page_no]]
return page_text
read_document = PDFDoc()

'Word2Vec' object has no attribute 'index2word'

I'm getting this error "AttributeError: 'Word2Vec' object has no attribute 'index2word'" in following code in python. Anyone knows how can I solve it?
Acctually "tfidf_weighted_averaged_word_vectorizer" throws the error. "obli.csv" contains line of sentences.
Thank you.
from feature_extractors import tfidf_weighted_averaged_word_vectorizer
dataset = get_data2()
corpus, labels = dataset.data, dataset.target
corpus, labels = remove_empty_docs(corpus, labels)
# print('Actual class label:', dataset.target_names[labels[10]])
train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(corpus,
labels,
test_data_proportion=0.3)
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(train_corpus)
vocab = tfidf_vectorizer.vocabulary_
tfidf_wv_train_features = tfidf_weighted_averaged_word_vectorizer(corpus=tokenized_train,
tfidf_vectors=tfidf_train_features,
tfidf_vocabulary=vocab,
model=model,
num_features=100)
def get_data2():
obli = pd.read_csv('db/obli.csv').values.ravel().tolist()
cl0 = [0 for x in range(len(obli))]
nonObli = pd.read_csv('db/nonObli.csv').values.ravel().tolist()
cl1 = [1 for x in range(len(nonObli))]
all = obli + nonObli
db = Db(all,cl0 + cl1)
db.data = all
db.target = cl0 + cl1
return db
This is code from chapter 4 of Text Analytics for Python by Dipanjan Sarkar.
index2word in gensim has been moved since that text was published.
Instead of model.index2word you should use model.wv.index2word.

Resources