AttributeError while scraping GDELT data - python-3.x

I am scraping data from GDELT [https://www.gdeltproject.org]. It is a pretty cool project that checks ~100,000 news sites each day, labels all the articles, and makes them available. I am getting an AttributeError while extracting the data. The code used is the following:
import gdelt
gd = gdelt.gdelt(version=1)
from statsmodels.tsa.api import VAR
import pandas as pd
import os
os.makedirs("data",exist_ok=True)
import datetime
cur_date = datetime.datetime(2022,1,10) - datetime.timedelta(days=10)
end_date = datetime.datetime(2022,1,10)
year = cur_date.year
month = str(cur_date.month)
day = str(cur_date.day)
if cur_date.month < 10:
    month = "0" + month
if cur_date.day < 10:
    day = "0" + day
gd.Search(['%s %s %s'%(year, month, day)],table='gkg',coverage=True, translation=False)
I am getting this AttributeError:
AttributeError Traceback (most recent call last)
<ipython-input-10-2f00cabbf1ac> in <module>
----> 1 results = gd.Search(['%s %s %s'%(year, month, day)],table='gkg',coverage=True,
translation=False)
~\anaconda3\lib\site-packages\gdelt\base.py in Search(self, date, table, coverage,
translation, output, queryTime, normcols)
646
647 if self.table == 'gkg' and self.version == 1:
--> 648 results.columns = results.ix[0].values.tolist()
649 results.drop([0], inplace=True)
650 columns = results.columns
~\anaconda3\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)
5463 if self._info_axis._can_hold_identifiers_and_holds_name(name):
5464 return self[name]
-> 5465 return object.__getattribute__(self, name)
5466
5467 def __setattr__(self, name: str, value) -> None:
AttributeError: 'DataFrame' object has no attribute 'ix'
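The traceback shows the failure is inside the gdelt package itself: base.py builds the GKG column names with results.ix, and the .ix indexer was deprecated in pandas 0.20 and removed entirely in pandas 1.0. A hedged workaround, assuming you cannot move to a gdelt release that has already fixed this, is to pin pandas below 1.0 or patch the installed base.py to use the positional .iloc accessor, which performs the same first-row lookup:
# Option 1: install a pandas version that still ships DataFrame.ix
#   pip install "pandas<1.0"
# Option 2 (sketch): edit site-packages/gdelt/base.py around line 648 and
# replace the removed .ix lookup with its positional equivalent:
#   before: results.columns = results.ix[0].values.tolist()
#   after:  results.columns = results.iloc[0].values.tolist()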

Related

I'm getting " ValueError: 111816 is not in range" error when trying to use FuzzyWuzzy between two other dataframe column

I am getting an error when trying to use FuzzyWuzzy between two dataframe columns.
I want to match df_1['name_new'] to df['term'].
Below is the site where I got my code:
https://towardsdatascience.com/fuzzy-string-match-with-python-on-large-dataset-and-why-you-should-not-use-fuzzywuzzy-4ec9f0defcd
#Transform text to vectors with TF-IDF:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=5, token_pattern=r'(\S+)')
tf_idf_matrix_1 = tfidf_vectorizer.fit_transform(df_1['name_new'])
tf_idf_matrix_2 = tfidf_vectorizer.fit_transform(df['term'])
I created tf_idf_matrix_2 to match the other df's 'term' column.
from scipy.sparse import csr_matrix
!pip install sparse_dot_topn
import sparse_dot_topn.sparse_dot_topn as ct
import numpy as np

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
    idx_dtype = np.int32
    nnz_max = M*ntop
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)
    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)
    return csr_matrix((data, indices, indptr), shape=(M, N))
import time
t1 = time.time()
# adjust lower bound: 0.8
# keep top 10 similar results
matches = awesome_cossim_top(tf_idf_matrix_1, tf_idf_matrix_2.transpose(), 10, 0.8)
t = time.time()-t1
print("finished in:", t)
def get_matches_df(sparse_matrix, name_vector, top=10000):
    non_zeros = sparse_matrix.nonzero()
    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    return pd.DataFrame({'name_new_1': left_side,
                         'term_1': right_side,
                         'similairity_score': similairity})
matches_df = pd.DataFrame()
matches_df = get_matches_df(matches, df_1['name_new'], top=10000)
# Remove all exact matches
I get my error like this:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
384 try:
--> 385 return self._range.index(new_key)
386 except ValueError as err:
ValueError: 111816 is not in range
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
4 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/range.py in get_loc(self, key, method, tolerance)
385 return self._range.index(new_key)
386 except ValueError as err:
--> 387 raise KeyError(key) from err
388 raise KeyError(key)
389 return super().get_loc(key, method=method, tolerance=tolerance)
KeyError: 111816
Please help... what is wrong with my code?
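A likely cause, judging only from the code shown (the full data isn't available here): get_matches_df comes from a tutorial that matches a column against itself, so it indexes a single name_vector with both the row and the column positions of the match matrix. Here the rows come from df_1['name_new'] but the columns come from df['term'], so a column position such as 111816 gets looked up in the shorter df_1 series and raises KeyError. Calling fit_transform twice is a second problem, because the two matrices end up with different vocabularies. A sketch of both fixes, keeping the original names:
# Fit the vocabulary once over both columns, then transform each side,
# so the two matrices share a feature space:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=5, token_pattern=r'(\S+)')
tfidf_vectorizer.fit(pd.concat([df_1['name_new'], df['term']]))
tf_idf_matrix_1 = tfidf_vectorizer.transform(df_1['name_new'])
tf_idf_matrix_2 = tfidf_vectorizer.transform(df['term'])

# Pass one vector per side and index each positionally with .iloc:
def get_matches_df(sparse_matrix, left_vector, right_vector, top=10000):
    sparserows, sparsecols = sparse_matrix.nonzero()
    nr_matches = min(top, sparsecols.size) if top else sparsecols.size
    return pd.DataFrame({
        'name_new_1': left_vector.iloc[sparserows[:nr_matches]].values,
        'term_1': right_vector.iloc[sparsecols[:nr_matches]].values,
        'similairity_score': sparse_matrix.data[:nr_matches],
    })

matches_df = get_matches_df(matches, df_1['name_new'], df['term'], top=10000)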

How do I fix a lowercase error in case folding?

I don't understand this error... I've already turned df into lowercase before turning it into a list. The dataframe:
0 Masuk ke Liang Lahat, Rian D’Masiv Makin Sadar... Infotainment Untuk pertama kalinya, Rian masuk ke liang lah...
1 Alasan PPKM, Kuasa Hukum Vicky Prasetyo Sebut ... Infotainment Andai saja persidangan tetap berjalan seperti ...
...
1573 Jessica Iskandar Syok Tahu Kabar Nia Ramadhani... Infotainment “Banyak wartawan juga nanyain. Itu aku baru ba...
1574 Show 10 Menit BTS dalam Koleksi LV Music & Movie BTS melaksanakan ’’tugas’’ perdananya sebagai ...
Code:
import pandas as pd
import numpy as np
import re
import string
import nltk

def load_data():
    dataset = pd.read_csv("jawapos_entertainment.csv")
    return dataset
news_df = load_data()
news_df.head()
df = pd.DataFrame(news_df[['judul_name','judul_kategori','judul_Headline']])
df
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
factory = StopWordRemoverFactory()
stopwords = factory.create_stop_word_remover()
kalimat = df[['judul_name','judul_Headline']]
kalimat = kalimat.lower()
stop = stopwords.remove(kalimat)
print(stop)
But I have an error in this line:
AttributeError Traceback (most recent call last)
<ipython-input-17-ce52d5ec4fb2> in <module>
4
5 kalimat = df [['judul_name','judul_Headline']]
----> 6 kalimat = kalimat.lower()
7
8 stop = stopwords.remove(kalimat)
~\anaconda3\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)
5463 if self._info_axis._can_hold_identifiers_and_holds_name(name):
5464 return self[name]
-> 5465 return object.__getattribute__(self, name)
5466
5467 def __setattr__(self, name: str, value) -> None:
AttributeError: 'DataFrame' object has no attribute 'lower'
But why does the program raise a lowercase error if I've already lowercased the dataframe before?
You can't just call lower() on a DataFrame object. You have to go through the vectorized string methods for Series and Index, pd.Series.str. Converting the whole dataframe to lowercase should look like this:
for columns in kalimat.columns:
    kalimat[columns] = kalimat[columns].str.lower()
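Equivalently, a one-line sketch that applies the same vectorized lowering to every column at once:
kalimat = kalimat.apply(lambda col: col.str.lower())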

IndexError multiprocessing.Pool

I'm getting an IndexError using multiprocessing to process parts of a pandas DataFrame in parallel. vacancies is a pandas DataFrame containing several vacancies, of which one column is the raw text.
import pickle
import numpy as np
import pandas as pd

def addSkillRelevance(vacancies):
    skills = pickle.load(open("skills.pkl", "rb"))
    vacancies['skill'] = ''
    vacancies['skillcount'] = 0
    vacancies['all_skills_in_vacancy'] = ''
    new_vacancies = pd.DataFrame(columns=vacancies.columns)
    for vacancy_index, vacancy_row in vacancies.iterrows():
        # Create a df for which each row is a found skill (with the other attributes of the vacancy)
        per_vacancy_df = pd.DataFrame(columns=vacancies.columns)
        all_skills_in_vacancy = []
        skillcount = 0
        for skill_index, skill_row in skills.iterrows():
            # Making the search for the skill in the text body a bit smarter
            spaceafter = ' ' + skill_row['txn_skill_name'] + ' '
            newlineafter = ' ' + skill_row['txn_skill_name'] + '\n'
            tabafter = ' ' + skill_row['txn_skill_name'] + '\t'
            # Statement that returns true if we find a variation of the skill in the text body
            if (spaceafter in vacancies.at[vacancy_index,'body']) or (newlineafter in vacancies.at[vacancy_index,'body']) or (tabafter in vacancies.at[vacancy_index,'body']):
                # Adding the skill to the list of skills found in the vacancy
                all_skills_in_vacancy.append(skill_row['txn_skill_name'])
                # Increasing the skillcount
                skillcount += 1
                # Adding the skill to the row
                vacancies.at[vacancy_index,'skill'] = skill_row['txn_skill_name']
                # Add a row to the vacancy df where 1 row means 1 skill
                per_vacancy_df = per_vacancy_df.append(vacancies.iloc[vacancy_index])
        # Adding the list of all found skills in the vacancy to each (skill) row
        per_vacancy_df['all_skills_in_vacancy'] = str(all_skills_in_vacancy)
        per_vacancy_df['skillcount'] = skillcount
        # Adds the individual vacancy df to a new vacancy df
        new_vacancies = new_vacancies.append(per_vacancy_df)
    return new_vacancies
def executeSkillScript(vacancies):
    from multiprocessing import Pool
    vacancies = vacancies.head(100298)
    num_workers = 47
    pool = Pool(num_workers)
    vacancy_splits = np.array_split(vacancies, num_workers)
    results_list = pool.map(addSkillRelevance, vacancy_splits)
    new_vacancies = pd.concat(results_list, axis=0)
    pool.close()
    pool.join()

executeSkillScript(vacancies)
The function addSkillRelevance() takes in a pandas DataFrame and outputs a pandas DataFrame (with more columns). For some reason, after finishing all the multiprocessing, I get an IndexError on results_list = pool.map(addSkillRelevance,vacancy_splits). I'm quite stuck as I don't know how to handle the error. Does anyone have tips as to why the IndexError is occurring?
The error:
IndexError Traceback (most recent call last)
<ipython-input-11-7cb04a51c051> in <module>()
----> 1 executeSkillScript(vacancies)
<ipython-input-9-5195d46f223f> in executeSkillScript(vacancies)
14
15 vacancy_splits = np.array_split(vacancies, num_workers)
---> 16 results_list = pool.map(addSkillRelevance,vacancy_splits)
17 new_vacancies = pd.concat(results_list, axis=0)
18
~/anaconda3/envs/amazonei_tensorflow_p36/lib/python3.6/multiprocessing/pool.py in map(self, func, iterable, chunksize)
264 in a list that is returned.
265 '''
--> 266 return self._map_async(func, iterable, mapstar, chunksize).get()
267
268 def starmap(self, func, iterable, chunksize=None):
~/anaconda3/envs/amazonei_tensorflow_p36/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
642 return self._value
643 else:
--> 644 raise self._value
645
646 def _set(self, i, obj):
IndexError: single positional indexer is out-of-bounds
As per the suggestion, the error is coming from this line:
per_vacancy_df = per_vacancy_df.append(vacancies.iloc[vacancy_index])
The error is occurring because vacancy_index is an index label produced by iterrows(), not a position: np.array_split keeps each chunk's original labels, so a positional .iloc lookup with one of those labels runs out of bounds on the smaller chunk.
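A minimal sketch of the fix, assuming the intent is to append the row the outer loop is currently on (label-based .loc instead of positional .iloc, since that row was just updated in place):
# inside the skill loop of addSkillRelevance:
per_vacancy_df = per_vacancy_df.append(vacancies.loc[vacancy_index])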

Error message on certain text inputs using Python

I'm pretty new to Python and I'm currently working on an assignment to implement a movie recommendation system. I have a .csv file that contains various descriptions of a given movie's attributes. I ask the user for a movie title and then the system returns similar movies.
The dataset is named movie_dataset.csv from this folder on GitHub: https://github.com/codeheroku/Introduction-to-Machine-Learning/tree/master/Building%20a%20Movie%20Recommendation%20Engine
The problem I am encountering is that when I ask the user to enter a movie title, the program only works for certain titles.
The code:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#helper functions#
def get_title_from_index(index):
    return df[df.index == index]["title"].values[0]

def get_index_from_title(title):
    return df[df.title == title]["index"].values[0]
df = pd.read_csv("movie_dataset.csv")
#print (df.columns)
features = ['keywords','cast','genres','director']
for feature in features:
    df[feature] = df[feature].fillna('')

def combine_features(row):
    return row['keywords'] + " " + row['cast'] + " " + row['genres'] + " " + row['director']
df["combine_features"] = df.apply(combine_features, axis=1)
#print (df["combine_features"].head())
cv = CountVectorizer()
count_matrix = cv.fit_transform(df["combine_features"])
#MTitle = input("Type in a movie title: ")
cosine_sim = cosine_similarity(count_matrix)
movie_user_likes = 'Avatar'#MTitle
movie_index = get_index_from_title(movie_user_likes)
similar_movies = list(enumerate(cosine_sim[movie_index]))
sorted_similar_movies = sorted(similar_movies, key= lambda x:x[1], reverse=True)
i = 0
for movie in sorted_similar_movies:
    print(get_title_from_index(movie[0]))
    i = i + 1
    if i > 10:
        break
When I enter "Batman" the program runs fine. But when I enter "Harry Potter" I get:
IndexError Traceback (most recent call last)
<ipython-input-51-687ddb420709> in <module>
30 movie_user_likes = MTitle
31
---> 32 movie_index = get_index_from_title(movie_user_likes)
33
34 similar_movies = list(enumerate(cosine_sim[movie_index]))
<ipython-input-51-687ddb420709> in get_index_from_title(title)
8
9 def get_index_from_title(title):
---> 10 return df[df.title == title]["index"].values[0]
11
12 df = pd.read_csv("movie_dataset.csv")
IndexError: index 0 is out of bounds for axis 0 with size 0
There's simply no entry in the database with the exact title "Harry Potter": the lookup df[df.title == title] requires an exact match, so it returns an empty frame and .values[0] fails.
You should add some handling for these cases, such as:
def get_index_from_title(title):
    try:
        return df[df.title == title]["index"].values[0]
    except IndexError:
        return None
Then of course in the calling code you'll have to test if you got a None from the function and act accordingly.
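For example, a small usage sketch of that calling-code check:
movie_index = get_index_from_title(movie_user_likes)
if movie_index is None:
    print("No exact match for %r in the dataset" % movie_user_likes)
else:
    similar_movies = list(enumerate(cosine_sim[movie_index]))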

Ignore unconverted data from date-time stamp when using strptime

A third-party API returns CSV data containing a date-time stamp like this:
dtval = '2016-10-14 05:09:30+00:00'
I have to convert it to the format mm/dd/yyyy.
I'm not sure which directive covers the trailing +XX:XX part:
datetime.datetime.strptime(dtval, "%Y-%m-%d %H:%M:%S+XX:XX").strftime('%m/%d/%Y')
I tried the following, but it did not work:
>>>datetime.datetime.strptime('2016-10-14 05:09:30+00:00', '%Y-%m-%d %H:%M:%S')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/lib/python3.5/_strptime.py", line 500, in _strptime_datetime
tt, fraction = _strptime(data_string, format)
File "/usr/lib/python3.5/_strptime.py", line 340, in _strptime
data_string[found.end():])
ValueError: unconverted data remains: +00:00
>>>datetime.datetime.strptime('2016-10-14 05:09:30+00:00', "%Y-%m-%d %H:%M:%S+%z")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/lib/python3.5/_strptime.py", line 500, in _strptime_datetime
tt, fraction = _strptime(data_string, format)
File "/usr/lib/python3.5/_strptime.py", line 337, in _strptime
(data_string, format))
ValueError: time data '2016-10-14 05:09:30+00:00' does not match format '%Y-%m-%d %H:%M:%S+%z'
Is there any option in Python 3.4+'s datetime module to ignore the remaining unconverted data?
I went through this but did not find any such option.
After a little research I found this fix in Django's source code:
import re
import datetime
from datetime import timedelta, tzinfo, timezone

utc = timezone.utc  # stands in for django.utils.timezone.utc

class FixedOffset(tzinfo):
    """
    Fixed offset in minutes east from UTC. Taken from Python's docs.
    Kept as close as possible to the reference version. __init__ was changed
    to make its arguments optional, according to Python's requirement that
    tzinfo subclasses can be instantiated without arguments.
    """
    def __init__(self, offset=None, name=None):
        if offset is not None:
            self.__offset = timedelta(minutes=offset)
        if name is not None:
            self.__name = name

    def utcoffset(self, dt):
        return self.__offset

    def tzname(self, dt):
        return self.__name

    def dst(self, dt):
        return timedelta(0)

def get_timezone(offset):
    """
    Returns a tzinfo instance with a fixed offset from UTC.
    """
    if isinstance(offset, timedelta):
        offset = offset.seconds // 60
    sign = '-' if offset < 0 else '+'
    hhmm = '%02d%02d' % divmod(abs(offset), 60)
    name = sign + hhmm
    return FixedOffset(offset, name)

def custom_strptime(value):
    """Parses a string and returns a datetime.datetime.
    This function supports time zone offsets. When the input contains one,
    the output uses a timezone with a fixed offset from UTC.
    Raises ValueError if the input is well formatted but not a valid datetime.
    Returns None if the input isn't well formatted.
    """
    datetime_re = re.compile(
        r'(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})'
        r'[T ](?P<hour>\d{1,2}):(?P<minute>\d{1,2})'
        r'(?::(?P<second>\d{1,2})(?:\.(?P<microsecond>\d{1,6})\d{0,6})?)?'
        r'(?P<tzinfo>Z|[+-]\d{2}(?::?\d{2})?)?$'
    )
    match = datetime_re.match(value)
    if match:
        kw = match.groupdict()
        if kw['microsecond']:
            kw['microsecond'] = kw['microsecond'].ljust(6, '0')
        tzinfo = kw.pop('tzinfo')
        if tzinfo == 'Z':
            tzinfo = utc
        elif tzinfo is not None:
            offset_mins = int(tzinfo[-2:]) if len(tzinfo) > 3 else 0
            offset = 60 * int(tzinfo[1:3]) + offset_mins
            if tzinfo[0] == '-':
                offset = -offset
            tzinfo = get_timezone(offset)
        kw = {k: int(v) for k, v in kw.items() if v is not None}
        kw['tzinfo'] = tzinfo
        return datetime.datetime(**kw)
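For the original goal (a mm/dd/yyyy string), a quick usage sketch; note also that on Python 3.7+ the helper is unnecessary, since strptime's %z accepts offsets containing a colon and datetime.fromisoformat() parses this exact format:
dtval = '2016-10-14 05:09:30+00:00'
print(custom_strptime(dtval).strftime('%m/%d/%Y'))  # 10/14/2016
# Python 3.7+ alternatives:
# datetime.datetime.strptime(dtval, '%Y-%m-%d %H:%M:%S%z')
# datetime.datetime.fromisoformat(dtval)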
