IndexError: list index out of range on ontology annotation (python-3.x)

I am adding an annotation to an ontology using the following code:
from owlready import *
onto = get_ontology("C://Users//sharm//Documents//ISWC2020//Ontology_read_play//Covid_v1.owl")
ANNOTATIONS[Thing].add_annotation("comment", "My comment")
onto.save()
It raises an IndexError when I save it using onto.save():
IndexError                                Traceback (most recent call last)
<module>
----> onto.save()

C:\ProgramData\Anaconda3\lib\site-packages\owlready\__init__.py in save(self, filename)
    282     owl = to_owl(self)
    283     if filename: f = open(filename, "w")
--> 284     else: f = _open_onto_file(self.base_iri, self.name, "w")
    285     print("* Owlready * Saving ontology %s to %s..." % (self.name, getattr(f, "name", "???")), file = sys.stderr)
    286     f.write(owl)

C:\ProgramData\Anaconda3\lib\site-packages\owlready\__init__.py in _open_onto_file(base_iri, name, mode, only_local)
    199     if os.path.exists(filename): return open(filename, mode)
    200     if (mode == "r") and not only_local: return urllib.request.urlopen(base_iri)
--> 201     if (mode == "w"): return open(os.path.join(onto_path[0], "%s.owl" % name), "w")
    202     raise FileNotFoundError
    203

IndexError: list index out of range

A quick look at the docs (https://pythonhosted.org/Owlready2/onto.html) shows that if you don't specify a file or filename in the save method, it uses the first path in the onto_path module variable. You never set it up, and onto_path starts empty, which is why onto_path[0] raises an IndexError.
Pass the appropriate filename parameter to the save method, and check the docs next time you have a problem.
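For example, either of the following should avoid the empty onto_path (a minimal sketch: the save(filename) signature comes from the traceback above, and the path is the one from the question):

# Option 1: pass the target file explicitly, so onto_path is never consulted.
onto.save("C://Users//sharm//Documents//ISWC2020//Ontology_read_play//Covid_v1.owl")

# Option 2: give onto_path a fallback directory before calling save().
# (onto_path is a module-level list; depending on your owlready version you
# may need to import it explicitly: from owlready import onto_path)
onto_path.append("C://Users//sharm//Documents//ISWC2020//Ontology_read_play")
onto.save()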

Related

KeyError: 1 when trying to do sentiment analysis with Python

This is the error info:
KeyError                                  Traceback (most recent call last)
D:\python\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   3079             try:
-> 3080                 return self._engine.get_loc(casted_key)
   3081             except KeyError as err:

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 1

The above exception was the direct cause of the following exception:
And this is the code I am using for practice, from GitHub:
#Define the main sentiment analysis function
def sentiment_check(file):
    with open(file, 'r') as myfile:
        file_content = myfile.read()
    #Tokenise the management discussion using NLTK
    file_content_tokenized = nltk.word_tokenize(file_content)
    #Create a frequency distribution table of word tokens
    freq = pd.Series(nltk.FreqDist(file_content_tokenized)).sort_values(ascending=False)
    #print('Most popular 10 stop words', freq.iloc[0:10])
    #print('fraction of total word count that are stop words:', freq.iloc[0:10].sum()/freq.sum())
    #The top 10 most common words have been identified as stop words.
    #These are words like: 'The', 'Ok', etc.
    stopwords = pd.Series(freq.iloc[0:10].index)
    #Remove stop words
    file_content_tokenized = pd.Series([x for x in file_content_tokenized if x not in stopwords.values]).str.lower()
    #Load the Loughran and McDonald dictionaries.
    #These dictionaries are specially built for textual analysis of financial statements.
    #More details on this in the README.md
    pos = pd.read_csv('POSITIVE.txt', squeeze=True).str.lower()
    neg = pd.read_csv('NEGATIVE.txt', squeeze=True).str.lower()
    positive_words = file_content_tokenized.isin(pos).sum()
    negative_words = file_content_tokenized.isin(neg).sum()
    #Total positive & negative words in the statement
    #print("Positive Words:", positive_words)
    #print("Negative Words:", negative_words)
    sentiment_score = (positive_words - negative_words) / file_content_tokenized.count()
    print("for", file.rstrip('.txt'), "(positive words - negative words)/total words:", sentiment_score)
    print("for", file.rstrip('.txt'), "negative words/total words:", negative_words / file_content_tokenized.count())
    #print((positive_words - negative_words)/file_content_tokenized.count())
    nnn_words = pd.DataFrame(file_content_tokenized.isin(['no', 'not', 'never']))
    nnn_words = nnn_words[nnn_words.iloc[:, 0]]
    nnn_words['idx'] = nnn_words.index.values
    nnn_words['words'] = file_content_tokenized[nnn_words['idx']]
    pos_after_neg = nnn_words.apply(pos_after_negator, axis=1, args=(pos.values, file_content_tokenized)).dropna()
    print('+ve words after a negator:', pos_after_neg.values)
    print('')
    return sentiment_score

def pos_after_negator(row, pos, file_content_tokenized):
    #pos = pd.read_csv('LM_pos_words.txt', squeeze=True).str.lower()
    #print(row)
    string = row['words']
    #print(file_content_tokenized.get(row[1]+1, ''))
    string += ' ' + str(file_content_tokenized.get(row[1]+1, ''))
    if file_content_tokenized.get(row[1]+1, '') in pos:
        return string
    string += ' ' + str(file_content_tokenized.get(row[1]+2, ''))
    if file_content_tokenized.get(row[1]+2, '') in pos:
        return string
    string += ' ' + str(file_content_tokenized.get(row[1]+3, ''))
    if file_content_tokenized.get(row[1]+3, '') in pos:
        return string
    #print(string)
    return None

def driver():
    #I have extracted the Management Discussion section from the last five 10-K annual reports and placed them in the data folder
    path = "D:\history data\Dissertation\MDA copy"
    files = [s for s in os.listdir(path) if s.endswith('.txt')]
    year = pd.Series([], dtype=pd.StringDtype())
    sentiment = pd.Series([], dtype=pd.StringDtype())
    for file in files:
        year = year.append(pd.Series([int(file.split('.')[0])]))
        sentiment = sentiment.append(pd.Series([sentiment_check(path + '\\' + file)]))
    return (year, sentiment)

#Run for the last five years
year, sentiment = driver()
I'm new to Python and this error has been bothering me for hours. Please help! I have no idea where this code could go wrong, so I've put all of my code here in case I'm missing the true cause. (Sorry for the messy format.)

Pytrends is only implemented at city level for the USA (and without geocode)

I have been trying pytrends, and I discovered that interest_by_region with CITY resolution is only implemented for the USA:
if self.geo == '':
    self.interest_by_region_widget['request'][
        'resolution'] = resolution
elif self.geo == 'US' and resolution in ['DMA', 'CITY', 'REGION']:
    self.interest_by_region_widget['request'][
        'resolution'] = resolution
I tried to discover what is missing in the code for other countries, but I was not able to find it; from the piece of code above, I only know that it works for the USA. Furthermore, I know that I can specify the city level in Google Trends itself. Can someone help me find which part of pytrends I have to implement?
EDIT:
I implemented the suggestion of @mcskinner (+1), which really makes things simpler (but I got the same problem as with my hack). Now my code is:
import json
import pandas as pd
from pytrends.request import TrendReq
#from request import TrendReq

class MyTrendReq(TrendReq):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def interest_by_region(self, resolution='COUNTRY', inc_low_vol=False,
                           inc_geo_code=False):
        """Request data from Google's Interest by Region section and return a dataframe"""
        # make the request
        region_payload = dict()
        if self.geo == '':
            self.interest_by_region_widget['request']['resolution'] = resolution
        elif self.geo == 'US' and resolution in ['DMA', 'CITY', 'REGION']:
            self.interest_by_region_widget['request']['resolution'] = resolution
        elif len(self.geo) == 2 and resolution in ['CITY', 'REGION']:
            self.interest_by_region_widget['request']['resolution'] = resolution
        self.interest_by_region_widget['request'][
            'includeLowSearchVolumeGeos'] = inc_low_vol
        # convert to string as requests will mangle
        region_payload['req'] = json.dumps(
            self.interest_by_region_widget['request'])
        region_payload['token'] = self.interest_by_region_widget['token']
        region_payload['tz'] = self.tz
        # parse returned json
        req_json = self._get_data(
            url=TrendReq.INTEREST_BY_REGION_URL,
            method=TrendReq.GET_METHOD,
            trim_chars=5,
            params=region_payload,
        )
        df = pd.DataFrame(req_json['default']['geoMapData'])
        if df.empty:
            return df
        # rename the column with the search keyword
        df = df[['geoName', 'geoCode', 'value']].set_index(
            ['geoName']).sort_index()
        # split list columns into separate ones, remove brackets and split on comma
        result_df = df['value'].apply(lambda x: pd.Series(
            str(x).replace('[', '').replace(']', '').split(',')))
        if inc_geo_code:
            result_df['geoCode'] = df['geoCode']
        # rename each column with its search term
        for idx, kw in enumerate(self.kw_list):
            result_df[kw] = result_df[idx].astype('int')
            del result_df[idx]
        return result_df

#import pytrends
if __name__ == "__main__":
    pytrend = MyTrendReq()
    pytrend.build_payload(kw_list=['BMW'], geo='BR', timeframe='2019-03-01 2020-03-02')
    # df = pytrend.interest_by_region(resolution='REGION', inc_low_vol=True, inc_geo_code=True)
    df = pytrend.interest_by_region(resolution='CITY', inc_low_vol=True, inc_geo_code=True)
#import pytrends
if __name__=="__main__":
pytrend = MyTrendReq()
pytrend.build_payload(kw_list=['BMW'],geo='BR',timeframe='2019-03-01 2020-03-02')
# df = pytrend.interest_by_region(resolution='REGION', inc_low_vol=True, inc_geo_code=True)
df = pytrend.interest_by_region(resolution='CITY', inc_low_vol=True, inc_geo_code=True)
I got the following error (it seems that something is missing, although I am able to do this kind of search manually in Google Trends):
runfile('/home/daniel/Documents/caju/testingPytrendsStackoverflow.py', wdir='/home/daniel/Documents/caju')
Traceback (most recent call last):
File "<ipython-input-8-3a8c4f9b3a66>", line 1, in <module>
runfile('/home/daniel/Documents/caju/testingPytrendsStackoverflow.py', wdir='/home/daniel/Documents/caju')
File "/usr/lib/python3/dist-packages/spyder/utils/site/sitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "/usr/lib/python3/dist-packages/spyder/utils/site/sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "/home/daniel/Documents/caju/testingPytrendsStackoverflow.py", line 72, in <module>
df = pytrend.interest_by_region(resolution='CITY', inc_low_vol=True, inc_geo_code=True)
File "/home/daniel/Documents/caju/testingPytrendsStackoverflow.py", line 53, in interest_by_region
df = df[['geoName', 'geoCode', 'value']].set_index(
File "/home/daniel/.local/lib/python3.6/site-packages/pandas/core/frame.py", line 2986, in __getitem__
indexer = self.loc._convert_to_indexer(key, axis=1, raise_missing=True)
File "/home/daniel/.local/lib/python3.6/site-packages/pandas/core/indexing.py", line 1285, in _convert_to_indexer
return self._get_listlike_indexer(obj, axis, **kwargs)[1]
File "/home/daniel/.local/lib/python3.6/site-packages/pandas/core/indexing.py", line 1092, in _get_listlike_indexer
keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing
File "/home/daniel/.local/lib/python3.6/site-packages/pandas/core/indexing.py", line 1185, in _validate_read_indexer
raise KeyError("{} not in index".format(not_found))
KeyError: "['geoCode'] not in index"
If, in my code, I replace
df = pytrend.interest_by_region(resolution='CITY', inc_low_vol=True, inc_geo_code=True)
with
df = pytrend.interest_by_region(resolution='REGION', inc_low_vol=True, inc_geo_code=True)
it works.
EDIT 2:
@mcskinner is right.
If I set inc_geo_code=False and comment out
# df = df[['geoName', 'geoCode', 'value']].set_index(
#     ['geoName']).sort_index()
it works, but I lose the city information:
     BMW
0    100
1     90
2     88
3     88
4     84
..   ...
105   43
106   43
107   42
108   42
109   38
The point is: where should I include the missing geocode information for Brazil?
Right after the code you identified, as part of the same if/elif branching, you could add an additional branch for all non-global and non-US regions.
if self.geo == '':
    self.interest_by_region_widget['request']['resolution'] = resolution
elif self.geo == 'US' and resolution in ['DMA', 'CITY', 'REGION']:
    self.interest_by_region_widget['request']['resolution'] = resolution
elif len(self.geo) == 2 and resolution in ['CITY', 'REGION']:
    self.interest_by_region_widget['request']['resolution'] = resolution
The condition on length 2 is a bit of a hack to identify countries. You could also get rid of the if condition and just always try to use the resolution.
self.interest_by_region_widget['request']['resolution'] = resolution
Some combinations are now invalid (REGION breakdown of a METRO), and Google Trends will fail for those. You would still need to be careful to handle those or only send valid combinations, but this would give you the freedom to do that.
Note that all of these changes are to the library code. To avoid modifying the library itself, you can create your own subclass of TrendReq and override the interest_by_region method with your own modified copy.
class MyTrendReq(TrendReq):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def interest_by_region(self, resolution='COUNTRY', inc_low_vol=False,
                           inc_geo_code=False):
        # Your modified copy goes here.
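If you do drop the if condition entirely, one way to stay safe is to screen geo/resolution combinations yourself before making the request. A minimal sketch (the allowed-resolution sets below are illustrative assumptions, not taken from the pytrends source):

# Hypothetical helper: reject invalid geo/resolution pairs before the request.
VALID_RESOLUTIONS = {
    '': {'COUNTRY', 'REGION', 'CITY', 'DMA'},   # worldwide
    'US': {'REGION', 'CITY', 'DMA'},            # per the branch quoted above
}

def check_resolution(geo, resolution):
    # Assume other two-letter country codes support CITY and REGION only.
    allowed = VALID_RESOLUTIONS.get(geo, {'REGION', 'CITY'})
    if resolution not in allowed:
        raise ValueError('resolution %r is not valid for geo %r' % (resolution, geo))
    return resolution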
There is a small bug in pytrends' source code. There are no geocodes associated with cities.
To fix the problem, change the line
df = df[['geoName', 'geoCode', 'value']].set_index(['geoName']).sort_index()
to
df = df[['geoName', 'coordinates', 'value']].set_index(['geoName']).sort_index()
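And if you call the method with inc_geo_code=True, you would need to mirror the change further down in the method body quoted in the question, since df no longer has a geoCode column after the fix (a sketch, assuming the CITY response carries a coordinates field instead):

if inc_geo_code:
    result_df['coordinates'] = df['coordinates']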

Getting TypeError: expected string or bytes-like object

I am working on a dataset of tweets, and I am trying to find the mentions of other users in a tweet; a tweet can have no users, a single user, or multiple users mentioned.
Here is the head of the DataFrame (screenshot omitted).
The following is the function that I created to extract the list of mentions in a tweet:
def getMention(text):
    mention = re.findall('(^|[^#\w])#(\w{1,15})', text)
    if len(mention) > 0:
        return [x[1] for x in mention]
    else:
        return None
I'm trying to create a new column in the DataFrame and apply the function with the following code:
df['mention'] = df['text'].apply(getMention)
On running this code I get the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-43-426da09a8770> in <module>
----> 1 df['mention'] = df['text'].apply(getMention)
~/anaconda3_501/lib/python3.6/site-packages/pandas/core/series.py in apply(self, func, convert_dtype, args, **kwds)
3192 else:
3193 values = self.astype(object).values
-> 3194 mapped = lib.map_infer(values, f, convert=convert_dtype)
3195
3196 if len(mapped) and isinstance(mapped[0], Series):
pandas/_libs/src/inference.pyx in pandas._libs.lib.map_infer()
<ipython-input-42-d27373022afd> in getMention(text)
1 def getMention(text):
2
----> 3 mention = re.findall('(^|[^#\w])#(\w{1,15})', text)
4 if len(mention) > 0:
5 return [x[1] for x in mention]
~/anaconda3_501/lib/python3.6/re.py in findall(pattern, string, flags)
220
221 Empty matches are included in the result."""
--> 222 return _compile(pattern, flags).findall(string)
223
224 def finditer(pattern, string, flags=0):
TypeError: expected string or bytes-like object
I can't comment (not enough rep), so here's what I suggest to troubleshoot the error.
It seems findall raises the exception because text is not a string, so you might want to check which type text actually is, using this:
def getMention(text):
    print(type(text))
    mention = re.findall(r'(^|[^#\w])#(\w{1,15})', text)
    if len(mention) > 0:
        return [x[1] for x in mention]
    else:
        return None
(or use the debugger, if you know how)
And if text can be converted to a string, maybe try this:
def getMention(text):
    mention = re.findall(r'(^|[^#\w])#(\w{1,15})', str(text))
    if len(mention) > 0:
        return [x[1] for x in mention]
    else:
        return None
P.S.: don't forget the r'...' in front of your regexp, so that special characters are not interpreted.
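For instance, if the non-string entries turn out to be NaN floats (a common cause with missing tweet text; this is an assumption, since the traceback doesn't show the offending value), you could normalise the column before applying the function:

# Hypothetical fix: replace missing tweets with empty strings so that
# re.findall always receives a str.
df['mention'] = df['text'].fillna('').apply(getMention)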

Unable to submit a post with PRAW API

I have this code:
import praw

print('starting')
reddit = praw.Reddit(client_id='****',
                     client_secret='********',
                     user_agent='****',
                     username='****',
                     password='****')
r = reddit.post("/api/submit", data={'title': 'my firts title', 'text': 'the text of my post', 'sr': 'r/test'})
print("finishing")
But it fails with this error:
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-19-7e66ffa81635> in <module>
9 password = '*****')
10
---> 11 r = reddit.post("/api/submit",data={'title':'my firts title','text':'the text of my post','sr':'r/test'})
12
13 print("finishing")
~\AppData\Local\Continuum\anaconda3\lib\site-packages\praw\reddit.py in post(self, path, data, files, params)
481 data = self.request('POST', path, data=data or {}, files=files,
482 params=params)
--> 483 return self._objector.objectify(data)
484
485 def put(self, path, data=None):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\praw\objector.py in objectify(self, data)
148 if len(errors) == 1:
149 raise APIException(*errors[0])
--> 150 assert not errors
151
152 elif isinstance(data, dict):
AssertionError:
and on some occasions the same code returns:
---------------------------------------------------------------------------
APIException Traceback (most recent call last)
<ipython-input-27-b62f9f5f585d> in <module>
9 password = '****')
10
---> 11 r = reddit.post("/api/submit",data={'title':'my firts title','text':'the text of my post','sr':'r/test'})
12
13 print("finishing")
~\AppData\Local\Continuum\anaconda3\lib\site-packages\praw\reddit.py in post(self, path, data, files, params)
481 data = self.request('POST', path, data=data or {}, files=files,
482 params=params)
--> 483 return self._objector.objectify(data)
484
485 def put(self, path, data=None):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\praw\objector.py in objectify(self, data)
147 errors = data['json']['errors']
148 if len(errors) == 1:
--> 149 raise APIException(*errors[0])
150 assert not errors
151
APIException: INVALID_OPTION: 'opci\xf3n inv\xe1lida' on field 'sr'
To be honest, I do not know what I am doing wrong. I suppose there is a better way to simply submit a post to Reddit, but the documentation is not very helpful.
You should do:
my_post = reddit.subreddit('subreddit').submit('My Title', selftext='Stuff you want to put in the textbox')
Note that the subreddit name shouldn't include the r/ prefix, as per:
https://praw.readthedocs.io/en/latest/code_overview/models/subreddit.html#praw.models.Subreddit.submit
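Put together with the credentials block from the question, a minimal sketch would look like this (the INVALID_OPTION error on field 'sr' above came from passing 'r/test' instead of 'test'):

import praw

reddit = praw.Reddit(client_id='****',
                     client_secret='********',
                     user_agent='****',
                     username='****',
                     password='****')

# 'test', not 'r/test': PRAW expects the bare subreddit name here.
submission = reddit.subreddit('test').submit('my first title',
                                             selftext='the text of my post')
print(submission.permalink)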

How to convert multiple CSV files to multiple tables in SQLite using Python 3?

I was trying to import multiple CSV files into an SQLite database as multiple tables (using a Jupyter notebook with Python 3). The name of each file will be the name of its table. I have defined a function to convert the encoding to UTF-8, as below:
import sqlite3
import glob
import csv
import os
import sys

def convert_to_utf8(dirname):
    for filename in glob.glob(os.path.join(dirname, '*.csv')):
        ifp = open(filename, "rt", encoding='cp1252')
        input_data = ifp.read()
        ifp.close()
        ofp = open(filename + ".fix", "wt", encoding='utf-8')
        for c in input_data:
            if c != '\0':
                ofp.write(c)
        ofp.close()
    return
All the files are in the same folder, and staging_dir_name_1 is where they live. I have the code below to convert the CSV files into tables (some of it is from similar questions on Stack Overflow):
convert_to_utf8(staging_dir_name_1)
conn = sqlite3.connect("medicare_hospital_compare_1.db")
c = conn.cursor()
for filename in glob.glob(os.path.join(staging_dir_name_1, '*.csv')):
    with open(filename, "rb") as f:
        data = csv.DictReader(f)
        cols = data.fieldnames
        tablename = os.path.splitext(os.path.basename(filename))[0]
        sql_str = "drop table if exists %s" % tablename
        c.execute(sql_str)
        sql_str = "create table if not exists %s (%s)" % (tablename, ','.join(["%s text" % col for col in cols]))
        c.execute(sql_str)
        sql_str = "insert into %s values (%s)" % (tablename, ','.join(["?" for col in cols]))
        c.executemany(sql_str, (list(map(row.get, cols)) for row in data))
conn.commit()
But when I run this I get this error:
Error                                     Traceback (most recent call last)
<ipython-input-29-be7c1f43e4c5> in <module>()
      2     with open(filename, "rb") as f:
      3         data = csv.DictReader(f)
----> 4         cols = data.fieldnames
      5         tablename = os.path.splitext(os.path.basename(filename))[0]
      6

C:\Users\dupin\Anaconda3\lib\csv.py in fieldnames(self)
     96         if self._fieldnames is None:
     97             try:
---> 98                 self._fieldnames = next(self.reader)
     99             except StopIteration:
    100                 pass

Error: iterator should return strings, not bytes (did you open the file in text mode?)
Could anyone help me on how to resolve this issue? I have been thinking about it for a while but still couldn't figure out how to resolve this.
===UPDATE===
Now I have changed 'rb' to 'rt', and I get a new error about NULL values. I think the first function should already have removed all the null values:
Error Traceback (most recent call last)
<ipython-input-77-68d56c0b4cf2> in <module>()
3
4 data = csv.DictReader(f)
----> 5 cols = data.fieldnames
6 table = os.path.splitext(os.path.basename(filename))[0]
7
C:\Users\dupin\Anaconda3\lib\csv.py in fieldnames(self)
96 if self._fieldnames is None:
97 try:
---> 98 self._fieldnames = next(self.reader)
99 except StopIteration:
100 pass
Error: line contains NULL byte
