I'm having trouble with language detection. The code below raises a LangDetectException:
from langdetect import detect

for row in df['Comments']:
    text = str(row)
    language_code = detect(text)
    sentence = [all_languages_codes.get(language_code)]
    df['Language'] = sentence[0]
Error Message:
148 ngrams = self._extract_ngrams()
149 if not ngrams:
--> 150 raise LangDetectException(ErrorCode.CantDetectError, 'No features in text.')
151
152 self.langprob = [0.0] * len(self.langlist)
LangDetectException: No features in text.
How can I print out the row that causes the LangDetectException?
It looks like one of your Comments strings is empty:
detect("")
LangDetectException: No features in text.
To know for sure, you can launch a debugger or interactive shell: wrap the code in a try/except block and start the debugger when the exception is raised:
from langdetect import detect

for row in df['Comments']:
    try:
        text = str(row)
        language_code = detect(text)
        sentence = [all_languages_codes.get(language_code)]
        df['Language'] = sentence[0]
    except Exception:
        import ipdb; ipdb.set_trace()
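If you only want to print the offending rows rather than drop into a debugger, a minimal sketch catching langdetect's own exception class (assuming df and all_languages_codes are set up as in your code) could look like this:

from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

for row in df['Comments']:
    try:
        detect(str(row))
    except LangDetectException:
        # print the raw value that langdetect could not handle
        print("Offending row:", repr(row))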
Related
I want to prepare the text in a column for natural language processing. There are approximately 3000+ books in the "TEXT" column, one book per row, so every row holds a huge amount of text. When I apply the code below I get the error shown beneath it.
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = []
for i in range(len(dt)):
    review = re.sub('[^a-zA-Z0-9]', ' ', dt['TEXT'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)
I am getting the following error:
TypeError Traceback (most recent call last)
<ipython-input-16-47569f8727fa> in <module>
6 corpus = []
7 for i in range(1000,2000):
----> 8 review = re.sub('[^a-zA-Z0-9]', ' ', dt['TEXT'][i])
9 review = review.lower()
10 review = review.split()
~\anaconda3\lib\re.py in sub(pattern, repl, string, count, flags)
190 a callable, it's passed the Match object and must return
191 a replacement string to be used."""
--> 192 return _compile(pattern, flags).sub(repl, string, count)
193
194 def subn(pattern, repl, string, count=0, flags=0):
TypeError: expected string or bytes-like object
This means that your DataFrame column 'TEXT' contains values that are not strings.
You can do this instead:
for i in range(len(df)):
    try:
        re.sub('[^a-zA-Z0-9]', ' ', df['TEXT'][i])
        # the rest of your code ...
    except TypeError:
        pass
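Alternatively, if you would rather keep those rows than skip them, one hedged option is to coerce the whole column to strings up front (assuming dt is the DataFrame from the question; note that missing values become the literal string 'nan'):

# Coerce every value in the column to str so re.sub never
# sees a non-string; NaN becomes the string 'nan'.
dt['TEXT'] = dt['TEXT'].astype(str)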
I'm trying to make a program that grabs a random word from a JSON file and prints it and its definition using PyDictionary. It works occasionally, but I think the issue is displaying the output of dictionary.meaning(word) when the word has multiple meanings; I get an IndexError when that appears to be the case.
Example outputs:
Expected: tinamidae Noun ['comprising the tinamous']
Unwanted result: unmaterially Error: The Following Error occured: list index out of range No definition found!
import json
import random
from PyDictionary import PyDictionary

dictionary = PyDictionary()

with open('C:\\Users\\jabes\\Desktop\\words_dictionary.json') as json_file:
    words = json.load(json_file)

word = random.choice(list(words.keys()))
print(word)

try:
    meanings = dictionary.meaning(word)
    if meanings:
        for k, v in meanings.items():
            print(k, v)
    else:
        print("No definition found!")
except Exception as error:
    print(error)

print("Exiting!")
I was running pytest with someone else's exception-handling library. It was supposed to run on an older version of Python, I'm not sure which one. However, when I run it with Python 3 it produces an error I don't understand, and for some reason I have trouble finding the meaning of the error's keyword (odict_keys) on the web.
The following is the result from pytest. The exception handling inside the test_analysis procedure was calling run_with_timeout(timeoutwrapper_analysis, max_seconds_per_call, (), {}) before the error occurred. Inside run_with_timeout, the error happened when it raised e as an exception:
@pytest.mark.parametrize("inputs,outputs,description", portfolio_test_cases)
def test_analysis(inputs, outputs, description, grader):
    """Test get_portfolio_value() and get_portfolio_stats() return correct values.
    Requires test inputs, expected outputs, description, and a grader fixture.
    """
    points_earned = 0.0  # initialize points for this test case
    try:
        # Try to import student code (only once)
        if not main_code in globals():
            import importlib
            # * Import module
            mod = importlib.import_module(main_code)
            globals()[main_code] = mod

        # Unpack test case
        start_date_str = inputs['start_date'].split('-')
        start_date = datetime.datetime(int(start_date_str[0]), int(start_date_str[1]), int(start_date_str[2]))
        end_date_str = inputs['end_date'].split('-')
        end_date = datetime.datetime(int(end_date_str[0]), int(end_date_str[1]), int(end_date_str[2]))
        symbols = inputs['symbol_allocs'].keys()  # e.g.: ['GOOG', 'AAPL', 'GLD', 'XOM']
        allocs = inputs['symbol_allocs'].values()  # e.g.: [0.2, 0.3, 0.4, 0.1]
        start_val = inputs['start_val']
        risk_free_rate = inputs.get('risk_free_rate', 0.0)

        # the wonky unpacking here is so that we only pull out the values we say we'll test.
        def timeoutwrapper_analysis():
            student_rv = analysis.assess_portfolio(
                sd=start_date, ed=end_date,
                syms=symbols,
                allocs=allocs,
                sv=start_val, rfr=risk_free_rate, sf=252.0,
                gen_plot=False)
            return student_rv

        # The error happens in the following line:
        result = run_with_timeout(timeoutwrapper_analysis, max_seconds_per_call, (), {})
grade_analysis.py:176:
func = <function test_analysis.<locals>.timeoutwrapper_analysis at 0x7f8c458347b8>, timeout_seconds = 5, pos_args = (), keyword_args = {}
def run_with_timeout(func, timeout_seconds, pos_args, keyword_args):
    rv_dict = timeout_manager.dict()
    p = multiprocessing.Process(target=proc_wrapper, args=(func, rv_dict, pos_args, keyword_args))
    p.start()
    p.join(timeout_seconds)
    if p.is_alive():
        p.terminate()
        raise TimeoutException("Exceeded time limit!")
    if not ('output' in rv_dict):
        if 'exception' in rv_dict:
            e = rv_dict['exception']
            e.grading_traceback = None
            if 'traceback' in rv_dict:
                e.grading_traceback = rv_dict['traceback']
            # The error occurred at the following line:
            raise e
E TypeError: can only concatenate list (not "odict_keys") to list
grading.py:134: TypeError
It looks like the script didn't like the
raise e
statement. What does that have to do with odict_keys?
Regards
Two of the inputs to analysis.assess_portfolio, symbols and allocs, arrive as dictionary views (odict_keys and odict_values). In Python 2.7, dict.keys() and dict.values() returned plain lists, so this worked; in Python 3 they return view objects, which cannot be concatenated to a list. They need to be converted to lists, so changing the input statements from

symbols = inputs['symbol_allocs'].keys()
allocs = inputs['symbol_allocs'].values()

to

symbols = list(inputs['symbol_allocs'].keys())
allocs = list(inputs['symbol_allocs'].values())

fixed it.
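For illustration, a minimal reproduction of the Python 3 behavior behind the TypeError (the symbol names here are made up):

from collections import OrderedDict

d = OrderedDict([('GOOG', 0.2), ('AAPL', 0.8)])
['SPY'] + list(d.keys())  # works: ['SPY', 'GOOG', 'AAPL']
['SPY'] + d.keys()        # TypeError: can only concatenate list (not "odict_keys") to list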
When I use a large amount of data I get this error: statistics.StatisticsError: no unique mode; found 2 equally common values. But with 100 rows of data it works. I can't understand the reason it doesn't work; can anyone help me solve this error?
Data link: https://github.com/YoeriNijs/TweetAnalyzer
Code:
import warnings
warnings.filterwarnings("ignore")
import nltk, random, csv, sys
from nltk.probability import FreqDist, ELEProbDist
from nltk.classify.util import apply_features, accuracy
from nltk.corpus import names
from nltk.tokenize import word_tokenize
import nltk.classify.util
from nltk import NaiveBayesClassifier
from textblob import TextBlob
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode

class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

def get_words_in_tweets(tweets):
    all_words = []
    try:
        for (words, sentiment) in tweets:
            all_words.extend(words)
        return all_words
    except Exception as e:
        print(e)

def get_word_features(wordlist):
    wordlist = FreqDist(wordlist)
    word_features = wordlist.keys()
    #print(word_features)
    return word_features

def selectTweets(row):
    tweetWords = []
    words = row[0].split()
    for i in words:
        i = i.lower()
        i = i.strip('##\'"?,.!')
        tweetWords.append(i)
    row[0] = tweetWords
    if counter <= 120:
        trainTweets.append(row)
        #print(trainTweets)
        #print(('*')*30)
    else:
        testTweets.append(row)
        #print(testTweets)

def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

trainTweets = []
testTweets = []

#csvfile.csv
while True:
    # Ask for filename
    filename = str(input("> Please enter a filename (.csv): "))
    # Check if filename ends with .csv
    if filename.endswith(".csv"):
        try:
            # Open file
            with open(filename, 'r', encoding='utf-8') as csvfile:
                reader = csv.reader(csvfile, delimiter=';', quotechar='|')
                # Print success message
                print("> File opened successfully!")
                counter = 0
                for row in reader:
                    selectTweets(row)
                    counter += 1
                print(counter, "> Wait a sec for the results...")
                word_features = get_word_features(get_words_in_tweets(trainTweets))
                training_set = apply_features(extract_features, trainTweets)
                test_training_set = apply_features(extract_features, testTweets)
                classifier = nltk.classify.NaiveBayesClassifier.train(training_set)
                classifier.show_most_informative_features(5)
                print(nltk.classify.util.accuracy(classifier, test_training_set))
                MNB_classifier = SklearnClassifier(MultinomialNB())
                MNB_classifier.train(training_set)
                print("MultinomialNB accuracy percent:", nltk.classify.accuracy(MNB_classifier, test_training_set))
                BNB_classifier = SklearnClassifier(BernoulliNB())
                BNB_classifier.train(training_set)
                print("BernoulliNB accuracy percent:", nltk.classify.accuracy(BNB_classifier, test_training_set))
                LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
                LogisticRegression_classifier.train(training_set)
                print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, test_training_set)) * 100)
                SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
                SGDClassifier_classifier.train(training_set)
                print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, test_training_set)) * 100)
                SVC_classifier = SklearnClassifier(SVC())
                SVC_classifier.train(training_set)
                print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, test_training_set)) * 100)
                LinearSVC_classifier = SklearnClassifier(LinearSVC())
                LinearSVC_classifier.train(training_set)
                print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, test_training_set)) * 100)
                voted_classifier = VoteClassifier(classifier,
                                                  LinearSVC_classifier,
                                                  SGDClassifier_classifier,
                                                  MNB_classifier,
                                                  BNB_classifier,
                                                  LogisticRegression_classifier)
                print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, test_training_set)) * 100)
                while True:
                    tweet = str(input("Please enter the text of the tweet you want to analyze: "))
                    print(classifier.classify(extract_features(tweet.split())))
                    while True:
                        print()
                        repeat = str(input("> Do you want to check another tweet (y/n)? "))
                        if repeat == "n":
                            print("Exit program")
                            sys.exit()
                        if repeat != "y":
                            print("Something went wrong")
                        if repeat == "y":
                            break
        # If file does not exist, display this
        except IOError:
            print("File does not exist.")
    # Else if file does not end with .csv, do this
    else:
        print("Please open a file that ends with .csv")
It shows this error:
Traceback (most recent call last):
File "C:\Users\Nahid\Desktop\main folder\newcheck.py", line 163, in <module>
print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier,test_training_set ))*100)
File "C:\Users\Nahid\AppData\Local\Programs\Python\Python36-32\lib\site-packages\nltk\classify\util.py", line 87, in accuracy
results = classifier.classify_many([fs for (fs, l) in gold])
File "C:\Users\Nahid\AppData\Local\Programs\Python\Python36-32\lib\site-packages\nltk\classify\api.py", line 77, in classify_many
return [self.classify(fs) for fs in featuresets]
File "C:\Users\Nahid\AppData\Local\Programs\Python\Python36-32\lib\site-packages\nltk\classify\api.py", line 77, in <listcomp>
return [self.classify(fs) for fs in featuresets]
File "C:\Users\Nahid\Desktop\main folder\newcheck.py", line 35, in classify
return mode(votes)
File "C:\Users\Nahid\AppData\Local\Programs\Python\Python36-32\lib\statistics.py", line 507, in mode
'no unique mode; found %d equally common values' % len(table)
statistics.StatisticsError: no unique mode; found 2 equally common values
The easiest way to solve this is to upgrade Python to 3.8 or higher.
In Python 3.7 and older, there can be only a single value that occurs most often in the whole set. If a set contains two or more such values, the mode is inconclusive and statistics.mode raises exactly the error you got.
Since version 3.8 the behavior has changed: when there are two or more equally common values in a set, statistics.mode returns the first one encountered in the data instead of raising.
Example:

result = statistics.mode([1, 1, 2, 2, 3, 3])

has three possible and equally common values, since 1, 2, and 3 each occur twice in the set.

In Python 3.7 this raises StatisticsError.
In Python 3.8 this returns 1 (the first mode encountered).
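If upgrading is not an option, one hedged alternative for the VoteClassifier above is to pick the most common vote with collections.Counter, which never raises on ties:

from collections import Counter

def majority_vote(votes):
    # most_common(1) returns [(value, count)]; on a tie it keeps the
    # first value encountered instead of raising like statistics.mode().
    return Counter(votes).most_common(1)[0][0]

Using return majority_vote(votes) instead of return mode(votes) in VoteClassifier.classify would avoid the StatisticsError on older Python versions.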
I'm trying to append to my Vertica (SQL) table from pandas using SQLAlchemy:
import pandas as pd
import sqlalchemy as sa
Create an engine for Vertica:
def get_engine(base):
    # .format must apply to the full URL string, and the separator
    # before the host in an SQLAlchemy connection URL is '@'
    engine = sa.create_engine(
        "{sys}+{dri}://{user}:{password}@{host}:{port}/{database}".format(**login[base]))
    return engine

engine = get_engine('vertica')
Just for clarity, a simple query:

table = '***'
sql = '''
    select *
    from public.{table}
'''.format(table=table)

connection = engine.connect()
data = pd.read_sql(sql, connection)
connection.close()
Data is not empty:
print(len(data))
569955
And try to write to the same table:
fields = list(data.columns)
connection = engine.connect()
data.to_sql(table, connection, schema='public', index=False, if_exists='append', chunksize=30000,
            dtype={fields[0]: sa.types.Integer,
                   fields[1]: sa.types.VARCHAR,
                   fields[2]: sa.types.Integer,
                   fields[3]: sa.types.Integer,
                   fields[4]: sa.types.Integer,
                   fields[5]: sa.types.VARCHAR,
                   fields[6]: sa.types.VARCHAR,
                   fields[7]: sa.types.VARCHAR,
                   fields[8]: sa.types.VARCHAR,
                   fields[9]: sa.types.VARCHAR,
                   fields[10]: sa.types.VARCHAR,
                   fields[11]: sa.types.VARCHAR,
                   fields[12]: sa.types.DateTime})
connection.close()
And I get this error:
...
\Anaconda3\lib\site-packages\sqlalchemy\engine\default.py in do_executemany(self, cursor, statement, parameters, context)
465
466 def do_executemany(self, cursor, statement, parameters, context=None):
--> 467 cursor.executemany(statement, parameters)
468
469 def do_execute(self, cursor, statement, parameters, context=None):
\Anaconda3\lib\site-packages\vertica_python\vertica\cursor.py in executemany(self, operation, seq_of_parameters)
153 else:
154 raise NotImplementedError(
--> 155 "executemany is implemented for simple INSERT statements only")
156
157 def fetchone(self):
NotImplementedError: executemany is implemented for simple INSERT statements only
I got the same error when I was trying to write my data to Vertica using SQLAlchemy. In my case the issue was the column names: it seems it can't write column names that include special characters. I fixed the error by removing all the '_', '%', and whitespace characters from the column names in pandas, and then df.to_sql() wrote to Vertica successfully.
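A minimal sketch of that cleanup, assuming data is the DataFrame from the question (the set of characters to strip is taken from the answer above; adjust it to your own schema):

import re

# Remove underscores, percent signs, and whitespace from every column
# name before calling to_sql, since those appeared to break the insert.
data.columns = [re.sub(r'[_%\s]', '', col) for col in data.columns]
data.to_sql(table, connection, schema='public', index=False,
            if_exists='append', chunksize=30000)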