KeyError: 1 when trying to do sentiment analysis in Python - python-3.x

This is the error info:
KeyError Traceback (most recent call last)
D:\python\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   3079         try:
-> 3080             return self._engine.get_loc(casted_key)
   3081         except KeyError as err:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 1
The above exception was the direct cause of the following exception:
And this is the code I use for practice, from GitHub:
import os
import nltk
import pandas as pd

#Define the main sentiment analysis function
def sentiment_check(file):
    with open(file, 'r') as myfile:
        file_content = myfile.read()
    #Tokenise the management discussion using NLTK
    file_content_tokenized = nltk.word_tokenize(file_content)
    #Create a frequency distribution table of word tokens
    freq = pd.Series(nltk.FreqDist(file_content_tokenized)).sort_values(ascending=False)
    #print('Most popular 10 stop words', freq.iloc[0:10])
    #print('fraction of total word count that are stop words:', freq.iloc[0:10].sum()/freq.sum())
    #The top 10 most common words have been identified as stop words.
    #These are words like: 'The', 'Ok', etc.
    stopwords = pd.Series(freq.iloc[0:10].index)
    #Remove stopwords
    file_content_tokenized = pd.Series([x for x in file_content_tokenized if x not in stopwords.values]).str.lower()
    #Load the Loughran and McDonald dictionaries
    #These dictionaries are specially used for textual analysis of financial statements
    #More details on this in the README.md
    pos = pd.read_csv('POSITIVE.txt', squeeze=True).str.lower()
    neg = pd.read_csv('NEGATIVE.txt', squeeze=True).str.lower()
    positive_words = file_content_tokenized.isin(pos).sum()
    negative_words = file_content_tokenized.isin(neg).sum()
    #Total positive & negative words in the statement
    #print("Positive Words:", positive_words)
    #print("Negative Words:", negative_words)
    sentiment_score = (positive_words - negative_words) / file_content_tokenized.count()
    print("for", file.rstrip('.txt'), "(positive words - negative words)/total words:", sentiment_score)
    print("for", file.rstrip('.txt'), "negative words/total words:", negative_words / file_content_tokenized.count())
    #print((positive_words - negative_words)/file_content_tokenized.count())
    nnn_words = pd.DataFrame(file_content_tokenized.isin(['no', 'not', 'never']))
    nnn_words = nnn_words[nnn_words.iloc[:, 0]]
    nnn_words['idx'] = nnn_words.index.values
    nnn_words['words'] = file_content_tokenized[nnn_words['idx']]
    pos_after_neg = nnn_words.apply(pos_after_negator, axis=1, args=(pos.values, file_content_tokenized)).dropna()
    print('+ve words after a negator:', pos_after_neg.values)
    print('')
    return sentiment_score

def pos_after_negator(row, pos, file_content_tokenized):
    #pos = pd.read_csv('LM_pos_words.txt', squeeze=True).str.lower()
    #print(row)
    string = row['words']
    #print(file_content_tokenized.get(row[1]+1, ''))
    string += ' ' + str(file_content_tokenized.get(row[1]+1, ''))
    if file_content_tokenized.get(row[1]+1, '') in pos:
        return string
    string += ' ' + str(file_content_tokenized.get(row[1]+2, ''))
    if file_content_tokenized.get(row[1]+2, '') in pos:
        return string
    string += ' ' + str(file_content_tokenized.get(row[1]+3, ''))
    if file_content_tokenized.get(row[1]+3, '') in pos:
        return string
    #print(string)
    return None

def driver():
    #I have extracted the Management Discussion section from the last 5 10-K annual reports and placed them in the data folder
    path = "D:\history data\Dissertation\MDA copy"
    files = [s for s in os.listdir(path) if s.endswith('.txt')]
    year = pd.Series([], dtype=pd.StringDtype())
    sentiment = pd.Series([], dtype=pd.StringDtype())
    for file in files:
        year = year.append(pd.Series([int(file.split('.')[0])]))
        sentiment = sentiment.append(pd.Series([sentiment_check(path + '\\' + file)]))
    return (year, sentiment)

#Run for the last 5 years
year, sentiment = driver()
I'm new to Python and this error has been bothering me for hours T_T Please help! I have no idea where this code goes wrong, so I have put all of my code here in case I missed the true cause. (Sorry for my messy formatting.)

Related

Pytrends is only implemented at city level for the USA (and without geocode)

I have been trying pytrends and I discovered that interest_by_region with city resolution is only implemented for the USA:
if self.geo == '':
    self.interest_by_region_widget['request'][
        'resolution'] = resolution
elif self.geo == 'US' and resolution in ['DMA', 'CITY', 'REGION']:
    self.interest_by_region_widget['request'][
        'resolution'] = resolution
I tried to discover what is missing in the code for other countries, but I was not able to find it. I only know, based on this piece of code above, that it only works for the USA. Furthermore, I know that I can specify the city level in Google Trends. Can anyone help me find the part of pytrends that I have to implement?
EDIT:
I implemented the suggestion of @mcskinner (+1), which really makes things simpler (but I got the same problem as with my hack). Now my code is:
import json
import pandas as pd
from pytrends.request import TrendReq
#from request import TrendReq

class MyTrendReq(TrendReq):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def interest_by_region(self, resolution='COUNTRY', inc_low_vol=False,
                           inc_geo_code=False):
        """Request data from Google's Interest by Region section and return a dataframe"""
        # make the request
        region_payload = dict()
        if self.geo == '':
            self.interest_by_region_widget['request']['resolution'] = resolution
        elif self.geo == 'US' and resolution in ['DMA', 'CITY', 'REGION']:
            self.interest_by_region_widget['request']['resolution'] = resolution
        elif len(self.geo) == 2 and resolution in ['CITY', 'REGION']:
            self.interest_by_region_widget['request']['resolution'] = resolution
        self.interest_by_region_widget['request'][
            'includeLowSearchVolumeGeos'] = inc_low_vol
        # convert to string as requests will mangle
        region_payload['req'] = json.dumps(
            self.interest_by_region_widget['request'])
        region_payload['token'] = self.interest_by_region_widget['token']
        region_payload['tz'] = self.tz
        # parse returned json
        req_json = self._get_data(
            url=TrendReq.INTEREST_BY_REGION_URL,
            method=TrendReq.GET_METHOD,
            trim_chars=5,
            params=region_payload,
        )
        df = pd.DataFrame(req_json['default']['geoMapData'])
        if df.empty:
            return df
        # rename the column with the search keyword
        df = df[['geoName', 'geoCode', 'value']].set_index(
            ['geoName']).sort_index()
        # split list columns into separate ones, remove brackets and split on comma
        result_df = df['value'].apply(lambda x: pd.Series(
            str(x).replace('[', '').replace(']', '').split(',')))
        if inc_geo_code:
            result_df['geoCode'] = df['geoCode']
        # rename each column with its search term
        for idx, kw in enumerate(self.kw_list):
            result_df[kw] = result_df[idx].astype('int')
            del result_df[idx]
        return result_df

#import pytrends
if __name__ == "__main__":
    pytrend = MyTrendReq()
    pytrend.build_payload(kw_list=['BMW'], geo='BR', timeframe='2019-03-01 2020-03-02')
    # df = pytrend.interest_by_region(resolution='REGION', inc_low_vol=True, inc_geo_code=True)
    df = pytrend.interest_by_region(resolution='CITY', inc_low_vol=True, inc_geo_code=True)
I got the following error (it seems that something is missing, but I am able to do this kind of search manually in Google Trends):
runfile('/home/daniel/Documents/caju/testingPytrendsStackoverflow.py', wdir='/home/daniel/Documents/caju')
Traceback (most recent call last):
File "<ipython-input-8-3a8c4f9b3a66>", line 1, in <module>
runfile('/home/daniel/Documents/caju/testingPytrendsStackoverflow.py', wdir='/home/daniel/Documents/caju')
File "/usr/lib/python3/dist-packages/spyder/utils/site/sitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "/usr/lib/python3/dist-packages/spyder/utils/site/sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "/home/daniel/Documents/caju/testingPytrendsStackoverflow.py", line 72, in <module>
df = pytrend.interest_by_region(resolution='CITY', inc_low_vol=True, inc_geo_code=True)
File "/home/daniel/Documents/caju/testingPytrendsStackoverflow.py", line 53, in interest_by_region
df = df[['geoName', 'geoCode', 'value']].set_index(
File "/home/daniel/.local/lib/python3.6/site-packages/pandas/core/frame.py", line 2986, in __getitem__
indexer = self.loc._convert_to_indexer(key, axis=1, raise_missing=True)
File "/home/daniel/.local/lib/python3.6/site-packages/pandas/core/indexing.py", line 1285, in _convert_to_indexer
return self._get_listlike_indexer(obj, axis, **kwargs)[1]
File "/home/daniel/.local/lib/python3.6/site-packages/pandas/core/indexing.py", line 1092, in _get_listlike_indexer
keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing
File "/home/daniel/.local/lib/python3.6/site-packages/pandas/core/indexing.py", line 1185, in _validate_read_indexer
raise KeyError("{} not in index".format(not_found))
KeyError: "['geoCode'] not in index"
If in my code I replace
df = pytrend.interest_by_region(resolution='CITY', inc_low_vol=True, inc_geo_code=True)
with
df = pytrend.interest_by_region(resolution='REGION', inc_low_vol=True, inc_geo_code=True)
it works.
EDIT 2:
@mcskinner is right.
If I set inc_geo_code=False and comment out
# df = df[['geoName', 'geoCode', 'value']].set_index(
#     ['geoName']).sort_index()
it works, but I lose the information of the city:
      BMW
0     100
1      90
2      88
3      88
4      84
..    ...
105    43
106    43
107    42
108    42
109    38
The point is: where should I include the missing geocode information for Brazil?
Right after the code you identified, as part of the same if/elif branching, you could add an additional branch for all non-global and non-US regions.
if self.geo == '':
    self.interest_by_region_widget['request']['resolution'] = resolution
elif self.geo == 'US' and resolution in ['DMA', 'CITY', 'REGION']:
    self.interest_by_region_widget['request']['resolution'] = resolution
elif len(self.geo) == 2 and resolution in ['CITY', 'REGION']:
    self.interest_by_region_widget['request']['resolution'] = resolution
The condition on length 2 is a bit of a hack to identify countries. You could also get rid of the if condition and just always try to use the resolution.
self.interest_by_region_widget['request']['resolution'] = resolution
Some combinations are now invalid (REGION breakdown of a METRO), and Google Trends will fail for those. You would still need to be careful to handle those or only send valid combinations, but this would give you the freedom to do that.
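For instance, a minimal guard along these lines could keep invalid combinations from ever reaching Google Trends. This is only a sketch: the whitelist below is my assumption about which combinations work, not something pytrends itself exposes.

def safe_resolution(geo, resolution):
    # Hypothetical whitelist of resolutions per geo type; adjust as needed.
    if geo == '':
        allowed = {'COUNTRY'}
    elif geo == 'US':
        allowed = {'DMA', 'CITY', 'REGION'}
    elif len(geo) == 2:  # two-letter country codes like 'BR'
        allowed = {'CITY', 'REGION'}
    else:
        allowed = set()
    return resolution if resolution in allowed else 'COUNTRY'

You could call safe_resolution(self.geo, resolution) inside your modified method before setting the request's resolution, so bad combinations degrade to a country-level request instead of failing.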
Note that all of these require modifying the library code. To do it yourself, you would want to create your own subclass of TrendReq and override the interest_by_region method with your own modified copy.
class MyTrendReq(TrendReq):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def interest_by_region(self, resolution='COUNTRY', inc_low_vol=False,
                           inc_geo_code=False):
        # Your modified copy goes here.
There is a small bug in pytrends' source code. There are no geocodes associated with cities.
To fix the problem, change the line
df = df[['geoName', 'geoCode', 'value']].set_index(['geoName']).sort_index()
to
df = df[['geoName', 'coordinates', 'value']].set_index(['geoName']).sort_index()
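If you would rather have one method that survives both cases, another option (a sketch, not pytrends' own code) is to select only the columns that actually came back, since the CITY response carries 'coordinates' while the REGION response carries 'geoCode':

# Keep only the columns present in this particular response, so CITY
# responses (which lack 'geoCode') no longer raise KeyError.
wanted = ['geoName', 'geoCode', 'coordinates', 'value']
cols = [c for c in wanted if c in df.columns]
df = df[cols].set_index(['geoName']).sort_index()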

pd.rename key KeyError: 'New_Name'

Edit 12/07/19: The problem was not in fact with the pd.rename function but with the fact that I did not return the pandas DataFrame from the function, and as a result the column change did not exist when printing, i.e.:
def change_column_names(as_pandas, old_name, new_name):
    as_pandas.rename(columns={old_name: new_name}, inplace=True)
    return as_pandas  # <- This was missing
Please see the user comment below and upvote it for finding this error for me.
Alternatively, you can continue reading.
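(For anyone skimming: the underlying gotcha is that with inplace=True, rename() mutates the DataFrame and returns None, so the function has to return the mutated frame itself. A minimal sketch of the two working idioms; the sample column name is taken from the dataset below.)

import pandas as pd

df = pd.DataFrame({'Unique Pageviews': [5608, 360]})

# Idiom 1: mutate in place, then return the same object.
def rename_inplace(as_pandas, old_name, new_name):
    as_pandas.rename(columns={old_name: new_name}, inplace=True)
    return as_pandas

# Idiom 2: skip inplace and return the new DataFrame that rename() produces.
def rename_copy(as_pandas, old_name, new_name):
    return as_pandas.rename(columns={old_name: new_name})

print(rename_inplace(df.copy(), 'Unique Pageviews', 'Page_Views').columns)
print(rename_copy(df, 'Unique Pageviews', 'Page_Views').columns)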
The data can be downloaded from this link, but I have added a sample dataset. The formatting of the file is not that of a typical CSV file, and I believe this may have been an assessment piece related to the Hidden Decision Tree article. I have given the portion of the code that handles the format of the text file as mentioned above and allows the user to rename the column.
The problem occurred when I tried to create a renaming function:
def change_column_names(as_pandas, old_name, new_name):
    as_pandas.rename(columns={old_name: new_name}, inplace=True)
However, it seems to work when I set the names inside the rename function:
def change_column_names(as_pandas):
    as_pandas.rename(columns={'Unique Pageviews': 'Page_Views'}, inplace=True)
    return as_pandas
Sample Dataset
Title URL Date Unique Pageviews
oupUrl=tutorials 18-Apr-15 5608
"An Exclusive Interview with Data Expert, John Bottega" http://www.datasciencecentral.com/forum/topics/an-exclusive-interview-with-data-expert-john-bottega?groupUrl=announcements 10-Jun-14 360
Announcing Composable Analytics http://www.datasciencecentral.com/forum/topics/announcing-composable-analytics 15-Jun-14 367
Announcing the release of Spark 1.5 http://www.datasciencecentral.com/forum/topics/announcing-the-release-of-spark-1-5 12-Sep-15 156
Are Extreme Weather Events More Frequent? The Data Science Answer http://www.datasciencecentral.com/forum/topics/are-extreme-weather-events-more-frequent-the-data-science-answer 5-Oct-15 204
Are you interested in joining the University of California for an empiricalstudy on 'Big Data'? http://www.datasciencecentral.com/forum/topics/are-you-interested-in-joining-the-university-of-california-for-an 7-Feb-13 204
Are you smart enough to work at Google? http://www.datasciencecentral.com/forum/topics/are-you-smart-enough-to-work-at-google 11-Oct-15 3625
"As a software engineer, what's the best skill set to have for the next 5-10years?" http://www.datasciencecentral.com/forum/topics/as-a-software-engineer-what-s-the-best-skill-set-to-have-for-the- 12-Feb-16 2815
A Statistician's View on Big Data and Data Science (Updated) http://www.datasciencecentral.com/forum/topics/a-statistician-s-view-on-big-data-and-data-science-updated-1 21-May-14 163
A synthetic variance designed for Hadoop and big data http://www.datasciencecentral.com/forum/topics/a-synthetic-variance-designed-for-hadoop-and-big-data?groupUrl=research 26-May-14 575
A Tough Calculus Question http://www.datasciencecentral.com/forum/topics/a-tough-calculus-question 10-Feb-16 937
Attribution Modeling: Key Analytical Strategy to Boost Marketing ROI http://www.datasciencecentral.com/forum/topics/attribution-modeling-key-concept 24-Oct-15 937
Audience expansion http://www.datasciencecentral.com/forum/topics/audience-expansion 6-May-13 223
Automatic use of insights http://www.datasciencecentral.com/forum/topics/automatic-use-of-insights 27-Aug-15 122
Average length of dissertations by higher education discipline. http://www.datasciencecentral.com/forum/topics/average-length-of-dissertations-by-higher-education-discipline 4-Jun-15 1303
This is the full code that produces the Key Error:
import csv
import pandas as pd

def change_column_names(as_pandas):
    as_pandas.rename(columns={'Unique Pageviews': 'Page_Views'}, inplace=True)

def change_column_names(as_pandas, old_name, new_name):
    as_pandas.rename(columns={old_name: new_name}, inplace=True)

def change_column_names(as_pandas):
    as_pandas.rename(columns={'Unique Pageviews': 'Page_Views'},
                     inplace=True)

def open_as_dataframe(file_name_in):
    reader = pd.read_csv(file_name_in, encoding='windows-1251')
    return reader

# Get each column of data including the heading and separate each element
# i.e. Title, URL, Date, Page Views
# and save to string_of_rows with comma separator for storage as a csv
# file.
def get_columns_of_data(*args):
    # Function that accepts variable-length arguments
    string_of_rows = str()
    num_cols = len(args)
    try:
        if num_cols > 0:
            for number, element in enumerate(args):
                if number == (num_cols - 1):
                    string_of_rows = string_of_rows + element + '\n'
                else:
                    string_of_rows = string_of_rows + element + ','
    except UnboundLocalError:
        print('Empty file \'or\' No arguments received, cannot be zero')
    return string_of_rows

def open_file(file_name):
    try:
        with open(file_name) as csv_file_in, open('HDT_data5.txt', 'w') as csv_file_out:
            csv_read = csv.reader(csv_file_in, delimiter='\t')
            for row in csv_read:
                try:
                    row[0] = row[0].replace(',', '')
                    csv_file_out.write(get_columns_of_data(*row))
                except TypeError:
                    continue
        print("The file name '{}' was successfully opened and read".format(file_name))
    except IOError:
        print('File not found \'OR\' Not in current directory\n')

# All acronyms used in variable naming correspond to the function at time
# of return from function.
# csv_list being a list of the csv file contents; the remainder i.e. 'st' of
# csv_list_st = split_title().
def main():
    open_file('HDTdata3.txt')
    multi_sets = open_as_dataframe('HDT_data5.txt')
    # change_column_names(multi_sets)
    change_column_names(multi_set, 'Old_Name', 'New_Name')
    print(multi_sets)

main()
I cleaned up your code so it would run. You were changing the column names but not returning the result. Try the following:
import pandas as pd
import numpy as np
import math

def set_new_columns(as_pandas):
    titles_list = ['Year > 2014', 'Forum', 'Blog', 'Python', 'R',
                   'Machine_Learning', 'Data_Science', 'Data',
                   'Analytics']
    for number, word in enumerate(titles_list):
        as_pandas.insert(len(as_pandas.columns), titles_list[number], 0)

def title_length(as_pandas):
    # Insert new column header then count the number of letters in 'Title'
    as_pandas.insert(len(as_pandas.columns), 'Title_Length', 0)
    as_pandas['Title_Length'] = as_pandas['Title'].map(str).apply(len)

# Although it is log, percentage of change is an inverse linear comparison
# of logX1 - logX2; therefore you could think of it as the percentage
# change in Page Views. The map function allows a function to be performed
# on all rows in column 'Page_Views'.
def log_page_view(as_pandas):
    # Insert new column header
    as_pandas.insert(len(as_pandas.columns), 'Log_Page_Views', 0)
    as_pandas['Log_Page_Views'] = as_pandas['Page_Views'].map(lambda x: math.log(1 + float(x)))

def change_to_numeric(as_pandas):
    # Check for missing values then convert the column to numeric.
    as_pandas = as_pandas.replace(r'^\s*$', np.nan, regex=True)
    as_pandas['Page_Views'] = pd.to_numeric(as_pandas['Page_Views'],
                                            errors='coerce')

def change_column_names(as_pandas):
    as_pandas.rename(columns={'Unique Pageviews': 'Page_Views'}, inplace=True)
    return as_pandas

def open_as_dataframe(file_name_in):
    reader = pd.read_csv(file_name_in, encoding='windows-1251')
    return reader

# Get each column of data including the heading and separate each element
# i.e. Title, URL, Date, Page Views
# and save to string_of_rows with comma separator for storage as a csv
# file.
def get_columns_of_data(*args):
    # Function that accepts variable-length arguments
    string_of_rows = str()
    num_cols = len(args)
    try:
        if num_cols > 0:
            for number, element in enumerate(args):
                if number == (num_cols - 1):
                    string_of_rows = string_of_rows + element + '\n'
                else:
                    string_of_rows = string_of_rows + element + ','
    except UnboundLocalError:
        print('Empty file \'or\' No arguments received, cannot be zero')
    return string_of_rows

def open_file(file_name):
    import csv
    try:
        with open(file_name) as csv_file_in, open('HDT_data5.txt', 'w') as csv_file_out:
            csv_read = csv.reader(csv_file_in, delimiter='\t')
            for row in csv_read:
                try:
                    row[0] = row[0].replace(',', '')
                    csv_file_out.write(get_columns_of_data(*row))
                except TypeError:
                    continue
        print("The file name '{}' was successfully opened and read".format(file_name))
    except IOError:
        print('File not found \'OR\' Not in current directory\n')

# All acronyms used in variable naming correspond to the function at time
# of return from function.
# csv_list being a list of the csv file contents; the remainder i.e. 'st' of
# csv_list_st = split_title().
def main():
    open_file('HDTdata3.txt')
    multi_sets = open_as_dataframe('HDT_data5.txt')
    multi_sets = change_column_names(multi_sets)
    change_to_numeric(multi_sets)
    log_page_view(multi_sets)
    title_length(multi_sets)
    set_new_columns(multi_sets)
    print(multi_sets)

main()

statistics.StatisticsError: no unique mode; found 2 equally common values

When I use a large amount of data I get this error: statistics.StatisticsError: no unique mode; found 2 equally common values. But with 100 rows of data it works. I can't understand the reason it doesn't work; can anyone help me solve this error, please?
Data link: https://github.com/YoeriNijs/TweetAnalyzer
Code:
import warnings
warnings.filterwarnings("ignore")
import nltk, random, csv, sys
from nltk.probability import FreqDist, ELEProbDist
from nltk.classify.util import apply_features, accuracy
from nltk.corpus import names
from nltk.tokenize import word_tokenize
import nltk.classify.util
from nltk import NaiveBayesClassifier
from textblob import TextBlob
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode

class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

def get_words_in_tweets(tweets):
    all_words = []
    try:
        for (words, sentiment) in tweets:
            all_words.extend(words)
        return all_words
    except Exception as e:
        print(e)

def get_word_features(wordlist):
    wordlist = FreqDist(wordlist)
    word_features = wordlist.keys()
    #print(word_features)
    return word_features

def selectTweets(row):
    tweetWords = []
    words = row[0].split()
    for i in words:
        i = i.lower()
        i = i.strip('##\'"?,.!')
        tweetWords.append(i)
    row[0] = tweetWords
    if counter <= 120:
        trainTweets.append(row)
        #print(trainTweets)
        #print(('*')*30)
    else:
        testTweets.append(row)
        #print(testTweets)

def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

trainTweets = []
testTweets = []
#csvfile.csv
while True:
    # Ask for filename
    filename = str(input("> Please enter a filename (.csv): "))
    # Check if filename ends with .csv
    if filename.endswith(".csv"):
        try:
            # Open file
            with open(filename, 'r', encoding='utf-8') as csvfile:
                reader = csv.reader(csvfile, delimiter=';', quotechar='|')
                # Print success message
                print("> File opened successfully!")
                counter = 0
                for row in reader:
                    selectTweets(row)
                    counter += 1
                print(counter, "> Wait a sec for the results...")
                word_features = get_word_features(get_words_in_tweets(trainTweets))
                training_set = apply_features(extract_features, trainTweets)
                test_training_set = apply_features(extract_features, testTweets)
                classifier = nltk.classify.NaiveBayesClassifier.train(training_set)
                classifier.show_most_informative_features(5)
                print(nltk.classify.util.accuracy(classifier, test_training_set))
                MNB_classifier = SklearnClassifier(MultinomialNB())
                MNB_classifier.train(training_set)
                print("MultinomialNB accuracy percent:", nltk.classify.accuracy(MNB_classifier, test_training_set))
                BNB_classifier = SklearnClassifier(BernoulliNB())
                BNB_classifier.train(training_set)
                print("BernoulliNB accuracy percent:", nltk.classify.accuracy(BNB_classifier, test_training_set))
                LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
                LogisticRegression_classifier.train(training_set)
                print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, test_training_set))*100)
                SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
                SGDClassifier_classifier.train(training_set)
                print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, test_training_set))*100)
                SVC_classifier = SklearnClassifier(SVC())
                SVC_classifier.train(training_set)
                print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, test_training_set))*100)
                LinearSVC_classifier = SklearnClassifier(LinearSVC())
                LinearSVC_classifier.train(training_set)
                print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, test_training_set))*100)
                voted_classifier = VoteClassifier(classifier,
                                                  LinearSVC_classifier,
                                                  SGDClassifier_classifier,
                                                  MNB_classifier,
                                                  BNB_classifier,
                                                  LogisticRegression_classifier)
                print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, test_training_set))*100)
                while True:
                    tweet = str(input("Please enter the text of the tweet you want to analyze: "))
                    print(classifier.classify(extract_features(tweet.split())))
                    while True:
                        print()
                        repeat = str(input("> Do you want to check another tweet (y/n)? "))
                        if repeat == "n":
                            print("Exit program")
                            sys.exit()
                        if repeat != "y":
                            print("Something went wrong")
                        if repeat == "y":
                            break
        # If file does not exist, display this
        except IOError:
            print("File does not exist.")
    # Else if file does not end with .csv, do this
    else:
        print("Please open a file that ends with .csv")
It shows this error:
Traceback (most recent call last):
File "C:\Users\Nahid\Desktop\main folder\newcheck.py", line 163, in <module>
print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier,test_training_set ))*100)
File "C:\Users\Nahid\AppData\Local\Programs\Python\Python36-32\lib\site-packages\nltk\classify\util.py", line 87, in accuracy
results = classifier.classify_many([fs for (fs, l) in gold])
File "C:\Users\Nahid\AppData\Local\Programs\Python\Python36-32\lib\site-packages\nltk\classify\api.py", line 77, in classify_many
return [self.classify(fs) for fs in featuresets]
File "C:\Users\Nahid\AppData\Local\Programs\Python\Python36-32\lib\site-packages\nltk\classify\api.py", line 77, in <listcomp>
return [self.classify(fs) for fs in featuresets]
File "C:\Users\Nahid\Desktop\main folder\newcheck.py", line 35, in classify
return mode(votes)
File "C:\Users\Nahid\AppData\Local\Programs\Python\Python36-32\lib\statistics.py", line 507, in mode
'no unique mode; found %d equally common values' % len(table)
statistics.StatisticsError: no unique mode; found 2 equally common values
The easiest way to solve this is to upgrade Python to 3.8 or higher.
In Python versions 3.7 and older there can be only a single value that occurs the most times in the whole set. If a set contains two or more such values, mode becomes inconclusive and raises the exact error you got.
Since version 3.8, however, the behaviour changed: when there are two or more modes in a set, the first mode encountered in the data is returned as the result.
Example:
result = statistics.mode([1, 1, 2, 2, 3, 3])
has three possible and equally common values: 1, 2, and 3, as each number occurs twice in the set;
in Python 3.7 this raises StatisticsError,
in Python 3.8 this returns 1 as the mode.
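If upgrading is not an option, a tie-tolerant vote count can be written with collections.Counter and used in VoteClassifier.classify in place of mode(votes). A minimal sketch, assuming that breaking ties by taking the first of the most common labels is acceptable for your voting scheme:

from collections import Counter

def majority(votes):
    # most_common(1) never raises on ties; it simply picks one winner
    # (the label seen first among the tied ones).
    return Counter(votes).most_common(1)[0][0]

print(majority(['pos', 'neg', 'pos', 'neg']))  # tie -> 'pos'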

Python: what to fix in the following code to make it run?

I have the following code where I am facing an error, and I am unable to identify the actual issue. The code takes a .json file which holds words and their meanings, and finds the exact or nearest matches of the words given as input by the user, along with their meanings. The code was running fine until I tried to modify it a little: I wanted to also match words whose first letter is capitalized, in the following line, after which it started throwing an exception:
Changed line:
if (word != "") and ((word in data.keys()) or (word.capitalize() in data.keys())):
Code:
import json
import difflib

def searchWord(word):
    if (word != "") and ((word in data.keys()) or (word.capitalize() in data.keys())):
        return data[word]
    else:
        closematch = difflib.get_close_matches(word, data.keys())[0]
        confirmation = (input(f"\nDid you mean: {closematch} (y/n): ")).lower()
        if confirmation == 'y':
            return data[closematch]
        else:
            return 'Word Not Found in Dictionary'

print('Loading Data...\n')
data = json.load(open('data.json'))
print('Data Loaded!\n')

word = (input('Enter word to lookup in dictionary: ')).lower()
meanings = searchWord(word)
if meanings == list:
    for meaning in meanings:
        print("\n" + meaning)
else:
    print(meanings[0])
Error:
Loading Data...
Data Loaded!
Enter word to lookup in dictionary: delhi
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
E:\Learning\Python\AdvancedPython\PythonMegaCourse\DictionaryApp\dictionary.py in <module>()
20 word = (input('Enter word to lookup in dictionary: ')).lower()
21
---> 22 meanings = searchWord(word)
23 if meanings == list:
24 for meaning in meanings:
E:\Learning\Python\AdvancedPython\PythonMegaCourse\DictionaryApp\dictionary.py in searchWord(word)
4 def searchWord(word):
5 if (word != "") and ((word in data.keys()) or (word.capitalize() in data.keys())):
----> 6 return data[word]
7 else:
8 closematch = difflib.get_close_matches(word,data.keys())[0]
KeyError: 'delhi'
The .json file does have a key named Delhi; however, capitalize() doesn't seem to work.
When you try to access the word in the dictionary, you are not capitalizing it. This is not a clean way to handle it, but it gives you the idea:
if (word != "") and (word in data.keys()):
return data[word]
if (word != "") and (word.capitalize() in data.keys()):
return data[word.capitalize()]
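A slightly tidier variant of the same idea (just a sketch, reusing data and difflib from the question) resolves the matching key once and indexes with it, so the membership test and the lookup can never disagree:

def searchWord(word):
    # Try the word as typed, then with its first letter capitalized.
    for candidate in (word, word.capitalize()):
        if candidate in data:
            return data[candidate]
    closematch = difflib.get_close_matches(word, data.keys())[0]
    confirmation = input(f"\nDid you mean: {closematch} (y/n): ").lower()
    if confirmation == 'y':
        return data[closematch]
    return 'Word Not Found in Dictionary'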

ZipFile cracker works with a few combinations and crashes when many combinations are used

I've found many examples of zip crackers written in Python, but unfortunately they were either written in Python 2 or had functionality I do not need (i.e. use of dictionaries saved in files). For me it was interesting to check how long and how much memory it would take to break, let's say, a password of 5-10 different symbols (a-z, a-zA-Z, a-zA-Z1-10, etc.). Afterwards I can try different libraries and techniques (threads etc.) to improve the performance of the code and, hopefully, get a better understanding of Python mechanics in the process.
Here is my program. It works well when the program tries 2-position passwords (a-zA-Z) and crashes with longer passwords.
import os
import shutil
import zipfile
from itertools import permutations, combinations_with_replacement
from string import ascii_letters

#passgen() yields passwords to be checked
def passgen(passminlength, passmaxlength, searchdict):
    prevpwd = []
    for n in range(passminlength, passmaxlength):
        for p in combinations_with_replacement(searchdict, n):
            for k in permutations(p, n):
                pwd_tmp = ''.join(k)
                if prevpwd != pwd_tmp:   #without this check passgen() yields
                    prevpwd = pwd_tmp    #recurring password combinations
                    yield pwd_tmp        #due to the logic behind permutations()

if __name__ == '__main__':
    zFile = zipfile.ZipFile("secret.zip", 'r')  #encrypted file to crack
    pwd = None  #password to find
    output_directory = os.path.curdir + "/unzip_tmp"  #output tmpfolder for extracted files
    if not os.path.isdir(output_directory):  #if it exists - delete, otherwise - create
        os.makedirs('unzip_tmp')
    else:
        shutil.rmtree(output_directory)
        os.makedirs('unzip_tmp')
    searchdict = list(ascii_letters)  #list with symbols for brute force: a-zA-Z
    passminlength = 1
    passmaxlength = 3  #code works with passmaxlength=2, doesn't - with passmaxlength=3
    pwd_tmp = passgen(passminlength, passmaxlength, searchdict)  #pwd_tmp is an iterator
    while True:
        try:
            tmp = next(pwd_tmp)
        except StopIteration:  #iterator is empty -> quit while-loop
            break
        print("trying..:%s" % tmp)
        zFile.setpassword(bytes(tmp, 'ascii'))
        try:
            zFile.extractall(output_directory)
            pwd = tmp
            break  #password is found -> quit while-loop
        except RuntimeError:  #checked password is wrong -> go again through while-loop
            print("wrong password:%s" % tmp)
    print("password is:%s" % pwd)
The program crashes with passmaxlength=3 on the line with zFile.extractall(output_directory).
The error is:
Traceback (most recent call last):
File "C:\Experiments\Eclipse\Workspace\messingaroundpython\zipcracker.py", lin
e 48, in <module>
zFile.extractall(output_directory)
File "C:\Python34\lib\zipfile.py", line 1240, in extractall
self.extract(zipinfo, path, pwd)
File "C:\Python34\lib\zipfile.py", line 1228, in extract
return self._extract_member(member, path, pwd)
File "C:\Python34\lib\zipfile.py", line 1292, in _extract_member
shutil.copyfileobj(source, target)
File "C:\Python34\lib\shutil.py", line 67, in copyfileobj
buf = fsrc.read(length)
File "C:\Python34\lib\zipfile.py", line 763, in read
data = self._read1(n)
File "C:\Python34\lib\zipfile.py", line 839, in _read1
data = self._decompressor.decompress(data, n)
zlib.error: Error -3 while decompressing data: invalid distance too far back
I am stuck. Any idea what I could be missing?
Update: It seems to be a bug in zlib. I added an exception catcher to ignore bad combinations:
except RuntimeError:  #checked password is wrong -> go again through while-loop
    print("wrong password:%s" % tmp)
except Exception as err:
    print("zlib bug, wrong password?:%s" % tmp)
Now the program can process much longer passwords.
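For what it's worth, this failure mode is plausible for wrong passwords rather than a zlib bug: ZIP's legacy encryption verifies only a single check byte, so roughly 1 in 256 wrong passwords slips past zipfile's RuntimeError check and feeds garbage to the decompressor, which then raises zlib.error. A narrower catch than a bare Exception (a sketch of the same loop body) would be:

import zlib

try:
    zFile.extractall(output_directory)
    pwd = tmp
    break  #password is found -> quit while-loop
except RuntimeError:  #zipfile's own password check failed
    print("wrong password:%s" % tmp)
except (zlib.error, zipfile.BadZipFile):  #check byte collided; data is garbage
    print("false positive, wrong password?:%s" % tmp)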
