TypeError: Image data cannot be converted to float: where is my code going wrong? - word-cloud

def calculate_frequencies(file_contents):
    # Here is a list of punctuations and uninteresting words you can use to process your text
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    uninteresting_words = ["the", "a", "to", "if", "is", "it", "of", "and", "or", "an", "as", "i", "me", "my", \
    "we", "our", "ours", "you", "your", "yours", "he", "she", "him", "his", "her", "hers", "its", "they", "them", \
    "their", "what", "which", "who", "whom", "this", "that", "am", "are", "was", "were", "be", "been", "being", \
    "have", "has", "had", "do", "does", "did", "but", "at", "by", "with", "from", "here", "when", "where", "how", \
    "all", "any", "both", "each", "few", "more", "some", "such", "no", "nor", "too", "very", "can", "will", "just"]
    # LEARNER CODE START HERE
    dict1 = []
    d = {}
    for words in file_contents.split():
        if words.isalpha() and words.lower() not in uninteresting_words:
            dict1.append(words.lower())
    for words in dict1:
        if words not in d:
            d[words] = 0
            d[words] += file_contents.split().count(words)
    return d
    #wordcloud
    cloud = WordCloud(width=900, height=500, max_words=1628, relative_scaling=1, normalize_plurals=False)
    cloud.generate_from_frequencies(calculate_frequencies)
    return cloud.to_array()
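A note on where this goes wrong: the TypeError in the title is what matplotlib's imshow raises when it is handed something that is not a numeric image array. There are two likely culprits here: return d exits the function before the word cloud lines ever run (so the caller gets a dict, and imshow chokes on it), and generate_from_frequencies(calculate_frequencies) passes the function object itself instead of a frequency dict. A minimal sketch of the corrected tail of the function, assuming the cloud is meant to be built inside it as in the answers below:
    # ...still inside calculate_frequencies, after d has been built;
    # drop the early "return d" and pass the dict, not the function:
    cloud = WordCloud(width=900, height=500, max_words=1628,
                      relative_scaling=1, normalize_plurals=False)
    cloud.generate_from_frequencies(d)
    return cloud.to_array()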

# LEARNER CODE START HERE
words = file_contents.split()
words_list = []
for word in words:
    # strip non-alphabetic characters (note letter.isalpha(), not word.isalpha())
    word = ''.join(letter for letter in word if letter.isalpha())
    # test membership in the whole stop list, not identity against each element
    if word != "" and word.lower() not in uninteresting_words:
        words_list.append(word.lower())
words_dict = {}
for word in words_list:
    if word not in words_dict:
        words_dict[word] = words_list.count(word)
#wordcloud
cloud = wordcloud.WordCloud()
cloud.generate_from_frequencies(words_dict)
return cloud.to_array()

Here is a shorter answer:
# LEARNER CODE START HERE
for s in file_contents:
    if s in punctuations:
        file_contents = file_contents.replace(s, "")
files = file_contents.lower().split()
result = {}
for word in files:
    if word.isalpha() and word not in uninteresting_words:
        if word in result:
            result[word] += 1
        else:
            result[word] = 1
#wordcloud
cloud = wordcloud.WordCloud()
cloud.generate_from_frequencies(result)
return cloud.to_array()
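As a usage note (assuming matplotlib is available, as in the Coursera notebook this exercise comes from), the returned array can then be displayed like this:
import matplotlib.pyplot as plt

# calculate_frequencies returns cloud.to_array(), a numeric image array
myimage = calculate_frequencies(file_contents)
plt.imshow(myimage, interpolation='nearest')
plt.axis('off')
plt.show()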

Related

What am I missing to exclude uninteresting words from my wordcloud?

def calculate_frequencies(file_contents):
    # Here is a list of punctuations and uninteresting words you can use to process your text
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    uninteresting_words = ["the", "a", "to", "if", "is", "it", "of", "and", "or", "an", "as", "i", "me", "my", \
    "we", "our", "ours", "you", "your", "yours", "he", "she", "him", "his", "her", "hers", "its", "they", "them", \
    "their", "what", "which", "who", "whom", "this", "that", "am", "are", "was", "were", "be", "been", "being", \
    "have", "has", "had", "do", "does", "did", "but", "at", "by", "with", "from", "here", "when", "where", "how", \
    "all", "any", "both", "each", "few", "more", "some", "such", "no", "nor", "too", "very", "can", "will", "just"]
    # LEARNER CODE START HERE
    dict = {}
    new_list = []
    for word in file_contents.lower().split():
        if word.isalpha not in uninteresting_words:
            new_list.append(word)
    for word in new_list:
        if word not in dict.keys():
            dict[word] = new_list.count(word)
    #wordcloud
    cloud = wordcloud.WordCloud()
    cloud.generate_from_frequencies(dict)
    return cloud.to_array()
I have fixed my code so it doesn't raise any errors, but it still returns a word cloud containing uninteresting words. I feel that if word.isalpha not in uninteresting_words: should take care of it, assuming everything else is correct.
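The culprit is that condition itself: word.isalpha without parentheses is a reference to the method object, not the result of calling it, and a method object is never a member of a list of strings, so the test is true for every word and nothing gets filtered. A minimal sketch of the corrected condition:
# Call isalpha() and compare the word itself against the stop list
if word.isalpha() and word not in uninteresting_words:
    new_list.append(word)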

Convert CSV to dictionary without using libraries

I have this CSV:
color,property,type,id
red,house,building,02
I'm trying to convert a CSV to a dictionary with the following structure:
{
"0": {"val1": 1, "val2": 2, "val3": 3, ..., "valn": n},
"1": {"val1": 45, "val2": 7, "val3": None, ..., "valn": 68},
}
where val1, val2, and so on are the column header names, and "0" and "1" are the row numbers.
So we should have:
CSV content is like this:
color,property,type,id
red,house,building,02
blue,department,flat,04
{
"0": {"color": "red", "property": "house", "type": "building", ..., "valn": n},
"1": {"color": "blue", "property": "farm", "type": "area", ..., "valn": n},
}
How can I achieve this result without using any library? I'd like to implement it from scratch and not use the csv library or the like.
Thank you.
Try this approach:
inp = """color,property,type,id
red,house,building,02
blue,department,flat,04
cyan,,flat,10
"""
lines = inp.split('\n')
colnames = list(map(lambda x: x.strip(), lines[0].split(',')))
lines = lines[1:]
res = {}
for i, line in enumerate(lines[:-1]):  # lines[:-1] skips the empty string left by the trailing newline
    res[str(i)] = {  # string keys, matching the desired structure
        colname: val if val != '' else None
        for colname, val in zip(colnames, map(lambda x: x.strip(), line.split(',')))
    }
print(res)
However, for additional features like type deduction the code will be more complex; you can follow the answers to this question.
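For instance, a minimal sketch of type deduction (deduce is a hypothetical helper, not part of the answer above): try int, then float, and fall back to the raw string:
def deduce(val):
    # Hypothetical helper: deduce a value's type from its string form.
    if val == '':
        return None
    for cast in (int, float):
        try:
            return cast(val)
        except ValueError:
            pass
    return val
Replacing val if val != '' else None with deduce(val) in the dict comprehension would then turn "02" into 2 while leaving "red" as a string.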

CSV to Nested JSON

My CSV is like:
csId,lut,seqId,lvlId,lvlTyp,accSt,enrlDt,ptnrName,ptnrIds
27768303,1561939200,1,G,GAR,10,06-06-2018,Chase,12345
27768303,1561939200,1,G,GAR,10,06-06-2018,Chase,98765
27768303,1561939200,1,G,GAR,10,06-06-2018,fliggy,67890
68537125,1562025600,2,S,SAR,20,11-12-2014,fliggy,98696
But I am getting something like:
[{"accSt": "10",
"csId": 27768303,
"enrlDt": "06-06-2018",
"lut": 1561939200,
"lvlId": "G",
"lvlTyp": "GAR",
"ptnrlst": "ptnrName":"Chase","ptnrIds":12345},
"seqId": 1,
"type": "mber"},
{"accSt": "10",
"csId": 27768303,
"enrlDt": "06-06-2018",
"lut": 1561939200,
"lvlId": "G",
"lvlTyp": "GAR",
"ptnrlst": {"ptnrName":"Chase","ptnrIds":98765},
"seqId": 1,
"type": "mber"},
{ "accSt": "10",
"csId": 27768303,
"enrlDt": "06-06-2018",
"lut": 1561939200,
"lvlId": "G",
"lvlTyp": "GAR",
"ptnrlst": {"ptnrName":"fliggy","ptnrIds":67890},
"seqId": 1,
"type": "mber"},
{ "accSt": "20",
"csId": 68537125,
"enrlDt": "11-12-2014",
"lut": 1562025600,
"lvlId": "S",
"lvlTyp": "SAR",
"ptnrlst": {"ptnrName":"Chase","ptnrIds":98696},
"seqId": 2,
"type": "mber"}]
I tried using the following code:
from csv import DictReader
from itertools import groupby
from pprint import pprint
import fileinput
import time

def check_for_null_new(allvars):
    first_split = allvars.split(',')
    ret_val = ""
    loop_cnt = 1
    for second_split in first_split:
        individual_split = second_split.split(':')
        if not individual_split[1]:
            pass
        else:
            if loop_cnt == 1:
                if individual_split[1].isnumeric():
                    ret_val = (individual_split[0] + ":" + individual_split[1])
                else:
                    ret_val = (individual_split[0] + ":'" + individual_split[1] + "'")
            else:
                if individual_split[1].isnumeric():
                    ret_val = ret_val + ',' + (individual_split[0] + ":" + individual_split[1])
                else:
                    ret_val = ret_val + ',' + (individual_split[0] + ":'" + individual_split[1] + "'")
        loop_cnt = loop_cnt + 1
    return (ret_val)

start_time = time.time()
with open('member.csv', encoding='utf-8-sig') as csvfile:
    r1 = DictReader(csvfile, skipinitialspace=True)
    data = [dict(d) for d in r1]
groups = []
uniquekeys = []
for k, g in groupby(data, lambda r: (r['csId'], r['lut'], r['seqId'], r['lvlId'], r['lvlTyp'], r['accSt'], r['enrlDt'], r['ptnrName'], r['ptnrIds'])):
    groups.append({"type": "mber",
                   "csId": int(k[0]),
                   "lut": int(k[1]),
                   "seqId": int(k[2]),
                   "lvlId": k[3],
                   "lvlTyp": k[4],
                   "accSt": k[5],
                   "enrlDt": k[6],
                   "ptnrlst": {check_for_null_new("'ptnrName':" + k[7] + ",'ptnrIds':" + k[8])}
                   })
    uniquekeys.append(g)
with open('member.json', 'wt') as out:
    pprint(groups, stream=out)
with fileinput.FileInput('member.json', inplace=True, backup='.bak') as file:
    for line in file:
        print(line.replace("\"", "").replace("'", "\""), end='')
end_time = time.time()
print("CSV to JSON Completed for Member in %s seconds " % (end_time - start_time))
My expected output is :
[
{
"type": "mber",
"csId": 27768303,
"lut": 1561939200,
"seqId":1,
"lvlId": "G",
"lvlTyp": "GAR",
"accSt": "10",
"enrlDt": "06-06-2018",
"ptnrlst":[{"ptnrName":"Chase",
"ptnrIds":["12345","98765"]
},
{"ptnrName":"fliggy",
"ptnrIds":["67890"]
}]
},
{
"type": "mber",
"csId": 68537125,
"lut": 1562025600,
"seqId":2,
"lvlId": "S",
"lvlTyp": "SAR",
"accSt": "20",
"enrlDt": "11-12-2014",
"ptnrlst":[{"ptnrName":"chase","ptnrIds":["98696"]
}]
}
]
To answer the specific question of reading from a CSV file and writing a JSON file with nested objects, which in your case seem to be ptnrName and ptnrIds:
Initializing and reading the file is straightforward, assuming you are comfortable loading the whole file in memory.
import csv
import itertools
from operator import itemgetter

with open('members.csv', 'r') as csvfile:
    all_ = list(csv.DictReader(csvfile, skipinitialspace=True))
Since we are using itertools' groupby function, the data needs to be sorted by the grouping keys first; these are all the fields except the nested ones mentioned above.
keys = 'csId lut seqId lvlId lvlTyp accSt enrlDt'.split()
all_.sort(key = itemgetter(*(keys)))
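As a quick aside on why the sort matters: itertools.groupby only merges adjacent equal keys, so on unsorted input the same key can show up in several groups:
from itertools import groupby

data = ['a', 'b', 'a']
print([k for k, _ in groupby(data)])          # ['a', 'b', 'a'] - 'a' is split in two
print([k for k, _ in groupby(sorted(data))])  # ['a', 'b']     - one group per key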
The next part is where we create the nesting, using two groupby calls: the first groups rows by the keys and identifies whether they have nested objects, and the second groups those rows by ptnrName. Putting it all together, you get:
import csv
import itertools
import json
from pprint import pprint
from operator import itemgetter

with open('members.csv', 'r') as csvfile:
    all_ = list(csv.DictReader(csvfile, skipinitialspace=True))

keys = 'csId lut seqId lvlId lvlTyp accSt enrlDt'.split()  # list of keys
all_.sort(key=itemgetter(*keys))  # in-place sort based on keys
ds = []
# 1st groupby based on keys
for k, g in itertools.groupby(all_, key=lambda r: [r[i] for i in keys]):
    d = {key: value for key, value in zip(keys, k)}  # create the default key/value pairs (zip with keys, not the undefined "keep")
    d['seqId'] = int(d['seqId'])
    for k1, g1 in itertools.groupby(g, key=lambda r: r['ptnrName']):  # 2nd groupby ptnrName
        array = [i['ptnrIds'] for i in g1]  # array of multiple ptnrIds based on ptnrName
        # set default key ptnrlst to a list to store nested ptnrName, ptnrIds pairs
        d.setdefault('ptnrlst', []).append({'ptnrName': k1, 'ptnrIds': array})
    ds.append(d)
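One detail worth flagging: the expected output in the question has csId and lut as integers, but only seqId is converted above, so both stay strings (visible in the results below). If that matters, the same treatment can be applied inside the outer loop:
# mirroring the seqId conversion above
d['csId'] = int(d['csId'])
d['lut'] = int(d['lut'])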
And the results are as expected; also note that when working with dictionaries, the keys are not kept in any particular order:
[{'accSt': '10',
'csId': '27768303',
'enrlDt': '06-06-2018',
'lut': '1561939200',
'lvlId': 'G',
'lvlTyp': 'GAR',
'ptnrlst': [{'ptnrIds': ['12345', '98765'], 'ptnrName': 'Chase'},
{'ptnrIds': ['67890'], 'ptnrName': 'fliggy'}],
'seqId': 1},
{'accSt': '20',
'csId': '68537125',
'enrlDt': '11-12-2014',
'lut': '1562025600',
'lvlId': 'S',
'lvlTyp': 'SAR',
'ptnrlst': [{'ptnrIds': ['98696'], 'ptnrName': 'fliggy'}],
'seqId': 2}]
And finally, dump to JSON:
with open('member.json', 'w') as jsonfile:
    json.dump(ds, jsonfile)

with open('member.json', 'r') as jsonfile:
    jload = json.load(jsonfile)

jload == ds
>> True

adding, copying and creating a new csv file

I got an assignment to import a CSV file with some fields, and I need to create a new CSV file whose columns include the original fields, in a different order.
original csv:
full name,Position,Phone,Email,LinkedIn,Source,Comment
I tried to look it up online and this is as far as I got:
import csv

with open("mobileTL.csv", 'r') as csv_file:
    reader = csv.reader(csv_file)
    newcsvdict = {"First name": [], "Middle name": [], "Last name": [], "Email": [], "Creation date": [], "Status": [],
                  "Position": [], "ID/SSN": [], "Source": [], "Source type": [], "Availability": [], "Salary expectations": [],
                  "Phone": [], "Mobile": [], "Street Adress": [], "City": [], "State": [], "Country": [], "Zip": [],
                  "LinkedIn URL": [], "Resume file name": [], "Migration ID": [], "Comment": [], "Comment2": []}
    next(reader)
    for row in reader:
        first = ""
        last = ""
        if row[0] != "":
            first = row[0].split()[0]
            last = row[0].split()[1]
        newcsvdict["First name"].append(first)
        newcsvdict["Last name"].append(last)
        newcsvdict["Phone"].append(row[2])
        newcsvdict["Position"].append(row[1])
        newcsvdict["Email"].append(row[3])
        newcsvdict["Source"].append(row[5])
        newcsvdict["Comment"].append(row[6])
        newcsvdict["LinkedIn URL"].append(row[4])
with open('new.csv', 'w') as csv_file:
    w = csv.DictWriter(csv_file, newcsvdict.keys())
    w.writeheader()
    w.writerows(newcsvdict)
It does create a new file but for some reason only the header is written.
First, the reason it's only writing the header is that you'll get an error:
Traceback (most recent call last):
File "test.py", line 29, in <module>
w.writerows(newcsvdict)
...
wrong_fields = rowdict.keys() - self.fieldnames
AttributeError: 'str' object has no attribute 'keys'
You need to learn not to ignore error messages. The cause of that problem is that you were using writerows (note plural rows, which expects an iterable of rows) instead of writerow (note singular row, which expects just one row). To use writerows, you need to pass a list of dicts like this:
w.writerows([newcsvdict, newcsvdict, newcsvdict])
You should be using writerow, since you seem to only have 1 row, newcsvdict. Though, when I went ahead and did that, the output does not seem to be what you need:
First name,Middle name,Last name,Email,Creation date,Status,Position,ID/SSN,Source,Source type,Availability,Salary expectations,Phone,Mobile,Street Adress,City,State,Country,Zip,LinkedIn URL,Resume file name,Migration ID,Comment,Comment2
"['aaa', 'bbb', 'ccc']",[],"['AAA', 'BBB', 'CCC']","['aaa#email.com', 'bbb#email.com', 'ccc#email.com']",[],[],"['Pos1', 'Pos2', 'Pos3']",[],"['aaa', 'bbb', 'ccc']",[],[],[],"['123', '456', '789']",[],[],[],[],[],[],"['aaa', 'bbb', 'ccc']",[],[],"['aaa', 'bbb', 'ccc']",[]
That looks weird, because you created a dict with a list for each value (ex. "First name": []). Maybe that's what you want... but my understanding of your requirement is that you want the new CSV to have the same number of rows but different columns.
For that, it does not make sense to store the values as lists. One solution is to read one row, create a dict for it, writerow it, then repeat the steps for all the rows. You can also use DictReader to easily access the values from the old CSV as a dict.
with open("new.csv", "w") as new_file:
new_row = dict.fromkeys([
"First name", "Middle name", "Last name", "Email",
"Creation date", "Status", "Position", "ID/SSN",
"Source", "Source type", "Availability", "Salary expectations",
"Phone", "Mobile", "Street Adress", "City",
"State", "Country", "Zip", "LinkedIn URL",
"Resume file name", "Migration ID", "Comment", "Comment2"
])
writer = csv.DictWriter(new_file, fieldnames=new_row.keys())
writer.writeheader()
with open("old.csv", 'r') as old_file:
old_csv = csv.DictReader(old_file)
for row in old_csv:
first = ""
last = ""
if row["full name"] != "":
first, last = row["full name"].split()
new_row["First name"] = first
new_row["Last name"] = last
new_row["Phone"] = row["Phone"]
new_row["Position"] = row["Position"]
new_row["Email"] = row["Email"]
new_row["Source"] = row["Source"]
new_row["Comment"] = row["Comment"]
new_row["LinkedIn URL"] = row["LinkedIn"]
writer.writerow(new_row)
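One last aside, not part of the original answer: the csv module's documentation recommends opening files with newline='' when writing; without it, on Windows each row can be followed by a blank line. A minimal example:
import csv

# newline='' lets the csv module handle line endings itself
with open("new.csv", "w", newline="") as new_file:
    writer = csv.writer(new_file)
    writer.writerow(["First name", "Last name"])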

I don't know why the second if block doesn't work

#!/usr/bin/python
from TwitterSearch import *
import sys
import csv
tso = TwitterSearchOrder() # create a TwitterSearchOrder object
tso.set_keywords(['gmo']) # let's define all words we would like to have a look for
tso.set_language('en') # we want to see English tweets only
tso.set_include_entities(False) # and don't give us all those entity information
max_range = 1 # search range in kilometres
num_results = 500 # minimum results to obtain
outfile = "output.csv"
# create twitter API object
twitter = TwitterSearch(
    access_token = "764537836884242432-GzJmUSL4hcC2DOJD71TiQXwCA0aGosz",
    access_token_secret = "zDGYDeigRqDkmdqTgBOltcfNcNnfLwRZPkPLlnFyY3xqQ",
    consumer_key = "Kr9ThiJWvPa1uTXZoj4O0YaSG",
    consumer_secret = "ozGCkXtTCyCdOcL7ZFO4PJs85IaijjEuhl6iIdZU0AdH9CCoxS"
)
# Create an array of USA states
ustates = [
    "AL", "AK", "AS", "AZ", "AR", "CA", "CO", "CT", "DE", "DC",
    "FM", "FL", "GA", "GU", "HI", "ID", "IL", "IN", "IA", "KS",
    "KY", "LA", "ME", "MH", "MD", "MA", "MI", "MN", "MS", "MO",
    "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "MP",
    "OH", "OK", "OR", "PW", "PA", "PR", "RI", "SC", "SD", "TN",
    "TX", "UT", "VT", "VI", "VA", "WA", "WV", "WI", "WY", "USA"
]
def linearSearch(item, obj, start=0):
    for i in range(start, len(obj)):
        if item == obj[i]:
            return True
    return False
# open a file to write (mode "w"), and create a CSV writer object
csvfile = file(outfile, "w")
csvwriter = csv.writer(csvfile)
# add headings to our CSV file
row = ["user", "text", "place"]
csvwriter.writerow(row)
#-----------------------------------------------------------------------
# the twitter API only allows us to query up to 100 tweets at a time.
# to search for more, we will break our search up into 10 "pages", each
# of which will include 100 matching tweets.
#-----------------------------------------------------------------------
result_count = 0
last_id = None
while result_count < num_results:
    # perform a search based on latitude and longitude
    # twitter API docs: https://dev.twitter.com/docs/api/1/get/search
    query = twitter.search_tweets_iterable(tso)
    for result in query:
        state = 0
        if result["place"]:
            user = result["user"]["screen_name"]
            text = result["text"]
            text = text.encode('utf-8', 'replace')
            place = result["place"]["full_name"]
            state = place.split(",")[1]
            if linearSearch(state, ustates):
                print state
                # now write this row to our CSV file
                row = [user, text, place]
                csvwriter.writerow(row)
                result_count += 1
        last_id = result["id"]
print "got %d results" % result_count
csvfile.close()
I am trying to categorize the tweets by my array ustates, but the second if block doesn't seem to work, and I have no idea why. What I did was a linear search: if my item equals an item in my array, I write it to the CSV file.
It looks like the problem is leftover whitespace: place.split(",")[1] keeps the space after the comma, so state is " WY" rather than "WY" and the membership test fails. You can use .strip() to remove it:
>>> x=" WY "
>>> x.strip()
'WY'
>>>
Also, some other tips:
To speed up the membership test, make ustates a set instead of a list: a set has constant-time membership checks, while a list requires a linear search.
The preferred way to open a file is with a context manager, which ensures the file is closed at the end of the block, even if an error occurs inside it. Also, use open instead of file.
With those tips, the code should look like:
#!/usr/bin/python
... # all the previous stuff

# Create a set of USA states
ustates = {
    "AL", "AK", "AS", "AZ", "AR",
    "CA", "CO", "CT",
    "DE", "DC",
    "FM", "FL",
    "GA", "GU",
    "HI",
    "ID", "IL", "IN", "IA",
    "KS", "KY",
    "LA",
    "ME", "MH", "MD", "MA", "MI", "MN", "MS", "MO", "MT", "MP",
    "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND",
    "OH", "OK", "OR",
    "PW", "PA", "PR",
    "RI",
    "SC", "SD",
    "TN", "TX",
    "UT",
    "VT", "VI", "VA",
    "WA", "WV", "WI", "WY",
    "USA"
}  # this arrangement just takes fewer lines, while grouping the states alphabetically

# open a file to write (mode "w"), and create a CSV writer object
with open(outfile, "w") as csvfile:
    ... # the rest is the same
    while result_count < num_results:
        # perform a search based on latitude and longitude
        # twitter API docs: https://dev.twitter.com/docs/api/1/get/search
        query = twitter.search_tweets_iterable(tso)
        for result in query:
            state = 0
            if result["place"]:
                ... # all the other stuff
                state = state.strip()  # <--- the strip part; add .upper() if needed, or just in case
                if state in ustates:
                    ... # all the other stuff
        ... # the rest of stuff
print "got %d results" % result_count
