I'm currently applying some machine learning code to analyze emails from the enron dataset using the following code in python:
import os
import numpy as np
from collections import Counter
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVC, NuSVC, LinearSVC
def make_dictionary(train_dir):
emails = [os.path.join(train_dir,f) for f in os.listdir(train_dir)]
all_words = []
for mail in emails:
with open(mail) as m:
for i,line in enumerate(m):
if i == 2:
words = line.split()
all_words += words
dictionary = Counter(all_words)
return dictionary
list_to_remove = dictionary.keys()
for item in list_to_remove:
if item.isalpha() == False:
del dictionary[item]
elif len(item) == 1:
del dictionary[item]
dictionary = dictionary.most_common(3000)
train_dir = 'train-mails'
dictionary = make_Dictionary(train_dir)
def extract_features(mail_dir):
files = [os.path.join(mail_dir,fi) for fi in os.listdir(mail_dir)]
features_matrix = np.zeros((len(files),3000))
docID = 0;
for fil in files:
with open(fil) as fi:
for i,line in enumerate(fi):
if i == 2:
words = line.split()
for word in words:
wordID = 0
for i,d in enumerate(dictionary):
if d[0] == word:
wordID = i
features_matrix[docID,wordID] = words.count(word)
docID = docID + 1
return features_matrix
train_labels = np.zeros(702)
train_labels[351:701] = 1
train_matrix = extract_features(train_dir)
model1 = MultinomialNB()
model2 = LinearSVC()
model1.fit(train_matrix,train_labels)
model2.fit(train_matrix,train_labels)
test_dir = 'test-mails'
test_matrix = extract_features(test_dir)
test_labels = np.zeros(260)
test_labels[130:260] = 1
result1 = model1.predict(test_matrix)
result2 = model2.predict(test_matrix)
print confusion_matrix(test_labels,result1)
print confusion_matrix(test_labels,result2)
However, every time I run it, it says that dictionary is not defined and I can not figure out why it doesn't want to work. I've indented the areas which need it and I have the correct modules imported but it still doesn't work. Any ideas on how to fix this would be helpful.
dictionary = make_Dictionary(train_dir)
should be dictionary = make_dictionary(train_dir)
python is case sensitive. D should be d.
Related
Code works well when I hardcode the nodes (e.g. node1), but not when I use user input - it always returns 0 instead of counting the numbers which are "node3". Here is the page I am using http://py4e-data.dr-chuck.net/comments_678016.xml - node1 = comments, node2 = comment, node3= count. Any suggestions?
import urllib.request, urllib.parse, urllib.error
import xml.etree.ElementTree as ET
import ssl
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
url = input ("Which url?\n")
node1 = input ("Enter node1- ")
node2 = input ("Enter node2- ")
node3 = input ("Enter node3- ")
count = 0
try:
html = urllib.request.urlopen(url, context=ctx).read()
tree = ET.fromstring(html)
x = tree.findall(node1/node2)
for item in x:
c = int(item.find(node3).text)
count = count + c
print(count)
except:
print("Please only input complete urls")
Putting aside the user input angle, if you want "to sum up all numbers under "count"", change your xpath expression to
x = tree.findall('.//comment/count')
and then either do it the long way (which I personally prefer):
total = 0
for count in x:
total += int(count.text)
or use list comprehensions:
sum([int(count.text) for count in x])
In either case, the output is
2348
Found out what the mistake was - needed to concatenate strings:
x = tree.findall(node1 + "/" + node2)
import urllib.request, urllib.parse, urllib.error
import xml.etree.ElementTree as ET
import ssl
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
url = input("Which url?\n")
node1 = input("Enter node1- ")
node2 = input("Enter node2- ")
node3 = input("Enter node3- ")
count = 0
try:
html = urllib.request.urlopen(url, context=ctx).read()
tree = ET.fromstring(html)
x = tree.findall(node1 + "/" + node2)
for item in x:
count += int(item.find(node3).text)
print(count)
except:
print("Please only input complete urls")
I am using below python code:
n = int(input('enter the number:'))
student_marks = {}
for i in range(n):
name, *line = input().split()
scores = list(map(float, line))
student_marks[name] = scores
query_name = input('enter the name:')
list_1 = list(student_marks[query_name])
no = len(l)
print(no)
s = sum(l)
print(s)
ss = s/no
print(ss)
But, i am getting an error while input the query_name during the run of code.
source: https://www.hackerrank.com/challenges/finding-the-percentage/problem
you can try to do
n = int(input('enter the number:'))
student_marks = {}
for i in range(n):
name, *line = input("enter name and scroe (spared by space): ").split()
scores = list(map(float, line))
student_marks[name] = scores
query_name = input('enter the name:')
list_1 = list(student_marks[query_name])
no = len(list_1)
print("the numer of scores {}".format(no))
s = sum(list_1)
print("The sum of all scores {}".format(s))
ss = s/no
print("The average score {}".format(ss))
if __name__ == '__main__':
n = int(input())
student_marks = {}
count = 0
for _ in range(n):
name, *line = input().split()
scores = list(map(float, line))
student_marks[name] = scores
query_name = input()
for i in student_marks[query_name]:
count += i
average = count / len(student_marks[query_name])
print("%.2f" %average)
You can try this solution:
--------------------------
from decimal import Decimal
# Main function (like Java main() method)
if __name__ == '__main__':
# Taking number of times input will be taken from console and converting it into int type
n = int(input())
# creating an empty dictionary
student_marks = {}
# Iterate from: 0 to n-1 times
for _ in range(n):
# Taking the first argument as name and all other numbers inside line var
name, *line = input().split()
# Converting the numbers contained in line variable to a map then, converting into list
scores = list(map(float, line))
# Inserting into dictionary as key - value pair
student_marks[name] = scores
# Taking the student name from console and store into query_name
query_name = input()
# Fetch student marks using student name
query_scores = student_marks[query_name]
# Sum all the marks
total_scores = sum(query_scores)
# Find average of the marks
avg = Decimal(total_scores/3)
# print the average upto two decimal point
print(round(avg, 2))
I tried using for-loop to exec my function but its always the same result.
import random
def main(arr):
result = random.choice(arr)
...some code...
return len(result)
for i in range(100):
main(arr)
I could only get diff result from stop/run the terminal. Anyone know why?
my question is the same as this one. random.choice always same
import random
results = []
with open('kargerMinCut.txt') as inputfile:
for line in inputfile:
results.append(line.strip().split('\t'))
def contract(arr):
while len(arr) > 2:
# Generate random number in list of lists
# ranList = random.choice(arr)
ranList = arr[np.random.choice(len(arr))]
ranNum = random.choice(ranList[1:])
# retrieve the index of the random number
listIndex = arr.index(ranList)
for i in range(0, len(arr)):
if arr[i][0] == ranNum:
targetList = i
break
target = ranList[0]
for i in range(0, len(arr)):
if i == listIndex:
arr[i].pop(0)
arr[i] = [x for x in arr[i] if x != ranNum]
elif i == targetList:
arr[i] = [x for x in arr[i] if x != target]
else:
for index, item in enumerate(arr[i]):
if item == target:
arr[i][index] = ranNum
arr[targetList] += arr[listIndex]
del arr[listIndex]
return len(arr[0])-1
the arr would be like this
array = [[1,2,3,4],[2,1,3,4],[3,1,2,4],[4,1,2,3]]
I don't know what you do inside your function but I've got the normal result. And in the question what you linked to the person just used seed. This is kinda pseudorandom that gives you all the time the same random output. Here is the link to deep your knowledge about pseudorandom
import random
arr = [1,2,3,4,5]
def main(arr):
result = random.choice(arr)
print(result)
for i in range(100):
main(arr)
The result is as it has to be:
1
3
5
3
4
3
1
4
4
3
2
this is a python program to generate a random string and to match it with a user given output and to get a return on the amount of attempts by the computer but i cant get the try count
import random
class txt:
def __init__(self):
self.txt = None
trycount = 0
def maketxt(self,txt):
txt = ""
a = []
a.append(txt.split())
# return a
# def match(self):
tokenlist = ["a", "b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z"]
matchlist =[]
while (len(matchlist) <=24):
x =random.choice(tokenlist)
matchlist.append(x)
if matchlist == a:
print(trycount)
else :
trycount += 1
match()
t = txt()
t.maketxt("hagjkrshgujrahg")
I keep getting the error
File "C:/Users/#####/AppData/Local/Programs/Python/Python36/test1.py", line 25, in maketxt
trycount += 1
UnboundLocalError: local variable 'trycount' referenced before assignment
How do I implement 'insert mutation' and 'cycle recombination' for a TSP problem using genetic algorithm in Python3?
Assume I have done the selection steps and now I have two parents for reproduction.
This is what I have done so far:
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 13 14:15:31 2016
#author: niloo
"""
import pandas as pd
import random
import numpy as np
from geopy.distance import great_circle as gcy
def read(file_name):
data = pd.DataFrame.from_csv(file_name, sep=" ")
data = np.array(data)
return data
def Dist(data):
# distance = []
for i in range(len(data)):
temp = []
for j in range(len(data)):
if j < i:
temp.append(distance[j][i])
elif i == j:
temp.append(0)
else:
temp.append(gcy((data[i][0], data[i][1]), (data[j][0], data[j][1])).km)
distance.append(temp)
#print(distance[1][2])
return distance
def k_fitness():
k = 50
t = 3
first_gener = []
how_fit = []
for i in range(0, k):
temp = np.random.permutation(535)
first_gener.append(temp)
for i in range(len(first_gener)):
sum_dis = 0
for j in range(0, 534):
temp1 = first_gener[i][j]
temp2 = first_gener[i][j + 1]
sum_dis += distance[int(temp1)][int(temp2)]
how_fit.append(sum_dis)
# print(how_fit)
race1 = np.random.randint(k, size=t)
winner1 = 9999999999999999999
for i in range(0, t):
if how_fit[int(race1[i])] < winner1:
winner1 = how_fit[int(race1[i])]
mom = first_gener[int(race1[i])]
#print (mom)
race2 = np.random.randint(k, size=t)
winner2 = 9999999999999999999
for i in range(0, t):
if how_fit[int(race2[i])] < winner2:
winner2 = how_fit[int(race2[i])]
dad = first_gener[int(race2[i])]
return mom, dad
def cross_over(mom , dad):
#mom = [1,2,3,4,5,6,7,8,9]
#dad = [9,3,7,8,2,6,5,1,4]
if len(mom) != len(dad):
print ('error')
else:
child1 = [0] * len(mom)
child2 = [0] * len(mom)
flag = False
index = [0] * len(mom)
while True:
ind = -1
for i in range(len(mom)):
if index[i] == 0:
ind = i
break
if ind == -1:
break
temp = ind
while True:
index[temp] = 1
if flag == False:
child1[temp] = mom[temp]
child2[temp] = dad[temp]
else:
child1[temp] = dad[temp]
child2[temp] = mom[temp]
val = dad[temp]
for i in range(len(dad)):
if mom[i] == val:
temp = i
break
if ind == temp:
break
if flag == False:
flag = True
else:
flag = False
#print child1
#print child2
return [child1 , child2]
def mutation(offspring):
if random.random() <= 0.02:
index1 = random.randint(534)
index2 = random.randint(534)
if index1 != index2:
ind1 = min(index1 , index2)
ind2 = max(index1 , index2)
temp = offspring [ : ind1+1]
temp += offspring [ind2 : ind2+1]
temp += offspring [ind1+1 : ind2]
temp += offspring [ind2+1 : ]
offspring = temp
return offspring
else:
return offspring
dat = read('ali535.tsp')
distance = []
Dist(dat)
k_fitness()