why I get KeyError when I extract data with specific keywords from CSV file using python? - python-3.x

I am trying to use below code to get posts with specific keywords from my csv file but I keep getting KeyErro "Tag1"
import re
import string
import pandas as pd
import openpyxl
import glob
import csv
import os
import xlsxwriter
import numpy as np
keywords = {"agile","backlog"}
# all your keywords
df = pd.read_csv(r"C:\Users\ferr1982\Desktop\split1_out.csv",
error_bad_lines=False)#, sep="," ,
encoding="utf-8")
output = pd.DataFrame(columns=df.columns)
for i in range(len(df.index)):
#if (df.loc[df['Tags'].isin(keywords)]):
if any(x in ((df['Tags1'][i]),(df['Tags2'][i]), (df['Tags3'][i] ),
(df['Tags4'][i]) , (df['Tags5'][i])) for x in keywords):
output.loc[len(output)] = [df[j][i] for j in df.columns]
output.to_csv("new_data5.csv", incdex=False)

Okay, it turned to be that there is a little space before "Tags" column in my CSV file !
it is working now after I added the space to the name in the code above.

Related

Combining CSV files using Pandas is appending additional columns rather than right below?

I'm not exactly sure what is the best way to describe this problem, but the photos below should be pretty clear.
First photo is the current output and the second photo is the desired output.
Here is the code I'm using to combine these files:
import os
import glob
import pandas as pd
os.chdir("mydir")
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
#combine all files inthe list
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ], axis=1)
#export to csv
combined_csv.to_csv( "combined_csv.csv", index=False, encoding='utf-8-sig')
I've also used this with no luck:
import pandas as pd
import glob
import os
# merging the files
joined_files = os.path.join("mydir", "clean_csv*.csv")
# A list of all joined files is returned
joined_list = glob.glob(joined_files)
# Finally, the files are joined
df = pd.concat(map(pd.read_csv, joined_list), ignore_index=True)
df.to_csv('output-test.csv', index=True, encoding='utf-8-sig', header=None)

Reading CSV file with proper encoding in pandas

I can not read the csv file in my jupiternotebook, the following is the link github link of the csv file
https://github.com/roshanthokchom/new-assignment/blob/master/spam.csv
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
import urllib
pd.read_csv('spam.csv',encoding='latin-1')
ParserError: Error tokenizing data. C error: Expected 2 fields in line 13, saw 4
#Roshan here is the solution to your problem:
import pandas as pd
import csv
with open('spam.csv', newline='') as f:
csvread = csv.reader(f)
raw_data = list(csvread)
data = []
for i in batch_data:
i = i[0].split("\t")
data.append(i)
final_data = pd.DataFrame(data)
You can specify encoding as you have done but your file consists of commas in between text so if you read normally pandas will separate data based on ",". Thats why you are getting an error

Import dataset from url and convert text to csv in python3

I am pretty new to Python (using Python3) and read Pandas to import dataset.
I need to import dataset from url - https://newonlinecourses.science.psu.edu/stat501/sites/onlinecourses.science.psu.edu.stat501/files/data/leukemia_remission/index.txt
and convert it to csv file, I am getting some special character in converted csv -> ��
I am download txt file and converting it to csv, is is the right approach?
and converted csv is putting entire text into one column
from urllib.request import urlretrieve
import pandas as pd
from pandas import DataFrame
url = 'https://newonlinecourses.science.psu.edu/stat501/sites/onlinecourses.science.psu.edu.stat501/files/data/leukemia_remission/index.txt'
urlretrieve(url, 'index.txt')
df = pd.read_csv('index.txt', sep='/t', engine='python', lineterminator='\r\n')
csv_file = df.to_csv('index.csv', sep='\t', index=False, header=True)
print(csv_file)
after successful import, I have to Extract X as all columns except the first column and Y as first column also.
I'll appreciate your all help.
from urllib.request import urlretrieve
import pandas as pd
url = 'https://newonlinecourses.science.psu.edu/stat501/sites/onlinecourses.science.psu.edu.stat501/files/data/leukemia_remission/index.txt'
urlretrieve(url, 'index.txt')
df = pd.read_csv('index.txt', sep='\t',encoding='utf-16')
Y = df[['REMISS']]
X = df.drop(['REMISS'],axis=1)

import data from multiple file and summing column wise

I have n number of txt files each having 99 floating numbers in 99 column. I read each files and append all data by following script.
import glob
import numpy as np
import matplotlib.pyplot as plt
msd_files = (glob.glob('MSD_no_fs*'))
msd_all=[]
for msd_file in msd_files:
# print(msd_file)
msd = numpy.loadtxt(fname=msd_file, delimiter=',')
msd_all.append(msd)
After that I need to make column wise summation of each files. for example file1,column1+file2,column1+...+file(n)column(1) and iterate this for all column. What will be the effective way to perform this? Can I use list comprehension for that?
**edited code and it works fine now.
import glob
import numpy as np
import matplotlib.pyplot as plt
msd_files = (glob.glob('MSD_no_fs*'))
msd_all=[]
for msd_file in msd_files:
with open(msd_file) as f:
for line in f:
# msd_all.append([float(v) for v in line.strip().split(',')])
msd_all.append(float(line.strip()))
msa_array = np.array(msd_all)
x=np.split(msa_array,99)
x=np.array(x)
result=np.mean(x,axis=0)
print(result.shape)
print(len(result))
It depends on efficiency level you want. Using numpy to load many csv files might be a bad choice. Here is my suggestion.
import glob
import numpy as np
msd_files = (glob.glob('MSD_no_fs*'))
msd_all=[]
for msd_file in msd_files:
with open(msd_file) as f:
for line in f:
msd_all.append([float(v) for v in line.strip().split(',')])
msa_array = np.array(msd_all)
result = msa_array.sum(axis=0)

how to solve the keyerror when I load a CSV file using pandas

I use pandas to load a csv file and want to print out data of row, here is original data
orginal data
I want to print out 'violence' data for make a bar chart, but it occuar a keyerror, here is my code
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
c_data=pd.read_csv('crime.csv')
print(c_data.head())
print (c_data['violence'])
and the error
error detail
error detail
I tried use capital VIOLENCE,print (c_data['VIOLENCE']),but also failed
error detail
error detail
can someone tell me how to work it out?
Try the following if your data is small:
with open('crime.csv', 'r') as my_file:
reader = csv.reader(my_file)
rows = list(reader)
print rows[3]
If your data is big, try this:
from itertools import islice
with open('crime.csv', 'r') as my_file:
reader = csv.reader(my_file)
print next(islice(reader, 3, 4))

Resources