I have n txt files, each containing 99 floating-point numbers in 99 columns. I read each file and append all the data with the following script.
import glob
import numpy as np
import matplotlib.pyplot as plt
msd_files = glob.glob('MSD_no_fs*')
msd_all = []
for msd_file in msd_files:
    # print(msd_file)
    msd = np.loadtxt(fname=msd_file, delimiter=',')
    msd_all.append(msd)
After that I need a column-wise summation across files, for example file1 column1 + file2 column1 + ... + file(n) column1, repeated for every column. What would be an efficient way to do this? Can I use a list comprehension for that?
**Edit:** the code below works fine now.
import glob
import numpy as np
import matplotlib.pyplot as plt
msd_files = glob.glob('MSD_no_fs*')
msd_all = []
for msd_file in msd_files:
    with open(msd_file) as f:
        for line in f:
            # msd_all.append([float(v) for v in line.strip().split(',')])
            msd_all.append(float(line.strip()))
msa_array = np.array(msd_all)
x = np.split(msa_array, 99)
x = np.array(x)
result = np.mean(x, axis=0)
print(result.shape)
print(len(result))
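For reference, the split-and-average step above can be written with a single reshape; a minimal equivalent sketch, assuming the concatenated array's length is an exact multiple of 99:

# Equivalent to np.split(msa_array, 99) followed by np.mean(..., axis=0):
result = msa_array.reshape(99, -1).mean(axis=0)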
It depends on the level of efficiency you want. Using numpy to load many CSV files can be a poor choice performance-wise. Here is my suggestion:
import glob
import numpy as np
msd_files = glob.glob('MSD_no_fs*')
msd_all = []
for msd_file in msd_files:
    with open(msd_file) as f:
        for line in f:
            msd_all.append([float(v) for v in line.strip().split(',')])
msa_array = np.array(msd_all)
result = msa_array.sum(axis=0)
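If the files are small, the list-comprehension approach from the question also works as a one-liner; a minimal sketch, assuming every file parses cleanly with np.loadtxt and reusing msd_files from above:

# Load each file as a row of 99 values, then sum column-wise in one step:
result = np.sum([np.loadtxt(f, delimiter=',') for f in msd_files], axis=0)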
I am trying to run the below script to add two columns to the left of a file; however, it keeps giving me
ValueError: header must be integer or list of integers
Below is my code:
import pandas as pd
import numpy as np
read_file = pd.read_csv("/home/ex.csv", header='true')
df = pd.DataFrame(read_file)

def add_col(x):
    df.insert(loc=0, column='Creation_DT', value=pd.to_datetime('today'))
    df.insert(loc=1, column='Creation_By', value="Sean")
    df.to_parquet("/home/sample.parquet")

add_col(df)
Also, is there any way to make the Creation_DT column a string?
According to the pandas docs, header is the row number(s) to use as the column names and the start of the data, and it must be an int or a list of ints. So you have to pass header=0 to the read_csv method.
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
Also, read_csv already returns a DataFrame, so you don't need to create one additionally. Just use
df = pd.read_csv("/home/ex.csv", header=0)
You can try:
import pandas as pd
import numpy as np
read_file = pd.read_csv("/home/ex.csv")
df = pd.DataFrame(read_file)

def add_col(x):
    df.insert(loc=0, column='Creation_DT', value=str(pd.to_datetime('today')))
    df.insert(loc=1, column='Creation_By', value="Sean")
    df.to_parquet("/home/sample.parquet")

add_col(df)
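If you want the date as a fixed-format string rather than the full timestamp, strftime gives you control over the layout; a minimal sketch, assuming a YYYY-MM-DD format is wanted:

# Insert the creation date as a plain 'YYYY-MM-DD' string:
df.insert(loc=0, column='Creation_DT', value=pd.Timestamp('today').strftime('%Y-%m-%d'))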
I am trying to read 6 files into 7 different data frames, but I am unable to figure out how I should do that. The file names can be completely random; that is, I know the files, but they do not follow a pattern like data1.csv, data2.csv.
I tried using something like this:
import sys
import os
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
f1='Norway.csv'
f='Canada.csv'
f='Chile.csv'
Norway = pd.read_csv(Norway.csv)
Canada = pd.read_csv(Canada.csv)
Chile = pd.read_csv(Chile.csv )
I need to read multiple files into different dataframes. It works fine when I do it with one file, like
file = 'Norway.csv'
Norway = pd.read_csv(file)
And I am getting the error:
NameError: name 'norway' is not defined
You can read all the .csv files into one single dataframe:
import glob
import pandas as pd

all_files = glob.glob('*.csv')  # adjust the pattern to your files
list_ = []
for file_ in all_files:
    df = pd.read_csv(file_, index_col=None, header=0)
    list_.append(df)
# concatenate all dfs into one
big_df = pd.concat(list_, ignore_index=True)
and then split the large dataframe into multiple ones (in your case 7). For example:
import numpy as np

num_chunks = 3
df1, df2, df3 = np.array_split(big_df, num_chunks)
Hope this helps.
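Note that np.array_split, unlike np.split, also accepts a length that is not evenly divisible by num_chunks; the pieces then differ in size by at most one row. A quick check:

# Print the shape of each chunk to verify the split:
for part in np.array_split(big_df, num_chunks):
    print(part.shape)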
After googling for a while looking for an answer, I decided to combine answers from different questions into a solution to this question. This solution will not work for all possible cases; you will have to tweak it to fit yours.
Check out the solution to this question:
# import libraries
import pandas as pd
import numpy as np
import glob
import os
# Declare a function for extracting a string between two characters
def find_between(s, first, last):
    try:
        start = s.index(first) + len(first)
        end = s.index(last, start)
        return s[start:end]
    except ValueError:
        return ""
path = '/path/to/folder/containing/your/data/sets' # use your path
all_files = glob.glob(path + "/*.csv")
list_of_dfs = [pd.read_csv(filename, encoding = "ISO-8859-1") for filename in all_files]
list_of_filenames = [find_between(filename, 'sets/', '.csv') for filename in all_files] # sets is the last word in your path
# Create a dictionary with table names as the keys and data frames as the values
dfnames_and_dfvalues = dict(zip(list_of_filenames, list_of_dfs))
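To work with one of the frames afterwards, look it up by the file name it was derived from; a short usage sketch (the key 'Norway' is just the example file from the earlier question):

# Fetch a single dataframe from the dictionary by name:
norway_df = dfnames_and_dfvalues['Norway']
print(norway_df.head())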
I am trying to use the below code to get posts with specific keywords from my CSV file, but I keep getting KeyError: 'Tags1'.
import re
import string
import pandas as pd
import openpyxl
import glob
import csv
import os
import xlsxwriter
import numpy as np
keywords = {"agile", "backlog"}  # all your keywords
df = pd.read_csv(r"C:\Users\ferr1982\Desktop\split1_out.csv",
                 error_bad_lines=False)  # , sep=",", encoding="utf-8"
output = pd.DataFrame(columns=df.columns)
for i in range(len(df.index)):
    # if (df.loc[df['Tags'].isin(keywords)]):
    if any(x in (df['Tags1'][i], df['Tags2'][i], df['Tags3'][i],
                 df['Tags4'][i], df['Tags5'][i]) for x in keywords):
        output.loc[len(output)] = [df[j][i] for j in df.columns]
output.to_csv("new_data5.csv", index=False)
Okay, it turned out that there is a small space before the "Tags" column names in my CSV file!
It works now, after I added the space to the names in the code above.
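Rather than hard-coding the stray space into the column names, you can normalize the headers right after loading; a minimal sketch using pandas' string accessor on the column index:

df = pd.read_csv(r"C:\Users\ferr1982\Desktop\split1_out.csv", error_bad_lines=False)
df.columns = df.columns.str.strip()  # drop leading/trailing whitespace from every header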
I imported a file from SPSS (a .sav file); however, the titles of my columns appear as integers instead of strings. Is there a way to fix this? Below is the code I used. I would appreciate any help!
import fnmatch
import sys  # import sys
import os
import pandas as pd  # pandas importer
import savReaderWriter as spss  # to import file from SPSS
import io  # importing io
import codecs  # to resolve the UTF-8 unicode

with spss.SavReader('file_name.sav') as reader:  # Should I add "Np"?
    records = reader.all()
with codecs.open('file_name.sav', "r", encoding='utf-8',
                 errors='strict') as fdata:  # Not sure if the problem resides on this line
    df = pd.DataFrame(records)
df.head()
I am wondering whether there is a way to actually convert the titles from numbers to strings. The same thing has happened to me in Excel, but Excel has an easy fix for it.
Thanks in advance!
After you have created the DataFrame, you can use df.columns = df.columns.map(str) to change the column headers to strings.
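The integer headers appear because pd.DataFrame(records) assigns a default 0..n-1 column index; the real variable names live in the .sav header. A hedged sketch that pulls them out, assuming your savReaderWriter version exposes varNames on the reader:

with spss.SavReader('file_name.sav') as reader:
    # varNames may come back as bytes; decode them to plain strings
    var_names = [v.decode('utf-8') if isinstance(v, bytes) else v for v in reader.varNames]
    records = reader.all()
df = pd.DataFrame(records, columns=var_names)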
I use pandas to load a CSV file and want to print out the data of a row. Here is the original data:
[screenshot of the original data]
I want to print out the 'violence' data to make a bar chart, but it raises a KeyError. Here is my code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

c_data = pd.read_csv('crime.csv')
print(c_data.head())
print(c_data['violence'])
And the error:
[screenshot of the KeyError traceback]
I tried using capital VIOLENCE, print(c_data['VIOLENCE']), but it also failed:
[screenshot of the KeyError traceback]
Can someone tell me how to work it out?
Try the following if your data is small:
import csv

with open('crime.csv', 'r') as my_file:
    reader = csv.reader(my_file)
    rows = list(reader)
print(rows[3])
If your data is big, try this:
import csv
from itertools import islice

with open('crime.csv', 'r') as my_file:
    reader = csv.reader(my_file)
    print(next(islice(reader, 3, 4)))
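Since a KeyError here usually means the label does not match the parsed header exactly (capitalization or stray whitespace), it is worth printing the real column names first; a minimal sketch reusing the question's pandas setup:

import pandas as pd

c_data = pd.read_csv('crime.csv')
print(c_data.columns.tolist())  # shows the exact labels pandas parsed
# then index with whatever spelling the list shows, e.g. c_data['Violence']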