Python ingestion of csv files - python-3.x

I am trying to ingest daily CSV data into Python. I have a different file for each day, named as follows. I need help appending two columns whose values come from the file name: the first column should take the part before the '_' (e.g. board) and the second column should take the date part (e.g. 2019-08-08).
board_2019-08-08.csv
sign_2019-08-08.csv
Summary_2019-08-08.csv
Code:
import glob
import os
import pandas as pd

path = r"C:\xyz\Files\ETL\Dashboard"
all_files = glob.glob(os.path.join(path, "*.csv"))
for file in all_files:
    file_name = os.path.splitext(os.path.basename(file))[0]
    dfn = pd.read_csv(file, skiprows=17)
    dfn['Page'] = 'Dashboard'
    del dfn['Dimension']
    dfn = dfn.iloc[1:]
    dfn.columns = ['LoanId', 'Impressions', 'Page']

Try this:
path = r"C:\xyz\Files\ETL\Dashboard"
files = [f for f in os.listdir(path) if f.endswith('.csv')]
for file in files:
    pre, post = file.split("_")      # e.g. 'board', '2019-08-08.csv'
    post = post.split(".")[0]        # keep just the date part
    dfn = pd.read_csv(f"{path}/{file}", skiprows=17)
    # insert the two new columns at positions 0 and 1
    dfn.insert(0, "column1", value=pre)
    dfn.insert(1, "column2", value=post)
    # rest of your code
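A minimal sketch of the same idea folded back into the glob-based loop from the question, assuming the imports shown above and that every file name follows the name_YYYY-MM-DD.csv pattern; Source, Date and combined are illustrative names, not part of the original code:
all_files = glob.glob(os.path.join(path, "*.csv"))
frames = []
for file in all_files:
    file_name = os.path.splitext(os.path.basename(file))[0]   # e.g. 'board_2019-08-08'
    source, date = file_name.split("_", 1)                    # 'board', '2019-08-08'
    dfn = pd.read_csv(file, skiprows=17)
    dfn.insert(0, "Source", source)
    dfn.insert(1, "Date", date)
    frames.append(dfn)
combined = pd.concat(frames, ignore_index=True)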

Related

Creating a csv file from python readlines()

I want to create a CSV file from a text file
text_file.txt
Friday,09071235462,08:42:48
Princely,08123456,08:46:45
My code to convert the file
# Convert to csv
import csv

for_csv_list = []
with open('./text_file.txt', "r") as file:
    lines = file.readlines()
    for line in lines:
        if line != "\n":
            for_csv_list.append(line.strip().split())

with open("the_csv_file.csv", "w") as convert_to_csv:
    writer = csv.writer(convert_to_csv)
    writer.writerows(for_csv_list)
Then I tried to open my converted CSV file
f = open("the_csv_file.csv")
csv_f = csv.reader(f)
for row in csv_f:
    print("this is row = ", row)
f.close()
The code returned
this is row = ['Friday,09071235462,08:42:48']
this is row = []
this is row = ['Princely,08123456,08:46:45']
this is row = []
How can I remove the empty lists, since my expected result should be:
this is row = ['Friday,09071235462,08:42:48']
this is row = ['Princely,08123456,08:46:45']
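The blank rows are most likely an artifact of newline translation on Windows; the csv module documentation recommends opening CSV files with newline=''. A minimal fix for the writing step, keeping the rest of the code unchanged:
with open("the_csv_file.csv", "w", newline="") as convert_to_csv:
    writer = csv.writer(convert_to_csv)
    writer.writerows(for_csv_list)
Opening the file for reading with newline='' (or skipping empty rows with a simple truth check, if row:) would also hide the blank lines, but fixing the writer removes them at the source.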

Write list of elements to csv file separated by space

I have a PDF file and am extracting a table from it. I cannot share the data, so I created a sample of how I want things to work.
I have a list
l = ['a', 'bd', 'c f', 'e']
I want a CSV file like:
col1 col2
a
bd
c f
e
This seems to work:
import csv

myheaders = ['col1', 'col2']
mydata = [['a'], ['bd'], ['c', 'f'], ['e']]
with open('test.csv', 'w') as target:
    writer = csv.writer(target, delimiter=' ')
    writer.writerow(myheaders)
    writer.writerows(mydata)
Another option I found is to convert the list into a DataFrame:
df = pd.DataFrame(l)
then split it
df[0].str.split(" ", n = 1, expand = True)
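A short sketch of that DataFrame route end to end, assuming pandas is imported and l holds the sample strings above; the column names are taken from the desired output, and test.csv is just an example target:
import pandas as pd

l = ['a', 'bd', 'c f', 'e']
df = pd.DataFrame(l)[0].str.split(" ", n=1, expand=True)
df.columns = ['col1', 'col2']
df.to_csv('test.csv', sep=' ', index=False)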

Storing output data in CSV using Python

I have extracted data from different Excel sheets spread across different folders. I have organized the folders numerically from 2015 to 2019 and each folder has twelve subfolders (from 1 to 12). Here's my code:
import os
from os import walk
import pandas as pd

path = r'C:\Users\Sarah\Desktop\IOMTest'
my_files = []
for (dirpath, dirnames, filenames) in walk(path):
    my_files.extend([os.path.join(dirpath, fname) for fname in filenames])

all_sheets = []
for file_name in my_files:
    # display sheet names using pandas
    pd.set_option('display.width', 300)
    mosul_file = file_name
    xl = pd.ExcelFile(mosul_file)
    mosul_df = xl.parse(0, header=[1], index_col=[0, 1, 2])
    # read Excel and select columns
    mosul_file = pd.read_excel(file_name, sheet_name=0,
                               index_col=None, na_values=['NA'],
                               usecols="A, E, G, H, L, M")
    # remove NaN values
    data_mosul_df = mosul_file.apply(pd.to_numeric, errors='coerce')
    data_mosul_df = mosul_file.dropna()
    print(data_mosul_df)
Then I saved the extracted columns to a CSV file:
def save_frames(frames, output_path):
    for frame in frames:
        frame.to_csv(output_path, mode='a+', header=False)

if __name__ == '__main__':
    frames = [pd.DataFrame(data_mosul_df)]
    save_frames(frames, r'C:\Users\Sarah\Desktop\tt\c.csv')
My problem is that when I open the CSV file it seems that it doesn't store all the data, but only the last Excel sheet it has read, or sometimes the last two Excel sheets. However, when I print my data inside the console (in Spyder), I see that all the data is processed:
data_mosul_df = mosul_file.apply(pd.to_numeric, errors='coerce')
data_mosul_df = mosul_file.dropna()
print(data_mosul_df)
The picture below shows the CSV that was created. I am wondering if it is because the information from column A to column E is the same, and that is why it overwrites?
I would like to know how to modify the code so that it extracts and stores the data chronologically from the folders (2015 to 2019), taking into account the subfolders (1 to 12) in each folder, and how to create a CSV that stores all the data. Thank you.
Rewrite your loop:
for file_name in my_files:
    # display sheet names using pandas
    pd.set_option('display.width', 300)
    mosul_file = file_name
    xl = pd.ExcelFile(mosul_file)
    mosul_df = xl.parse(0, header=[1], index_col=[0, 1, 2])
    # read Excel and select columns
    mosul_file = pd.read_excel(file_name, sheet_name=0,
                               index_col=None, na_values=['NA'],
                               usecols="A, E, G, H, L, M")
    # coerce to numeric, then remove NaN values
    data_mosul_df = mosul_file.apply(pd.to_numeric, errors='coerce')
    data_mosul_df = data_mosul_df.dropna()
    # collect every file's DataFrame instead of overwriting it
    all_sheets.append(data_mosul_df)
Rewrite your save_frames:
def save_frames(frames, output_path):
    frames.to_csv(output_path, mode='a+', header=False)
Rewrite your main:
if __name__ == '__main__':
    frames = pd.concat(all_sheets)
    save_frames(frames, r'C:\Users\Sarah\Desktop\tt\c.csv')
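The chronological part of the question is not covered above: os.walk makes no ordering guarantee, and plain string sorting would put subfolder 10 before 2. A sketch of one way to sort my_files before the loop, assuming the 2015-2019 year folders and 1-12 subfolders appear as numeric components in each path:
import re

def numeric_key(path):
    # zero-pad numeric path components so that '2' sorts before '10'
    # while non-numeric parts keep their normal string order
    return [part.zfill(4) if part.isdigit() else part
            for part in re.split(r'[\\/]', path)]

my_files = sorted(my_files, key=numeric_key)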

How to read CSV files from a folder: read the first 10 files and save them in one CSV, read the next 10 files and save them in another CSV, and so on

path = r'D:/PYTHON/My codes/Frequency envelopes/test'
all_files = glob.glob(os.path.join(path, "*.csv"))
names = [os.path.basename(x) for x in all_files]
df = pd.DataFrame()
for file_, name in zip(all_files, names):
    file_df = pd.read_csv(file_, index_col=False)
    file_df['file_name'] = name
    df = df.append(file_df)
This piece of code works for reading all the files from the directory and saving them in one file. I want to read the first 10 files from the directory and save them in one CSV file, then the next 10 files from the same directory in another CSV, and so on until all the CSV files in the directory have been processed. If anyone has a solution, please mention it here.
IIUC, split your lists into lists of sublists, each sublist containing 10 items:
names = [names[i:i + 10] for i in range(0, len(names), 10)]
all_files = [all_files[i:i + 10] for i in range(0, len(all_files), 10)]
for i, (file, name) in enumerate(zip(all_files, names)):
    df = pd.DataFrame()
    for x, y in zip(file, name):
        file_df = pd.read_csv(x, index_col=False)
        file_df['file_name'] = y
        df = df.append(file_df)
    # one output file per batch of 10, e.g. batch_0.csv, batch_1.csv, ...
    df.to_csv(f'batch_{i}.csv')
Add nrows=10 to read only the first 10 lines of each file:
import pandas as pd

for file_, name in zip(all_files, names):
    file_df = pd.read_csv(file_, nrows=10)   # read only the first ten rows
    file_df['file_name'] = name
    df = df.append(file_df)
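For the original batching requirement, a compact alternative sketch starting from the flat all_files and names lists in the question, using pd.concat (DataFrame.append is deprecated in recent pandas); batch_0.csv, batch_1.csv, ... are illustrative output names:
for i in range(0, len(all_files), 10):
    chunk_files = all_files[i:i + 10]
    chunk_names = names[i:i + 10]
    # read one batch of up to 10 files, tagging each with its file name
    frames = [pd.read_csv(f, index_col=False).assign(file_name=n)
              for f, n in zip(chunk_files, chunk_names)]
    pd.concat(frames, ignore_index=True).to_csv(f'batch_{i // 10}.csv', index=False)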

How to apply this pandas.Series code to compare several files within a folder

I already have this code that finds all the csv files in a folder and reads them in:
directory = os.fsencode(folderpath)
os.chdir(directory)
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".csv"):
        df1 = pd.read_csv(filename)[columnName]
Now I have code that can find all the rows that are found in every single csv file that I input:
match = pd.Series(list(set(file1.columnName) & set(file2.columnName) & set(file3.columnName) & set(file4.columnName)))
How can I merge the two pieces of code above to find all the rows that are found in every single csv file within a folder and return the matches in a single pandas dataframe?
I think you can create a list of all the Series first and then dynamically find the matches with reduce:
# data from previous answer
vals = []
directory = os.fsencode(folderpath)
os.chdir(directory)
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".csv"):
        df1 = pd.read_csv(filename)['Name']
        vals.append(df1)

from functools import reduce
a = reduce(lambda x, y: set(x) & set(y), vals)
print(a)
{'Ben', 'Neil'}

df = pd.DataFrame({'col': list(a)})
print(df)
    col
0   Ben
1  Neil
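Equivalently, the intersection can be taken in one call; a small sketch assuming vals is the list of Series built above, with common as an illustrative name:
common = set.intersection(*map(set, vals))
df = pd.DataFrame({'col': sorted(common)})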
