How to merge big data of csv files column wise into a single csv file using Pandas? - python-3.x

I have many large CSV files, one per country, and I want to merge their columns into a single CSV file. Each file uses 'Year' as its index, and all files have the same number of rows covering the same years. Below is an example of one such file, Japan.csv.
If anyone can help me please let me know. Thank you!!

Try using:
import glob
import os

import pandas as pd

path = 'path/to/directory/'
# os.path.join works whether or not `path` ends with a slash; sorting makes
# the resulting column order reproducible across runs.
csvs = sorted(glob.glob(os.path.join(path, "*.csv")))

# Read every file with 'Year' as its index so the frames are aligned on the
# year instead of being stacked on top of each other.
frames = {}
for i in csvs:
    country = os.path.splitext(os.path.basename(i))[0]  # e.g. 'Japan'
    frames[country] = pd.read_csv(i, index_col='Year', header=0)

# axis=1 merges column-wise. The dict keys (file names) become the outer
# column level, so identically named columns from different files stay
# distinguishable.
df = pd.concat(frames, axis=1)

This should work. It goes over each file name, reads it and combines everything into one df. You can export this df to csv or do whatever with it. gl.
import pandas as pd
def combine_csvs_into_one_df(names_of_files):
    """Read each CSV in *names_of_files* and stack the rows into one DataFrame.

    Files that are missing or cannot be opened are reported and skipped.

    Args:
        names_of_files: Iterable of paths to the CSV files to combine.

    Returns:
        A DataFrame holding the rows of every readable file, in the order
        given; an empty DataFrame when no file could be read.
    """
    frames = []
    for file in names_of_files:
        try:
            content = pd.read_csv(file)
        except FileNotFoundError:
            print (file,"was not found")
            continue
        except PermissionError:
            print (file,"could not be opened (permission denied)")
            continue
        frames.append(content)
        print (file," added!")
        print ("------")
    print ("Finished")
    # Concatenate once at the end: growing a DataFrame inside the loop copies
    # all previously accumulated rows on every iteration.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

Related

Reading multiple excel files into a pandas dataframe, but also storing the file name

I would like to read multiple excel files and store them into a single pandas dataframe, but I would like one of the columns in the dataframe to be the file name. This is because the file name contains the date (this is monthly data) and I need that information. I can't seem to get the filename, but I'm able to get the excel files into a dataframe. Please help.
# Read every .xlsx file in the folder into one DataFrame and try to record
# the source file name in a 'Month' column.
import os
import pandas as pd
import fsspec
files = os.listdir("C://Users//6J2754897//Downloads//monthlydata")
paths = "C://Users//6J2754897//Downloads//monthlydata"
a = pd.DataFrame([2], index = None)
df = pd.DataFrame()
for file in range(len(files)):
    if files[file].endswith('.xlsx'):
        df = df.append(pd.read_excel(paths + "//" + files[file], sheet_name = "information", skiprows=7), ignore_index=True)
        # NOTE: this assigns to the whole 'Month' column of the accumulated
        # frame on every iteration, so after the loop every row carries the
        # name of the last file read.
        df['Month'] = str(files[file])
The order of operations here is incorrect. The line:
df['Month'] = str(files[file])
Is going to overwrite the entire column with the most recent value.
Instead we should only add the value to the current DataFrame:
import os
import pandas as pd
paths = "C://Users//6J2754897//Downloads//monthlydata"
files = os.listdir(paths)
df = pd.DataFrame()
for file in files:
    if file.endswith('.xlsx'):
        # Read in File
        file_df = pd.read_excel(paths + "//" + file,
                                sheet_name="information",
                                skiprows=7)
        # Add to just this DataFrame
        file_df['Month'] = file
        # Update `df` (pd.concat, because DataFrame.append was removed in
        # pandas 2.0)
        df = pd.concat([df, file_df], ignore_index=True)
Alternatively we can use DataFrame.assign to chain the column assignment:
import os
import pandas as pd
paths = "C://Users//6J2754897//Downloads//monthlydata"
files = os.listdir(paths)
df = pd.DataFrame()
for file in files:
    if file.endswith('.xlsx'):
        # pd.concat is used because DataFrame.append was removed in pandas 2.0
        df = pd.concat(
            [
                df,
                # Read in File
                pd.read_excel(paths + "//" + file,
                              sheet_name="information",
                              skiprows=7)
                .assign(Month=file),  # Add to just this DataFrame
            ],
            ignore_index=True
        )
For general overall improvements we can use pd.concat with a list comprehension over files. This is done to avoid growing the DataFrame (which can be extremely slow). Pathlib.glob can also help with the ability to select the appropriate files:
from pathlib import Path
import pandas as pd
paths = "C://Users//6J2754897//Downloads//monthlydata"
# sorted() fixes the row order; glob yields files in arbitrary order.
df = pd.concat(
    [
        pd.read_excel(file,
                      sheet_name="information",
                      skiprows=7)
        .assign(Month=file.stem)  # We may also want file.name here
        for file in sorted(Path(paths).glob('*.xlsx'))
    ],
    ignore_index=True,  # one continuous row index, as in the loop-based versions
)
Some options for the Month Column are either:
file.stem will give "[t]he final path component, without its suffix".
'folder/folder/sample.xlsx' -> 'sample'
file.name will give "the final path component, excluding the drive and root".
'folder/folder/sample.xlsx' -> 'sample.xlsx'

How to edit columns in .CSV files using pandas

# Download a CSV file, drop rows 0-3 of the parsed frame, and save the result
# under a new name.
import urllib.request
import pandas as pd
# Url file Website
url = 'https://......CSV'
# Download file
urllib.request.urlretrieve(
url, "F:\.....A.CSV")
csvFilePath = "F:\.....A.CSV"
# Parse the downloaded file as tab-separated text.
df = pd.read_csv(csvFilePath, sep='\t')
rows=[0,1,2,3]
# NOTE: with inplace=True, drop() modifies df itself and returns None,
# so df2 is None here.
df2 = df.drop(rows, axis=0, inplace=True)
# Write the trimmed frame to the new file.
df.to_csv(
r'F:\....New_A.CSV')
I tried doing this in code but it's making columns merge into a single column.
What I'm going to do is remove the top row from the left as shown in the picture.
I found the problem: change sep='\t' to sep=','.
Replace:
df = pd.read_csv(csvFilePath, sep='\t')
by:
df = pd.read_csv(csvFilePath, sep='\t', skiprows=5)

Convert a pandas dataframe to tab separated list in Python

I have a dataframe like below:
import pandas as pd
data = {'Words':['actually','he','came','from','home','and','played'],
'Col2':['2','0','0','0','1','0','3']}
data = pd.DataFrame(data)
The dataframe looks like this:
I write this dataframe into the drive using below command:
np.savetxt('/folder/file.txt', data.values,fmt='%s', delimiter='\t')
And the next script reads it with below line of code:
data = load_file('/folder/file.txt')
Below is load_file function to read a text file.
def load_file(filename):
    """Return the lines of the UTF-8 text file *filename* as a list.

    Each element keeps its trailing newline, as produced by ``readlines``.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        data = f.readlines()
    return data
The data will be a tab separated list.
print(data)
gives me the following output:
['actually\t2\n', 'he\t0\n', 'came\t0\n', 'from\t0\n', 'home\t1\n', 'and\t0\n', 'played\t3\n']
I dont want to write the file to drive and then read it for processing. Instead I want to convert the dataframe to a tab separated list and process directly. How can I achieve this?
I checked for existing answers, but most just convert list to dataframe and not other way around.
Thanks in advance.
Try using .to_csv()
# to_csv() ends its output with a newline, so split('\n') would leave a
# trailing empty string in the list; splitlines() does not.
df_list = data.to_csv(header=None, index=False, sep='\t').splitlines()
df_list:
['actually\t2',
'he\t0',
'came\t0',
'from\t0',
'home\t1',
'and\t0',
'played\t3'
]
# keepends=True retains the line ending on every row, including the last one.
df_list = data.to_csv(header=None, index=False, sep='\t').splitlines(keepends=True)
df_list:
['actually\t2\n',
'he\t0\n',
'came\t0\n',
'from\t0\n',
'home\t1\n',
'and\t0\n',
'played\t3\n'
]
I think this achieves the same result without writing to the drive:
df_list = list(data.apply(lambda row: row['Words'] + '\t' + row['Col2'] + '\n', axis=1))
Try:
data.apply("\t".join, axis=1).tolist()

import multiple Excel files to pandas and export to multiple Stata files

My raw Excel files are:
[excel_1.xlsx,excel_2.xlsx,...,excel_12.xlsx].
At first I want to import them into dataframes and then append them into a big dataframe, then df.to_dta, but python shows error and said:
MemoryError
I guess the problem is that the appended dataframe is too big.
So I thought I could transform each Excel file to each Stata file, which is:
[excel_1.xlsx,excel_2.xlsx,...,excel_12.xlsx]
to
[excel_1.dta,excel_2.dta,...,excel_12.dta]
and append them in Stata, but I don't know how to do that.
My original code was
import pandas as pd
IO = 'excel_1.xlsx'
df = pd.read_excel(io=IO, skiprows = [1,2] ,
dtype={"Opnprc": "str","Hiprc": "str","Loprc": "str","Clsprc": "str","Dnshrtrd": "str","Dnvaltrd": "str","Dsmvosd": "str",
"Dsmvtll": "str","Dretwd": "str","Dretnd": "str","Adjprcwd": "str","Adjprcnd": "str","Markettype": "str",
"Trdsta": "str"})
df.to_stata('excel1.dta')
I guess a for loop should work, but I don't know how to do that.
The append code:
# Read every .xlsx file in `cwd` into one DataFrame and export it to Stata.
import os
import pandas as pd
cwd = os.path.abspath('D:\\onedrive\\test2')
files = os.listdir(cwd)
print(files)
# Columns that are read as strings.
str_columns = ["Opnprc", "Hiprc", "Loprc", "Clsprc", "Dnshrtrd", "Dnvaltrd", "Dsmvosd",
               "Dsmvtll", "Dretwd", "Dretnd", "Adjprcwd", "Adjprcnd", "Markettype",
               "Trdsta"]
dtype = {column: "str" for column in str_columns}
df = pd.DataFrame()
for file in files:
    if file.endswith('.xlsx'):
        # os.listdir returns bare file names, so join them with the directory;
        # otherwise the file is looked up in the process's working directory.
        sheet = pd.read_excel(os.path.join(cwd, file), skiprows=[1, 2], dtype=dtype)
        # pd.concat is used because DataFrame.append was removed in pandas 2.0
        df = pd.concat([df, sheet], ignore_index=True)
df.head()
df.to_stata('test.dta')
Here is how to transform each Excel file to a Stata file using a for loop in python3.
# Convert excel_1.xlsx ... excel_<num_files>.xlsx to Stata files one at a
# time, so only a single workbook is held in memory at once.
import pandas as pd
IO = 'excel_{}.xlsx'
num_files = 12
# Columns that are read as strings; built once because it is loop-invariant.
str_columns = ["Opnprc", "Hiprc", "Loprc", "Clsprc", "Dnshrtrd", "Dnvaltrd", "Dsmvosd",
               "Dsmvtll", "Dretwd", "Dretnd", "Adjprcwd", "Adjprcnd", "Markettype",
               "Trdsta"]
dtype = {column: "str" for column in str_columns}
for i in range(1, num_files + 1):
    df = pd.read_excel(
        io=IO.format(i),
        skiprows=[1, 2],
        dtype=dtype)
    df.to_stata('excel_{}.dta'.format(i))

update excel files via python

I want to read an Excel file, sort the rows, remove duplicate rows, and save the file again.
To do that, i have written this script:
# Load sheet 'data' of FILE_NAME.xlsx, sort and de-duplicate it, then print
# the workbook. Nothing in this script writes back to FILE_NAME.xlsx.
import pandas as pd
data = pd.ExcelFile('FILE_NAME.xlsx')
df = data.parse('data')
# NOTE: the sorted frame returned here is not assigned to anything.
df.sort_index()
# NOTE: with inplace=False the de-duplicated frame is returned and, likewise,
# discarded; df is unchanged.
df.drop_duplicates(subset = 'MAKAT', keep='first', inplace=False)
data.close()
print(pd.read_excel(data))
print('**** DONE ****')
in the result, I see the rows on the screen but the file stays with the duplicated rows.
My question is how to save these changes to the same file ?
Change the two lines as below:
# Both calls return new frames, so the results must be assigned back to df.
df = df.sort_index()
df = df.drop_duplicates(subset = 'MAKAT', keep='first').sort_values(by=['MAKAT'])
df.to_csv('outputfile.csv')

Resources