import multiple Excel files to pandas and export to multiple Stata files - python-3.x

My raw Excel files are:
[excel_1.xlsx,excel_2.xlsx,...,excel_12.xlsx].
At first I wanted to import them into dataframes, append them into one big dataframe, and then call df.to_stata, but Python raised an error:
MemoryError
I guess the problem is that the appended dataframe is too big.
So I thought I could transform each Excel file to each Stata file, which is:
[excel_1.xlsx,excel_2.xlsx,...,excel_12.xlsx]
to
[excel_1.dta,excel_2.dta,...,excel_12.dta]
and append them in Stata, but I don't know how to do that.
My original code was
import pandas as pd
IO = 'excel_1.xlsx'
df = pd.read_excel(io=IO, skiprows = [1,2] ,
dtype={"Opnprc": "str","Hiprc": "str","Loprc": "str","Clsprc": "str","Dnshrtrd": "str","Dnvaltrd": "str","Dsmvosd": "str",
"Dsmvtll": "str","Dretwd": "str","Dretnd": "str","Adjprcwd": "str","Adjprcnd": "str","Markettype": "str",
"Trdsta": "str"})
df.to_stata('excel1.dta')
I guess a for loop should work, but I don't know how to do that.
(the append code:
import os
import pandas as pd
cwd = os.path.abspath('D:\\onedrive\\test2')
files = os.listdir(cwd)
print(files)
df = pd.DataFrame()
for file in files:
if file.endswith('.xlsx'):
df = df.append(pd.read_excel(file, skiprows = [1,2] ,
dtype={"Opnprc": "str","Hiprc": "str","Loprc": "str","Clsprc": "str","Dnshrtrd": "str","Dnvaltrd": "str","Dsmvosd": "str",
"Dsmvtll": "str","Dretwd": "str","Dretnd": "str","Adjprcwd": "str","Adjprcnd": "str","Markettype": "str",
"Trdsta": "str"}), ignore_index=True)
df.head()
df.to_stata('test.dta')

Here is how to transform each Excel file to a Stata file using a for loop in python3.
import pandas as pd
# Filename template for the workbooks excel_1.xlsx ... excel_12.xlsx.
IO = 'excel_{}.xlsx'
num_files = 12

# Columns that are read as text instead of letting pandas infer a dtype.
str_columns = [
    "Opnprc", "Hiprc", "Loprc", "Clsprc", "Dnshrtrd", "Dnvaltrd", "Dsmvosd",
    "Dsmvtll", "Dretwd", "Dretnd", "Adjprcwd", "Adjprcnd", "Markettype",
    "Trdsta",
]
# Build the dtype mapping once, outside the loop.
str_dtypes = dict.fromkeys(str_columns, "str")

# Convert the workbooks one by one instead of appending them all into a
# single big DataFrame (which is what triggered the MemoryError).
for i in range(1, num_files + 1):
    df = pd.read_excel(
        io=IO.format(i),
        skiprows=[1, 2],
        dtype=str_dtypes,
    )
    # Each workbook gets its own Stata file: excel_1.dta ... excel_12.dta.
    df.to_stata('excel_{}.dta'.format(i))

Related

Reading multiple excel files into a pandas dataframe, but also storing the file name

I would like to read multiple excel files and store them into a single pandas dataframe, but I would like one of the columns in the dataframe to be the file name. This is because the file name contains the date (this is monthly data) and I need that information. I can't seem to get the filename, but I'm able to get the excel files into a dataframe. Please help.
import os
import pandas as pd
import fsspec
files = os.listdir("C://Users//6J2754897//Downloads//monthlydata")
paths = "C://Users//6J2754897//Downloads//monthlydata"
a = pd.DataFrame([2], index = None)
df = pd.DataFrame()
for file in range(len(files)):
if files[file].endswith('.xlsx'):
df = df.append(pd.read_excel(paths + "//" + files[file], sheet_name = "information", skiprows=7), ignore_index=True)
df['Month'] = str(files[file])
The order of operations here is incorrect. The line:
df['Month'] = str(files[file])
Is going to overwrite the entire column with the most recent value.
Instead we should only add the value to the current DataFrame:
import os
import pandas as pd
paths = "C://Users//6J2754897//Downloads//monthlydata"
files = os.listdir(paths)
df = pd.DataFrame()
for file in files:
    if file.endswith('.xlsx'):
        # Read in File
        file_df = pd.read_excel(paths + "//" + file,
                                sheet_name="information",
                                skiprows=7)
        # Add to just this DataFrame
        file_df['Month'] = file
        # Update `df` (DataFrame.append was removed in pandas 2.0, so the
        # equivalent pd.concat call is used instead)
        df = pd.concat([df, file_df], ignore_index=True)
Alternatively we can use DataFrame.assign to chain the column assignment:
import os
import pandas as pd
paths = "C://Users//6J2754897//Downloads//monthlydata"
files = os.listdir(paths)
df = pd.DataFrame()
for file in files:
    if file.endswith('.xlsx'):
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent way to add the new rows to `df`.
        df = pd.concat(
            [
                df,
                # Read in File
                pd.read_excel(paths + "//" + file,
                              sheet_name="information",
                              skiprows=7)
                .assign(Month=file),  # Add to just this DataFrame
            ],
            ignore_index=True,
        )
For general overall improvements we can use pd.concat with a list comprehension over files. This is done to avoid growing the DataFrame (which can be extremely slow). Pathlib.glob can also help with the ability to select the appropriate files:
from pathlib import Path
import pandas as pd
paths = "C://Users//6J2754897//Downloads//monthlydata"
df = pd.concat(
    [
        pd.read_excel(file,
                      sheet_name="information",
                      skiprows=7)
        .assign(Month=file.stem)  # We may also want file.name here
        for file in Path(paths).glob('*.xlsx')
    ],
    # Renumber the rows 0..n-1 like the loop-based versions do; without
    # this every file's rows would keep their own 0-based labels.
    ignore_index=True,
)
Some options for the Month Column are either:
file.stem will give "[t]he final path component, without its suffix".
'folder/folder/sample.xlsx' -> 'sample'
file.name will give "the final path component, excluding the drive and root".
'folder/folder/sample.xlsx' -> 'sample.xlsx'

How to edit columns in .CSV files using pandas

import urllib.request
import pandas as pd
# Url file Website
url = 'https://......CSV'
# Download file
urllib.request.urlretrieve(
url, "F:\.....A.CSV")
csvFilePath = "F:\.....A.CSV"
df = pd.read_csv(csvFilePath, sep='\t')
rows=[0,1,2,3]
df2 = df.drop(rows, axis=0, inplace=True)
df.to_csv(
r'F:\....New_A.CSV')
I tried doing this in code but it's making columns merge into a single column.
What I'm going to do is remove the top row from the left as shown in the picture.
I found the problem: change sep='\t' to sep=','.
Replace:
df = pd.read_csv(csvFilePath, sep='\t')
by:
df = pd.read_csv(csvFilePath, sep='\t', skiprows=5)

Dynamically create dataframes in python

Below is the code where I am reading two files and trying to create a separate dataframe for each of them. I am trying to achieve this dynamically so that I can use these dataframes as required. Here is the code of what I have done.
import pandas as pd
commanFilePath = '\Projects\Pandas\Csv_Files\\'
fileNametoImport = ['Employee.txt','Role.txt']
listofdf =[]
# load file to data frame
for filename in fileNametoImport:
fN,ext = filename.split('.')
fN = 'df'+fN
listofdf.append(fN)
filewithpathname = commanFilePath + filename
fN = pd.read_csv(filewithpathname,delimiter=',')
print(fN)
print(listofdf)
I want when I do print(listofdf[0]) I should get my first dataframe which would be dfEmployee.
Since you want listofdf[0] to be the first dataframe i.e., a dataframe for Employee.txt, then listofdf[0] should be pd.read_csv('\path\to\Employee.txt', delimiter=','). The name dfEmployee doesn't really matter. So you need to append the output of pd.read_csv to listofdf.
import pandas as pd
commanFilePath = '\Projects\Pandas\Csv_Files\\'
fileNametoImport = ['Employee.txt','Role.txt']
listofdf =[]
# load file to data frame
for filename in fileNametoImport:
fN,ext = filename.split('.')
fN = 'df'+fN # <--- is this needed?
listofdf.append(fN) # <--- remove this
filewithpathname = commanFilePath + filename
fN = pd.read_csv(filewithpathname,delimiter=',') # <---- this is the dataframe
listofdf.append(fN) # <--- append this
print(listofdf) #<-- should contain two dataframes now
Perhaps something like:
from os.path import join
import pandas as pd
# Raw string: "\P" and "\C" are not valid escape sequences, so a plain
# string literal triggers an invalid-escape warning. The value is unchanged.
folder_path = r"\Projects\Pandas\Csv_Files"
file_names = ["Employee.txt", "Role.txt"]
# One DataFrame per file, in the same order as `file_names`
# (dfs[0] is Employee.txt, dfs[1] is Role.txt).
dfs = [
    pd.read_csv(join(folder_path, file_name), delimiter=",") for file_name in file_names
]

How to merge big data of csv files column wise into a single csv file using Pandas?

I have many large CSV files, one per country, and I want to merge their columns into a single CSV file. Each file has 'Year' as its index, and that index is the same in every file in both length and values. Below is an example of a Japan.csv file.
If anyone can help me please let me know. Thank you!!
Try using:
import pandas as pd
import glob
path = 'path/to/directory/'
csvs = glob.glob(path + "/*.csv")
# Read every CSV found in the directory, using the first line as the header.
frames = [pd.read_csv(csv_file, index_col=None, header=0) for csv_file in csvs]
# Stack the files on top of each other and renumber the rows.
# NOTE(review): this combines the files row-wise; for a column-wise merge
# keyed on 'Year', reading with index_col='Year' and concatenating with
# axis=1 is presumably what is wanted - confirm against the data.
df = pd.concat(frames, ignore_index=True)
This should work. It goes over each file name, reads it and combines everything into one df. You can export this df to csv or do whatever with it. gl.
import pandas as pd
def combine_csvs_into_one_df(names_of_files):
    """Read several CSV files and stack their rows into one DataFrame.

    Files that are missing or cannot be opened are reported on stdout and
    skipped, so one bad path does not abort the whole run.

    Args:
        names_of_files: Iterable of CSV file paths, read in the given order.

    Returns:
        A DataFrame holding the rows of every readable file, in input
        order, with each file keeping its own row labels. An empty
        DataFrame is returned when no file could be read.
    """
    frames = []
    for file in names_of_files:
        try:
            content = pd.read_csv(file)
        except FileNotFoundError:
            # A missing path raises FileNotFoundError, not PermissionError.
            print (file,"was not found")
            continue
        except PermissionError:
            print (file,"could not be opened (permission denied)")
            continue
        # Collect the frames and concatenate once at the end; concatenating
        # inside the loop re-copies all earlier rows on every iteration.
        frames.append(content)
        print (file," added!")
    print ("------")
    print ("Finished")
    if not frames:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame()
    return pd.concat(frames)

Cannot get Pandas to concat/append

I'm trying to parse a website's tables and I'm still pretty noobish. For each link only the second table/dataframe is appended to the spreadsheet. There are multiple links and therefore it requires a while loop. Using what little I could find online, I am stuck with this, which I'm pretty sure is totally off:
import pandas as pd
from pandas import ExcelWriter
a=1
alist = []
writer = ExcelWriter('name.xlsx')
def dffunc():
dfs = pd.read_html('http://websitepath{}.htm'.format(a))
df = dfs[1]
alist.append(df,ignore_index=True)
alist = pd.concat(df, axis=0)
while a<9:
dffunc()
a+=1
alist.to_excel(writer, index=False)
writer.save()
df=dfs[1] takes the second table in the list. Is that what you want?
old:
df = dfs[1]
alist.append(df,ignore_index=True)
alist = pd.concat(df, axis=0)
You're appending the 2nd table in the dfs collection to global alist
You're assigning the 2nd table in the dfs collection to alist, undoing all previous steps
Operating on a global var that is written to file once at the end of the loop defeats the purpose of your loop given second bullet; alist will only ever take on the value of the 2nd table in the last query when you write to file
new:
import pandas as pd
from pandas import ExcelWriter
writer_kwargs = {'index': False}
A = 9


def dffunc(a):
    """Fetch page number *a* and stack every table found on it.

    Returns a single DataFrame built by concatenating, row-wise, all the
    tables that pd.read_html finds at the page URL.
    """
    dfs = pd.read_html('http://websitepath{}.htm'.format(a))
    return pd.concat(dfs, axis=0)


def dfhandler(df, writer, **kwargs):
    """Write *df* through *writer*, forwarding **kwargs to DataFrame.to_excel.

    The sheet name is supplied by the caller via ``sheet_name=...`` instead
    of being read from a global loop variable.
    """
    df.to_excel(writer, **kwargs)


# The context manager saves and closes the workbook on exit;
# ExcelWriter.save() was removed in pandas 2.0.
with ExcelWriter('name.xlsx') as writer:
    # range, not the Python 2-only xrange; pages 1 .. A-1 get one sheet each.
    for a in range(1, A):
        # Sheet names must be strings, so convert the page number.
        dfhandler(dffunc(a), writer, sheet_name=str(a), **writer_kwargs)

Resources