Dynamically create dataframes in python - python-3.x

Below is the code where I am reading two files and trying to create separate dataframes for both of them. I am trying to achieve this dynamically so that I can use these dataframes as required. Here is what I have done:
import pandas as pd

commanFilePath = '\Projects\Pandas\Csv_Files\\'
fileNametoImport = ['Employee.txt', 'Role.txt']
listofdf = []

# load each file into a data frame
for filename in fileNametoImport:
    fN, ext = filename.split('.')
    fN = 'df' + fN
    listofdf.append(fN)
    filewithpathname = commanFilePath + filename
    fN = pd.read_csv(filewithpathname, delimiter=',')
    print(fN)
print(listofdf)
What I want is that when I do print(listofdf[0]), I get my first dataframe, which would be dfEmployee.

Since you want listofdf[0] to be the first dataframe, i.e. a dataframe for Employee.txt, listofdf[0] should be the result of pd.read_csv('\path\to\Employee.txt', delimiter=','). The name dfEmployee doesn't really matter. So you need to append the output of pd.read_csv to listofdf:
import pandas as pd

commanFilePath = '\Projects\Pandas\Csv_Files\\'
fileNametoImport = ['Employee.txt', 'Role.txt']
listofdf = []

# load file to data frame
for filename in fileNametoImport:
    fN, ext = filename.split('.')
    fN = 'df' + fN                                      # <--- is this needed?
    listofdf.append(fN)                                 # <--- remove this
    filewithpathname = commanFilePath + filename
    fN = pd.read_csv(filewithpathname, delimiter=',')   # <--- this is the dataframe
    listofdf.append(fN)                                 # <--- append this

print(listofdf)  # <--- should contain two dataframes now

Perhaps something like:
from os.path import join

import pandas as pd

folder_path = "\Projects\Pandas\Csv_Files"
file_names = ["Employee.txt", "Role.txt"]

dfs = [
    pd.read_csv(join(folder_path, file_name), delimiter=",")
    for file_name in file_names
]
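If you also want to address each dataframe by a name such as dfEmployee, a dictionary keyed by a generated name is a simple alternative to creating variables dynamically. A minimal sketch, assuming the same folder and file names as above (note the raw string for the Windows path):

from os.path import join, splitext

import pandas as pd

folder_path = r"\Projects\Pandas\Csv_Files"  # raw string keeps the backslashes literal
file_names = ["Employee.txt", "Role.txt"]

# map a generated name like 'dfEmployee' to the dataframe read from that file
dfs_by_name = {
    "df" + splitext(name)[0]: pd.read_csv(join(folder_path, name), delimiter=",")
    for name in file_names
}

print(dfs_by_name["dfEmployee"])  # the Employee dataframe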

Related

Pandas - Add items to dataframe

I am trying to add row items to the dataframe, but I am not able to update it.
What I tried so far is commented out, as it doesn't do what I need.
I simply want to download the JSON file and store it in a dataframe with those given columns. It seems I am not able to extract the child components from the JSON file and store them in a brand new dataframe.
Please find my code below:
import requests, json, urllib
import pandas as pd

url = "https://www.cisa.gov/sites/default/files/feeds/known_exploited_vulnerabilities.json"
data = pd.read_json(url)

headers = []
df = pd.DataFrame()
for key, item in data['vulnerabilities'].items():
    for k in item.keys():
        headers.append(k)

col = list(set(headers))
new_df = pd.DataFrame(columns=col)

for item in data['vulnerabilities'].items():
    print(item[1])
    # new_df['product'] = item[1]['product']
    # new_df['vendorProject'] = item[1]['vendorProject']
    # new_df['dueDate'] = item[1]['dueDate']
    # new_df['shortDescription'] = item[1]['shortDescription']
    # new_df['dateAdded'] = item[1]['dateAdded']
    # new_df['vulnerabilityName'] = item[1]['vulnerabilityName']
    # new_df['cveID'] = item[1]['cveID']
    # new_df.append(item[1], ignore_index = True)
new_df
At the end my df is still blank.
The nested JSON data can be directly converted to a flattened dataframe using pd.json_normalize(). The headers are extracted from the JSON itself.
new_df = pd.DataFrame(pd.json_normalize(data['vulnerabilities']))
UPDATE: Unnested the vulnerabilities column specifically.
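A minimal end-to-end sketch of that approach (assuming the feed keeps the top-level 'vulnerabilities' key used in the question):

import pandas as pd

url = "https://www.cisa.gov/sites/default/files/feeds/known_exploited_vulnerabilities.json"
data = pd.read_json(url)

# each entry of 'vulnerabilities' is a dict of scalar fields,
# so json_normalize turns them straight into columns
new_df = pd.json_normalize(data['vulnerabilities'])
print(new_df.head())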
It worked with this:
import requests, json, urllib
import pandas as pd

url = "https://www.cisa.gov/sites/default/files/feeds/known_exploited_vulnerabilities.json"
data = pd.read_json(url)

headers = []
df = pd.DataFrame()
for key, item in data['vulnerabilities'].items():
    for k in item.keys():
        headers.append(k)

col = list(set(headers))
new_df = pd.DataFrame(columns=col)

for item in data['vulnerabilities'].items():
    new_df.loc[len(new_df.index)] = item[1]  # <=== THIS
new_df.head()
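Appending with new_df.loc[len(new_df.index)] works, but it grows the dataframe one row at a time, which gets slow for a large feed. A sketch of a faster variant, building a list of row dicts and constructing the dataframe once (assuming the same 'vulnerabilities' structure):

import pandas as pd

url = "https://www.cisa.gov/sites/default/files/feeds/known_exploited_vulnerabilities.json"
data = pd.read_json(url)

# materialise the row dicts first, then build the dataframe in a single call
rows = list(data['vulnerabilities'])
new_df = pd.DataFrame(rows)
new_df.head()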

Reading multiple excel files into a pandas dataframe, but also storing the file name

I would like to read multiple excel files and store them into a single pandas dataframe, but I would like one of the columns in the dataframe to be the file name. This is because the file name contains the date (this is monthly data) and I need that information. I can't seem to get the filename, but I'm able to get the excel files into a dataframe. Please help.
import os
import pandas as pd
import fsspec

files = os.listdir("C://Users//6J2754897//Downloads//monthlydata")
paths = "C://Users//6J2754897//Downloads//monthlydata"
a = pd.DataFrame([2], index=None)
df = pd.DataFrame()
for file in range(len(files)):
    if files[file].endswith('.xlsx'):
        df = df.append(pd.read_excel(paths + "//" + files[file], sheet_name="information", skiprows=7), ignore_index=True)
        df['Month'] = str(files[file])
The order of operations here is incorrect. The line:
df['Month'] = str(files[file])
is going to overwrite the entire column with the most recent value.
Instead we should only add the value to the current DataFrame:
import os
import pandas as pd

paths = "C://Users//6J2754897//Downloads//monthlydata"
files = os.listdir(paths)

df = pd.DataFrame()
for file in range(len(files)):
    if files[file].endswith('.xlsx'):
        # Read in File
        file_df = pd.read_excel(paths + "//" + files[file],
                                sheet_name="information",
                                skiprows=7)
        # Add to just this DataFrame
        file_df['Month'] = str(files[file])
        # Update `df`
        df = df.append(file_df, ignore_index=True)
Alternatively we can use DataFrame.assign to chain the column assignment:
import os
import pandas as pd

paths = "C://Users//6J2754897//Downloads//monthlydata"
files = os.listdir(paths)

df = pd.DataFrame()
for file in range(len(files)):
    if files[file].endswith('.xlsx'):
        df = df.append(
            # Read in File
            pd.read_excel(paths + "//" + files[file],
                          sheet_name="information",
                          skiprows=7)
            .assign(Month=str(files[file])),  # Add to just this DataFrame
            ignore_index=True
        )
For general overall improvement we can use pd.concat with a list comprehension over the files. This avoids repeatedly growing the DataFrame (which can be extremely slow). Path.glob also helps with selecting the appropriate files:
from pathlib import Path

import pandas as pd

paths = "C://Users//6J2754897//Downloads//monthlydata"
df = pd.concat([
    pd.read_excel(file,
                  sheet_name="information",
                  skiprows=7)
    .assign(Month=file.stem)  # We may also want file.name here
    for file in Path(paths).glob('*.xlsx')
])
Some options for the Month column are:
file.stem will give "[t]he final path component, without its suffix": 'folder/folder/sample.xlsx' -> 'sample'
file.name will give "the final path component, excluding the drive and root": 'folder/folder/sample.xlsx' -> 'sample.xlsx'
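Since the file name carries the date, the Month column can afterwards be parsed into a real datetime. A sketch against the df built above, assuming a hypothetical file name pattern like 'report_2023-01.xlsx' (adjust the regex to the actual pattern):

import pandas as pd

# hypothetical: 'report_2023-01' -> Timestamp('2023-01-01')
df['Month'] = pd.to_datetime(
    df['Month'].str.extract(r'(\d{4}-\d{2})', expand=False),  # pull out the yyyy-mm part
    format='%Y-%m'
)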

import multiple Excel files to pandas and export to multiple Stata files

My raw Excel files are:
[excel_1.xlsx,excel_2.xlsx,...,excel_12.xlsx].
At first I wanted to import them into dataframes, append them into one big dataframe, and then call df.to_stata, but Python raised an error:
MemoryError
I guess the problem is that the appended dataframe is too big.
So I thought I could convert each Excel file into its own Stata file, which is:
[excel_1.xlsx,excel_2.xlsx,...,excel_12.xlsx]
to
[excel_1.dta,excel_2.dta,...,excel_12.dta]
and append them in Stata, but I don't know how to do that.
My original code was
import pandas as pd

IO = 'excel_1.xlsx'
df = pd.read_excel(io=IO, skiprows=[1, 2],
                   dtype={"Opnprc": "str", "Hiprc": "str", "Loprc": "str", "Clsprc": "str",
                          "Dnshrtrd": "str", "Dnvaltrd": "str", "Dsmvosd": "str",
                          "Dsmvtll": "str", "Dretwd": "str", "Dretnd": "str",
                          "Adjprcwd": "str", "Adjprcnd": "str", "Markettype": "str",
                          "Trdsta": "str"})
df.to_stata('excel1.dta')
I guess a for loop should work, but I don't know how to write it.
(The append code was:
import os
import pandas as pd

cwd = os.path.abspath('D:\\onedrive\\test2')
files = os.listdir(cwd)
print(files)

df = pd.DataFrame()
for file in files:
    if file.endswith('.xlsx'):
        df = df.append(pd.read_excel(file, skiprows=[1, 2],
                       dtype={"Opnprc": "str", "Hiprc": "str", "Loprc": "str", "Clsprc": "str",
                              "Dnshrtrd": "str", "Dnvaltrd": "str", "Dsmvosd": "str",
                              "Dsmvtll": "str", "Dretwd": "str", "Dretnd": "str",
                              "Adjprcwd": "str", "Adjprcnd": "str", "Markettype": "str",
                              "Trdsta": "str"}), ignore_index=True)
df.head()
df.to_stata('test.dta')
Here is how to transform each Excel file into a Stata file using a for loop in Python 3.
import pandas as pd

IO = 'excel_{}.xlsx'
num_files = 12

for i in range(1, num_files + 1):
    df = pd.read_excel(
        io=IO.format(i),
        skiprows=[1, 2],
        dtype={"Opnprc": "str", "Hiprc": "str", "Loprc": "str", "Clsprc": "str",
               "Dnshrtrd": "str", "Dnvaltrd": "str", "Dsmvosd": "str",
               "Dsmvtll": "str", "Dretwd": "str", "Dretnd": "str",
               "Adjprcwd": "str", "Adjprcnd": "str", "Markettype": "str",
               "Trdsta": "str"})
    df.to_stata('excel_{}.dta'.format(i))
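If the files do not all follow the strict excel_1 ... excel_12 numbering, a variant that discovers them with pathlib avoids hardcoding num_files. A sketch, assuming the workbooks sit in the current directory:

from pathlib import Path

import pandas as pd

# convert every matching workbook to a .dta file with the same stem
for xlsx in Path('.').glob('excel_*.xlsx'):
    # dtype=str reads every column as string, like the explicit dtype map above
    df = pd.read_excel(xlsx, skiprows=[1, 2], dtype=str)
    df.to_stata(xlsx.with_suffix('.dta'))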

How to merge big data of csv files column wise into a single csv file using Pandas?

I have lots of big CSV files, one per country, and I want to merge their columns into a single CSV file. Each file has 'Year' as its index, and all files cover the same years, so they have the same length. Below is an example of a Japan.csv file.
If anyone can help me, please let me know. Thank you!!
Try using:
import pandas as pd
import glob

l = []
path = 'path/to/directory/'
csvs = glob.glob(path + "/*.csv")
for i in csvs:
    df = pd.read_csv(i, index_col=None, header=0)
    l.append(df)
df = pd.concat(l, ignore_index=True)
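Note that ignore_index=True stacks the files on top of each other. Since the goal here is a column-wise merge keyed on 'Year', a sketch of that variant (assuming every file really does have a 'Year' column) concatenates along axis=1 instead:

import glob

import pandas as pd

path = 'path/to/directory/'
frames = [
    pd.read_csv(f, index_col='Year')  # align every file on its Year column
    for f in glob.glob(path + "/*.csv")
]
df = pd.concat(frames, axis=1)  # side by side: one block of columns per country
df.to_csv('merged.csv')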
This should work. It goes over each file name, reads it, and combines everything into one df. You can export this df to csv or do whatever else with it. Good luck.
import pandas as pd

def combine_csvs_into_one_df(names_of_files):
    one_big_df = pd.DataFrame()
    for file in names_of_files:
        try:
            content = pd.read_csv(file)
        except (FileNotFoundError, PermissionError):
            print(file, "could not be read")
            continue
        one_big_df = pd.concat([one_big_df, content])
        print(file, "added!")
    print("------")
    print("Finished")
    return one_big_df
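A quick usage sketch, collecting the file names with glob (the directory path is an assumption):

import glob

names = glob.glob('path/to/directory/*.csv')  # hypothetical location of the CSVs
big_df = combine_csvs_into_one_df(names)
big_df.to_csv('combined.csv', index=False)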

Create multiple Dataframe from XML based on Specific Value

I am trying to parse an XML file and save the results in a pandas dataframe. I have succeeded in saving the details in one specific dataframe. However, now I am trying to save the results in multiple dataframes based on one specific class value.
import pandas as pd
import xml.etree.ElementTree as ET
import os
from collections import defaultdict, OrderedDict

tree = ET.parse('PowerChange_76.xml')
root = tree.getroot()
df_list = []
for i, child in enumerate(root):
    for subchildren in child.findall('{raml20.xsd}header'):
        for subchildren in child.findall('{raml20.xsd}managedObject'):
            match_found = 0
            xml_class_name = subchildren.get('class')
            xml_dist_name = subchildren.get('distName')
            print(xml_class_name)
            df_dict = OrderedDict()
            for subchild in subchildren:
                header = subchild.attrib.get('name')
                df_dict['Class'] = xml_class_name
                df_dict['CellDN'] = xml_dist_name
                df_dict[header] = subchild.text
            df_list.append(df_dict)
df_cm = pd.DataFrame(df_list)
The expected result is the creation of multiple dataframes, one per 'class' value.
This was solved with the method below:
# etree, GetMOClass, GetCellDN and inputdfile are defined elsewhere in the original script
def ExtractMOParam(xmlfile2):
    tree2 = etree.parse(xmlfile2)
    root2 = tree2.getroot()
    df_list2 = []
    for i, child in enumerate(root2):
        for subchildren in (child.findall('{raml21.xsd}header') or child.findall('{raml20.xsd}header')):
            for subchildren in (child.findall('{raml21.xsd}managedObject') or child.findall('{raml20.xsd}managedObject')):
                xml_class_name2 = subchildren.get('class')
                xml_dist_name2 = subchildren.get('distName')
                if (xml_class_name2 in GetMOClass) and (xml_dist_name2 in GetCellDN):
                    for subchild in subchildren:
                        df_dict2 = OrderedDict()
                        header2 = subchild.attrib.get('name')
                        df_dict2['MOClass'] = xml_class_name2
                        df_dict2['CellDN'] = xml_dist_name2
                        df_dict2['Parameter'] = header2
                        df_dict2['CurrentValue'] = subchild.text
                        df_list2.append(df_dict2)
    return df_list2

ExtractDump = pd.DataFrame(ExtractMOParam(inputdfile))
d = dict(tuple(ExtractDump.groupby('MOClass')))
for key in d:
    d[key] = (d[key].reset_index()
                    .groupby(['CellDN', 'MOClass', 'Parameter'])['CurrentValue']
                    .aggregate('first')
                    .unstack())
    d[key].reset_index(level=0, inplace=True)
    d[key].reset_index(level=0, inplace=True)

writer = pd.ExcelWriter('ExtractedDump.xlsx', engine='xlsxwriter')
for tab_name, dframe in d.items():
    dframe.to_excel(writer, sheet_name=tab_name, index=False)
writer.save()
Hope this will help others as well.
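The key step in that answer is dict(tuple(ExtractDump.groupby('MOClass'))), which splits one dataframe into a dict of per-class dataframes. A minimal self-contained sketch of that pattern with toy data:

import pandas as pd

df = pd.DataFrame({
    'MOClass': ['A', 'A', 'B'],
    'Parameter': ['p1', 'p2', 'p1'],
    'CurrentValue': ['1', '2', '3'],
})

# groupby yields (key, sub-frame) pairs, so dict() gives one dataframe per class
per_class = dict(tuple(df.groupby('MOClass')))
print(per_class['A'])  # rows where MOClass == 'A'
print(per_class['B'])  # rows where MOClass == 'B'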
