Create multiple DataFrames from XML based on a specific value - python-3.x

I am trying to parse an XML file and save the results in a pandas DataFrame. I have succeeded in saving the details in one DataFrame. However, I am now trying to save the results in multiple DataFrames based on one specific class value.
import pandas as pd
import xml.etree.ElementTree as ET
import os
from collections import defaultdict, OrderedDict

tree = ET.parse('PowerChange_76.xml')
root = tree.getroot()
df_list = []
for i, child in enumerate(root):
    for subchildren in child.findall('{raml20.xsd}header'):
        for subchildren in child.findall('{raml20.xsd}managedObject'):
            match_found = 0
            xml_class_name = subchildren.get('class')
            xml_dist_name = subchildren.get('distName')
            print(xml_class_name)
            df_dict = OrderedDict()
            for subchild in subchildren:
                header = subchild.attrib.get('name')
                df_dict['Class'] = xml_class_name
                df_dict['CellDN'] = xml_dist_name
                df_dict[header] = subchild.text
            df_list.append(df_dict)
df_cm = pd.DataFrame(df_list)
The expected result is the creation of multiple DataFrames, one for each distinct 'class' value.

This was solved with the method below:
def ExtractMOParam(xmlfile2):
    # etree here is lxml's etree (or xml.etree.ElementTree imported as
    # etree); the import is not shown in the original post
    tree2 = etree.parse(xmlfile2)
    root2 = tree2.getroot()
    df_list2 = []
    for i, child in enumerate(root2):
        for subchildren in (child.findall('{raml21.xsd}header') or child.findall('{raml20.xsd}header')):
            for subchildren in (child.findall('{raml21.xsd}managedObject') or child.findall('{raml20.xsd}managedObject')):
                xml_class_name2 = subchildren.get('class')
                xml_dist_name2 = subchildren.get('distName')
                # GetMOClass and GetCellDN are lookup collections built earlier (not shown)
                if (xml_class_name2 in GetMOClass) and (xml_dist_name2 in GetCellDN):
                    for subchild in subchildren:
                        df_dict2 = OrderedDict()
                        header2 = subchild.attrib.get('name')
                        df_dict2['MOClass'] = xml_class_name2
                        df_dict2['CellDN'] = xml_dist_name2
                        df_dict2['Parameter'] = header2
                        df_dict2['CurrentValue'] = subchild.text
                        df_list2.append(df_dict2)
    return df_list2
ExtractDump = pd.DataFrame(ExtractMOParam(inputdfile))  # inputdfile: path defined earlier (not shown)
d = dict(tuple(ExtractDump.groupby('MOClass')))
for key in d:
    d[key] = (d[key].reset_index()
                    .groupby(['CellDN', 'MOClass', 'Parameter'])['CurrentValue']
                    .aggregate('first')
                    .unstack())
    d[key].reset_index(level=0, inplace=True)  # called twice to flatten
    d[key].reset_index(level=0, inplace=True)  # both remaining index levels
writer = pd.ExcelWriter('ExtractedDump.xlsx', engine='xlsxwriter')
for tab_name, dframe in d.items():
    dframe.to_excel(writer, sheet_name=tab_name, index=False)
writer.save()  # on pandas >= 1.5, use writer.close() instead
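If only the splitting step is needed, the core pattern above is groupby into a dict of per-class frames. A minimal, self-contained sketch with made-up sample rows (the class and parameter names here are illustrative, not from the real dump):

import pandas as pd

rows = [
    {'CellDN': 'cell-1', 'MOClass': 'WBTS', 'Parameter': 'power', 'CurrentValue': '40'},
    {'CellDN': 'cell-1', 'MOClass': 'WBTS', 'Parameter': 'tilt', 'CurrentValue': '2'},
    {'CellDN': 'cell-2', 'MOClass': 'WCEL', 'Parameter': 'power', 'CurrentValue': '37'},
]
dump = pd.DataFrame(rows)
per_class = dict(tuple(dump.groupby('MOClass')))  # one frame per class value
for mo_class, frame in per_class.items():
    # pivot each class frame so every parameter becomes a column
    wide = (frame.groupby(['CellDN', 'MOClass', 'Parameter'])['CurrentValue']
                 .first()
                 .unstack()
                 .reset_index())
    print(mo_class)
    print(wide)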
Hope this will help others as well.

Related

Pandas - Add items to dataframe

I am trying to add row items to the dataframe, and I am not able to update it.
What I tried until now is commented out, as it doesn't do what I need.
I simply want to download the JSON file and store it in a dataframe with those given columns. It seems I am not able to extract the child components from the JSON file and store them in a brand-new dataframe.
Please find my code below:
import requests, json, urllib
import pandas as pd

url = "https://www.cisa.gov/sites/default/files/feeds/known_exploited_vulnerabilities.json"
data = pd.read_json(url)
headers = []
df = pd.DataFrame()
for key, item in data['vulnerabilities'].items():
    for k in item.keys():
        headers.append(k)
col = list(set(headers))
new_df = pd.DataFrame(columns=col)
for item in data['vulnerabilities'].items():
    print(item[1])
    # new_df['product'] = item[1]['product']
    # new_df['vendorProject'] = item[1]['vendorProject']
    # new_df['dueDate'] = item[1]['dueDate']
    # new_df['shortDescription'] = item[1]['shortDescription']
    # new_df['dateAdded'] = item[1]['dateAdded']
    # new_df['vulnerabilityName'] = item[1]['vulnerabilityName']
    # new_df['cveID'] = item[1]['cveID']
    # new_df.append(item[1], ignore_index = True)
new_df
At the end my df is still blank.
The nested JSON data can be directly converted to a flattened dataframe using pd.json_normalize(). The headers are extracted from the JSON itself.
new_df = pd.DataFrame(pd.json_normalize(data['vulnerabilities']))
UPDATE: Unnested the vulnerabilities column specifically.
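As a quick illustration of what pd.json_normalize() does, here is a sketch with made-up records shaped loosely like entries in the CISA feed:

import pandas as pd

sample = [
    {'cveID': 'CVE-0000-0001', 'vendorProject': 'ExampleCorp', 'product': 'Widget'},
    {'cveID': 'CVE-0000-0002', 'vendorProject': 'ExampleCorp', 'product': 'Gadget'},
]
flat = pd.json_normalize(sample)  # one row per record, one column per key
print(flat)
# nested dicts become dotted column names, e.g. {'a': {'b': 1}} -> column 'a.b'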
It worked with this:
import requests, json, urllib
import pandas as pd

url = "https://www.cisa.gov/sites/default/files/feeds/known_exploited_vulnerabilities.json"
data = pd.read_json(url)
headers = []
df = pd.DataFrame()
for key, item in data['vulnerabilities'].items():
    for k in item.keys():
        headers.append(k)
col = list(set(headers))
new_df = pd.DataFrame(columns=col)
for item in data['vulnerabilities'].items():
    new_df.loc[len(new_df.index)] = item[1]  # <=== this is the line that adds the row
new_df.head()
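For completeness, the commented-out new_df.append(item[1], ignore_index=True) stayed blank because DataFrame.append returned a new frame that was never assigned back; the method has since been removed in pandas 2.x. The pd.concat equivalent, shown on a made-up row:

import pandas as pd

new_df = pd.DataFrame(columns=['cveID', 'product'])
row = {'cveID': 'CVE-0000-0001', 'product': 'Widget'}  # made-up example row
# concat returns a copy, so the result must be reassigned
new_df = pd.concat([new_df, pd.DataFrame([row])], ignore_index=True)
print(new_df)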

How to create a DataFrame from a list where each column is created by a regex expression

I have a list as such:
lst = ['2021_01_21__11_10_54_1__13928_snapshot.jpg',
       '2021_01_21__12_27_44_1__13934_snapshot.jpg',
       '2021_01_21__11_11_08_2__13928_snapshot.jpg',
       '2021_01_21__12_27_56_2__13934_snapshot.jpg',
       '2021_01_21__11_11_19_3__13928_snapshot.jpg',
       '2021_01_21__12_28_08_3__13934_snapshot.jpg']
I want to create a DataFrame so that each column will be represented by:
def by_number(path):
    base_name = os.path.basename(path)
    return re.findall(r'_{2}(\d{5})', base_name)
And the rows will be represented by:
def by_index(path):
    base_name = os.path.basename(path)
    return re.findall(r'_(\d)_{2}', base_name)
So eventually I'll get a DataFrame with one row per index digit and one column per five-digit ID. One way to build that:
name_list = ['2021_01_21__11_10_54_1__13928_snapshot.jpg',
             '2021_01_21__12_27_44_1__13934_snapshot.jpg',
             '2021_01_21__11_11_08_2__13928_snapshot.jpg',
             '2021_01_21__12_27_56_2__13934_snapshot.jpg',
             '2021_01_21__11_11_19_3__13928_snapshot.jpg',
             '2021_01_21__12_28_08_3__13934_snapshot.jpg']

import re
import pandas as pd

df = pd.DataFrame([[0]], columns=['count'])  # initialize dataframe
for name in name_list:
    count = re.search(r'_(\d)_{2}', name).group(1)
    col = re.search(r'_{2}(\d{5})', name).group(1)
    if (df['count'] == count).any():
        df.loc[df['count'] == count, col] = name
    else:
        new_row = pd.DataFrame([[count, name]], columns=['count', col])
        df = pd.concat([df, new_row])  # DataFrame.append was removed in pandas 2.x
df.set_index('count', inplace=True)
print(df)
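An alternative sketch under the same assumptions about the filename layout: extract both keys for every name first, then pivot, which avoids growing the frame row by row and skips the placeholder 0 row:

import re
import pandas as pd

records = []
for name in name_list:
    idx = re.search(r'_(\d)_{2}', name).group(1)    # row key: digit before the double underscore
    col = re.search(r'_{2}(\d{5})', name).group(1)  # column key: five-digit id
    records.append({'count': idx, 'col': col, 'name': name})
wide = pd.DataFrame(records).pivot(index='count', columns='col', values='name')
print(wide)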

Dynamically create dataframes in Python

Below is the code where I am reading two files and trying to create separate dataframes for both of them. I am trying to achieve this dynamically so that I can use these dataframes as required. Here is what I have done:
import pandas as pd

commanFilePath = '\Projects\Pandas\Csv_Files\\'
fileNametoImport = ['Employee.txt', 'Role.txt']
listofdf = []
# load file to data frame
for filename in fileNametoImport:
    fN, ext = filename.split('.')
    fN = 'df' + fN
    listofdf.append(fN)
    filewithpathname = commanFilePath + filename
    fN = pd.read_csv(filewithpathname, delimiter=',')
    print(fN)
print(listofdf)
I want print(listofdf[0]) to give me my first dataframe, which would be dfEmployee.
Since you want listofdf[0] to be the first dataframe i.e., a dataframe for Employee.txt, then listofdf[0] should be pd.read_csv('\path\to\Employee.txt', delimiter=','). The name dfEmployee doesn't really matter. So you need to append the output of pd.read_csv to listofdf.
import pandas as pd

commanFilePath = '\Projects\Pandas\Csv_Files\\'
fileNametoImport = ['Employee.txt', 'Role.txt']
listofdf = []
# load file to data frame
for filename in fileNametoImport:
    fN, ext = filename.split('.')
    fN = 'df' + fN            # <--- is this needed?
    listofdf.append(fN)       # <--- remove this
    filewithpathname = commanFilePath + filename
    fN = pd.read_csv(filewithpathname, delimiter=',')  # <---- this is the dataframe
    listofdf.append(fN)       # <--- append this
print(listofdf)  # <-- should contain two dataframes now
Perhaps something like:
from os.path import join
import pandas as pd

folder_path = "\Projects\Pandas\Csv_Files"
file_names = ["Employee.txt", "Role.txt"]
dfs = [
    pd.read_csv(join(folder_path, file_name), delimiter=",")
    for file_name in file_names
]
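If the real goal is to reach each frame by a name like dfEmployee rather than by position, a dict keyed on the derived name is the usual substitute for dynamically created variables. A sketch reusing the question's naming convention:

import os
import pandas as pd

folder_path = '\Projects\Pandas\Csv_Files\\'
file_names = ['Employee.txt', 'Role.txt']
dfs_by_name = {
    'df' + os.path.splitext(name)[0]: pd.read_csv(os.path.join(folder_path, name), delimiter=',')
    for name in file_names
}
# dfs_by_name['dfEmployee'] is then the Employee dataframe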

How to quickly apply a function to a list of DataFrames in Python?

I have a list of DataFrames with the same numbers of columns and rows but different values, such as
data = [df1, df2, df3, ..., dfn].
How can I apply a function to each dataframe in the list data? I used the following code, but it does not work:
import numpy as np

data = [df1, df2, df3, ..., dfn]  # the n dataframes, defined elsewhere

def maxloc(data):
    data['loc_max'] = np.zeros(len(data))
    for i in range(1, len(data) - 1):  # from the second value on
        if data['q_value'][i] >= data['q_value'][i-1] and data['q_value'][i] >= data['q_value'][i+1]:
            data['loc_max'][i] = 1
    return data

df_list = [df.pipe(maxloc) for df in data]
It seems to me the problem is in your maxloc() function, as this code works.
I also added the maximum value to the return of maxloc.
from random import randrange
import pandas as pd

def maxloc(data_frame):
    max_index = data_frame['Value'].idxmax(0)
    maximum = data_frame['Value'][max_index]
    return max_index, maximum

# create test list of data-frames
data = []
for i in range(5):
    temp = []
    for j in range(10):
        temp.append(randrange(100))
    df = pd.DataFrame({'Value': temp}, index=(range(10)))
    data.append(df)

df_list = [df.pipe(maxloc) for df in data]
for i, (index, value) in enumerate(df_list):
    print(f"Data-frame {i:02d}: maximum = {value} at position {index}")

Cannot get Pandas to concat/append

I'm trying to parse a website's tables and I'm still pretty new to this. For each link, only the second table/dataframe is appended to the spreadsheet. There are multiple links, and therefore it requires a loop. Using what little I could find, I am stuck with this, which I'm pretty sure is totally off:
import pandas as pd
from pandas import ExcelWriter

a = 1
alist = []
writer = ExcelWriter('name.xlsx')

def dffunc():
    dfs = pd.read_html('http://websitepath{}.htm'.format(a))
    df = dfs[1]
    alist.append(df,ignore_index=True)
    alist = pd.concat(df, axis=0)

while a < 9:
    dffunc()
    a += 1

alist.to_excel(writer, index=False)
writer.save()
df=dfs[1] takes the second table in the list. Is that what you want?
old:
df = dfs[1]
alist.append(df,ignore_index=True)
alist = pd.concat(df, axis=0)
You're appending the 2nd table in the dfs collection to the global alist.
You're then assigning the 2nd table in the dfs collection to alist, undoing all previous steps.
Operating on a global var that is written to file once at the end of the loop defeats the purpose of your loop; given the second point, alist will only ever hold the value of the 2nd table from the last query when you write to file.
new:
import pandas as pd
from pandas import ExcelWriter

writer = ExcelWriter('name.xlsx')
writer_kwargs = {'index': False}
A = 9

def dffunc(a):
    dfs = pd.read_html('http://websitepath{}.htm'.format(a))
    return pd.concat(dfs, axis=0)

def dfhandler(df, writer, **kwargs):
    # sheet names must be strings; a is the loop variable below
    df.to_excel(writer, sheet_name=str(a), **kwargs)

for a in range(1, A):  # the original used Python 2's xrange
    dfhandler(dffunc(a), writer, **writer_kwargs)

writer.save()
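If the intent was specifically the second table from every page stacked into one sheet (which the old code suggests), a minimal Python 3 sketch:

import pandas as pd

frames = [
    pd.read_html('http://websitepath{}.htm'.format(a))[1]  # 2nd table on each page
    for a in range(1, 9)
]
combined = pd.concat(frames, ignore_index=True)
combined.to_excel('name.xlsx', index=False)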
