Pandas - Add items to dataframe - python-3.x

I am trying to add row items to the dataframe, and I am not able to update the dataframe.
What i tried until now is commented out as it doesn't do what I need.
I simply want to download the json file and store it to a dataframe with those given columns. Seems I am not able to extract the child components fron JSON file and store them to a brand new dataframe.
Please find bellow my code:
import requests, json, urllib
import pandas as pd
url = "https://www.cisa.gov/sites/default/files/feeds/known_exploited_vulnerabilities.json"
data = pd.read_json(url)
headers = []
df = pd.DataFrame()
for key, item in data['vulnerabilities'].items():
for k in item.keys():
headers.append(k)
col = list(set(headers))
new_df = pd.DataFrame(columns=col)
for item in data['vulnerabilities'].items():
print(item[1])
# new_df['product'] = item[1]['product']
# new_df['vendorProject'] = item[1]['vendorProject']
# new_df['dueDate'] = item[1]['dueDate']
# new_df['shortDescription'] = item[1]['shortDescription']
# new_df['dateAdded'] = item[1]['dateAdded']
# new_df['vulnerabilityName'] = item[1]['vulnerabilityName']
# new_df['cveID'] = item[1]['cveID']
# new_df.append(item[1], ignore_index = True)
new_df
At the end my df is still blank.

The nested JSON data can be directly converted to a flattened dataframe using pd.json_normalize(). The headers are extracted from the JSON itself.
new_df = pd.DataFrame(pd.json_normalize(data['vulnerabilities']))
UPDATE: Unnested the vulnerabilities column specifically.
Output:

It worked with this:
import requests, json, urllib
import pandas as pd
url = "https://www.cisa.gov/sites/default/files/feeds/known_exploited_vulnerabilities.json"
data = pd.read_json(url)
headers = []
df = pd.DataFrame()
for key, item in data['vulnerabilities'].items():
for k in item.keys():
headers.append(k)
col = list(set(headers))
new_df = pd.DataFrame(columns=col)
for item in data['vulnerabilities'].items():
new_df.loc[len(new_df.index)] = item[1] <===THIS
new_df.head()

Related

Pass url column's values one by one to web crawler code in Python

Based on the answered code from this link, I'm able to create a new column: df['url'] = 'https://www.cspea.com.cn/list/c01/' + df['projectCode'].
Next step I would like to pass the url column's values to the following code and append all the scrapied contents as dataframe.
import urllib3
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = "https://www.cspea.com.cn/list/c01/gr2021bj1000186" # url column's values should be passed here one by one
soup = BeautifulSoup(requests.get(url, verify=False).content, "html.parser")
index, data = [], []
for th in soup.select(".project-detail-left th"):
h = th.get_text(strip=True)
t = th.find_next("td").get_text(strip=True)
index.append(h)
data.append(t)
df = pd.DataFrame(data, index=index, columns=["value"])
print(df)
How could I do that in Python? Thanks.
Updated:
import requests
from bs4 import BeautifulSoup
import pandas as pd
df = pd.read_excel('items_scraped.xlsx')
data = []
urls = df.url.tolist()
for url_link in urls:
url = url_link
# url = "https://www.cspea.com.cn/list/c01/gr2021bj1000186"
soup = BeautifulSoup(requests.get(url, verify=False).content, "html.parser")
index, data = [], []
for th in soup.select(".project-detail-left th"):
h = th.get_text(strip=True)
t = th.find_next("td").get_text(strip=True)
index.append(h)
data.append(t)
df = pd.DataFrame(data, index=index, columns=["value"])
df = df.T
df.reset_index(drop=True, inplace=True)
print(df)
df.to_excel('result.xlsx', index = False)
But it only saved one rows into excel file.
You need to combine the dfs generated in the loop. You could add them to a list and then call pd.concat on that list.
import requests
from bs4 import BeautifulSoup
import pandas as pd
df = pd.read_excel('items_scraped.xlsx')
# data = []
urls = df.url.tolist()
dfs = []
for url_link in urls:
url = url_link
# url = "https://www.cspea.com.cn/list/c01/gr2021bj1000186"
soup = BeautifulSoup(requests.get(url, verify=False).content, "html.parser")
index, data = [], []
for th in soup.select(".project-detail-left th"):
h = th.get_text(strip=True)
t = th.find_next("td").get_text(strip=True)
index.append(h)
data.append(t)
df = pd.DataFrame(data, index=index, columns=["value"])
df = df.T
df.reset_index(drop=True, inplace=True)
print(df)
dfs.append(df)
df = pd.concat(dfs)
df.to_excel('result.xlsx', index = False)
Use
urls = df.url.tolist()
To create a list of URLs and then iterate through them using f string to insert each one into your base url

Python - Creating a for loop to build a single csv file with multiple dataframes

I am new to python and trying various things to learn the fundamentals. One of the things that i'm currently stuck on is for loops. I have the following code and am positive it can be built out more efficiently using a loop but i'm not sure exactly how.
import pandas as pd
import numpy as np
url1 = 'https://www.cbssports.com/nfl/stats/player/receiving/nfl/regular/qualifiers/?page=1'
url2 = 'https://www.cbssports.com/nfl/stats/player/receiving/nfl/regular/qualifiers/?page=2'
url3 = 'https://www.cbssports.com/nfl/stats/player/receiving/nfl/regular/qualifiers/?page=3'
df1 = pd.read_html(url1)
df1[0].to_csv ('NFL_Receiving_Page1.csv', index=False) #index false gets rid of index listing that appears as the very first column in the csv
df2 = pd.read_html(url2)
df2[0].to_csv ('NFL_Receiving_Page2.csv', index=False) #index false gets rid of index listing that appears as the very first column in the csv
df3 = pd.read_html(url3)
df3[0].to_csv ('NFL_Receiving_Page3.csv', index=False) #index false gets rid of index listing that appears as the very first column in the csv
df_receiving_agg = pd.concat([df1[0], df2[0], df3[0]])
df_receiving_agg.to_csv('NFL_Receiving_Combined.csv', index=False) #index false gets rid of index listing that appears as the very first column in the csv
I'm ultimately trying to combine the data in the above URL's into a single table in a csv file.
You can try this:
urls = [url1,url2,url3]
df_receiving_agg = pd.DataFrame()
for url in urls:
df = pd.read_html(url)
df_receiving_agg = pd.concat([df_receiving_agg, df])
df_receiving_agg.to_csv('filepath.csv',index=False)
You can do this:
base_url = 'https://www.cbssports.com/nfl/stats/player/receiving/nfl/regular/qualifiers/?page='
dfs = []
for page in range(1, 4):
url = f'{base_url}{page}'
df = pd.read_html(url)
df.to_csv(f'NFL_Receiving_Page{page}.csv', index=False)
dfs.append(df)
df_receiving_agg = pd.concat(dfs)
df_receiving_agg.to_csv('NFL_Receiving_Combined.csv', index=False)

appending Dict to nested list per request made

I am currently scraping through an XML API response. I am looking to gather a piece of information for each request and create a dictionary each time I find this piece of data. Each request can have several IDs. So one response can have 2 IDs while the next response might have 3 IDs. For example, let's say the first response has 2 IDs. I am storing this data in a list at the moment when the second request is done the additional 3 IDs are being stored under this same list as well.
import requests
import pandas as pd
from pandas import DataFrame
from bs4 import BeautifulSoup
import datetime as datetime
import json
import time
trackingDomain = ''
domain = ''
aIDs = []
cIDs = []
url = "https://" + domain + ""
print(url)
df = pd.read_csv('campids.csv')
for index, row in df.iterrows():
payload = {'api_key':'',
'campaign_id':'0',
'site_offer_id':row['IDs'],
'source_affiliate_id':'0',
'channel_id':'0',
'account_status_id':'0',
'media_type_id':'0',
'start_at_row':'0',
'row_limit':'0',
'sort_field':'campaign_id',
'sort_descending':'TRUE'
}
print('Campaign Payload', payload)
r = requests.get(url, params=payload)
print(r.status_code)
soup = BeautifulSoup(r.text, 'lxml')
success = soup.find('success').string
for affIDs in soup.select('campaign'):
affID = affIDs.find('source_affiliate_id').string
aIDs.append(affID)
dataDict = dict()
dataDict['offers'] = []
affDict = {'affliate_id':aIDs}
dataDict['offers'].append(dict(affDict))
The result ends up being as follows:
dictData = {'offers': [{'affliate_id': ['9','2','45','47','14','8','30','30','2','2','9','2']}]}
What I am looking to do is this:
dictData = {'offers':[{'affiliate_id'['9','2','45','47','14','8','30','30','2','2']},{'affiliate_id':['9','2']}]}
On the first request, I obtain the following:
IDs['9','2','45','47','14','8','30','30','2','2']
On the second request these IDs are returned:
['9','2']
I am new to Python so please bear with me as far etiquette goes and I am missing something. I'll be happy to provide any additional information.
It has to do with the order of your initializing and appending that is causing you to not get the outcome you are wanting. You are overwriting your dataDict after each iteration, and inserting the appended list which is not overwritten, thus leaving you with a final list that has appended ALL aIDs. What you want to to do is initialise that dataDict out side of your for loop, and then you can append the dictionary in the nested loop into that list:
Note: It's tough to work out/test without having the actual data, but I believe this should do it if I worked out the logic correctly in my head:
import requests
import pandas as pd
from pandas import DataFrame
from bs4 import BeautifulSoup
import datetime as datetime
import json
import time
trackingDomain = ''
domain = ''
cIDs = []
url = "https://" + domain + ""
# Initialize your dictionary
dataDict = dict()
# Initialize your list in your dictionary under key `offers`
dataDict['offers'] = []
print(url)
df = pd.read_csv('campids.csv')
for index, row in df.iterrows():
payload = {'api_key':'',
'campaign_id':'0',
'site_offer_id':row['IDs'],
'source_affiliate_id':'0',
'channel_id':'0',
'account_status_id':'0',
'media_type_id':'0',
'start_at_row':'0',
'row_limit':'0',
'sort_field':'campaign_id',
'sort_descending':'TRUE'
}
print('Campaign Payload', payload)
r = requests.get(url, params=payload)
print(r.status_code)
soup = BeautifulSoup(r.text, 'lxml')
success = soup.find('success').string
# Initialize your list for this iteration/row in your df.iterrows
aIDs = []
for affIDs in soup.select('campaign'):
affID = affIDs.find('source_affiliate_id').string
# Append those affIDs to the aIDs list
aIDs.append(affID)
# Create your dictionary of key:value with key 'affiliate_id' and value the aIDs list
affDict = {'affliate_id':aIDs}
# NOW append that into your list in your dictionary under key `offers`
dataDict['offers'].append(dict(affDict))

how to fix the indexing error and to scrape the data from a webpage

I want to scrape data from a webpage from a wayback machine using pandas. I used string split to split some string if its present.
the URL for the webpage is this
Here is my code:
import pandas as pd
url = "https://web.archive.org/web/20140528015357/http://eciresults.nic.in/statewiseS26.htm"
dfs = pd.read_html(url)
df = dfs[0]
idx = df[df[0] == '\xa0Next >>'].index[0]
# Error mentioned in comment happens on the above line.
cols = list(df.iloc[idx-1,:])
df.columns = cols
df = df[df['Const. No.'].notnull()]
df = df.loc[df['Const. No.'].str.isdigit()].reset_index(drop=True)
df = df.dropna(axis=1,how='all')
df['Leading Candidate'] = df['Leading Candidate'].str.split('i',expand=True)[0]
df['Leading Party'] = df['Leading Party'].str.split('iCurrent',expand=True)[0]
df['Trailing Party'] = df['Trailing Party'].str.split('iCurrent',expand=True)[0]
df['Trailing Candidate'] = df['Trailing Candidate'].str.split('iAssembly',expand=True)[0]
df.to_csv('Chhattisgarh_cand.csv', index=False)
The expected output from that webpage must be in csv format like
You can use BeautifulSoup to extract the data. Panadas will help you to process the data in efficient way but its not ment for data extraction.
import pandas as pd
from bs4 import BeautifulSoup
import requests
response = requests.get('https://web.archive.org/web/20140528015357/http://eciresults.nic.in/statewiseS26.htm?st=S26')
soup = BeautifulSoup(response.text,'lxml')
table_data = []
required_table = [table for table in soup.find_all('table') if str(table).__contains__('Indian National Congress')]
if required_table:
for tr_tags in required_table[0].find_all('tr',{'style':'font-size:12px;'}):
td_data = []
for td_tags in tr_tags.find_all('td'):
td_data.append(td_tags.text.strip())
table_data.append(td_data)
df = pd.DataFrame(table_data[1:])
# print(df.head())
df.to_csv("DataExport.csv",index=False)
You can expect result like this in pandas dataframe,
0 1 ... 6 7
0 BILASPUR 5 ... 176436 Result Declared
1 DURG 7 ... 16848 Result Declared
2 JANJGIR-CHAMPA 3 ... 174961 Result Declared
3 KANKER 11 ... 35158 Result Declared
4 KORBA 4 ... 4265 Result Declared
The code below should get you the table on your url link ("Chhattisgarh Result Status") using a combination of BS and pandas; you can then save it as csv:
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
url = "https://web.archive.org/web/20140528015357/http://eciresults.nic.in/statewiseS26.htm?st=S26"
response = urllib.request.urlopen(url)
elect = response.read()
soup = BeautifulSoup(elect,"lxml")
res = soup.find_all('table')
df = pd.read_html(str(res[7]))
df[3]

Create multiple Dataframe from XML based on Specific Value

I am trying to parse an XML and save the results in Pandas Data-frame. I have succeeded in saving the details in one specific Data-frame. However now am trying to save the results in multiple data-frame based on one specific class value.
import pandas as pd
import xml.etree.ElementTree as ET
import os
from collections import defaultdict, OrderedDict
tree = ET.parse('PowerChange_76.xml')
root = tree.getroot()
df_list = []
for i, child in enumerate(root):
for subchildren in child.findall('{raml20.xsd}header'):
for subchildren in child.findall('{raml20.xsd}managedObject'):
match_found = 0
xml_class_name = subchildren.get('class')
xml_dist_name = subchildren.get('distName')
print(xml_class_name)
df_dict = OrderedDict()
for subchild in subchildren:
header = subchild.attrib.get('name')
df_dict['Class'] = xml_class_name
df_dict['CellDN'] = xml_dist_name
df_dict[header]=subchild.text
df_list.append(df_dict)
df_cm = pd.DataFrame(df_list)
Expected Result is creation of multiple data-frame based on number of 'class'.
Current Output:
XML File
This is being answered with below method:
def ExtractMOParam(xmlfile2):
tree2=etree.parse(xmlfile2)
root2=tree2.getroot()
df_list2=[]
for i, child in enumerate(root2):
for subchildren in (child.findall('{raml21.xsd}header') or child.findall('{raml20.xsd}header')):
for subchildren in (child.findall('{raml21.xsd}managedObject') or child.findall('{raml20.xsd}managedObject')):
xml_class_name2 = subchildren.get('class')
xml_dist_name2 = subchildren.get('distName')
if ((xml_class_name2 in GetMOClass) and (xml_dist_name2 in GetCellDN)):
#xml_dist_name2 = subchildren.get('distName')
#df_list1.append(xml_class_name1)
for subchild in subchildren:
df_dict2=OrderedDict()
header2=subchild.attrib.get('name')
df_dict2['MOClass']=xml_class_name2
df_dict2['CellDN']=xml_dist_name2
df_dict2['Parameter']=header2
df_dict2['CurrentValue']=subchild.text
df_list2.append(df_dict2)
return df_list2
ExtractDump=pd.DataFrame(ExtractMOParam(inputdfile))
d = dict(tuple(ExtractDump.groupby('MOClass')))
for key in d:
d[key]=d[key].reset_index().groupby(['CellDN','MOClass','Parameter'])['CurrentValue'].aggregate('first').unstack()
d[key].reset_index(level=0, inplace=True)
d[key].reset_index(level=0, inplace=True)
writer = pd.ExcelWriter('ExtractedDump.xlsx', engine='xlsxwriter')
for tab_name, dframe in d.items():
dframe.to_excel(writer, sheet_name=tab_name,index=False)
writer.save()
Hope this will help others as well.

Resources