The XML data (file.xml) I need to parse looks like this:
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Activity_Logs xsi:schemaLocation="http://www.cisco.com/PowerKEYDVB/Auditing DailyActivityLog.xsd" To="2018-04-01" From="2018-04-01" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns="http://www.cisco.com/PowerKEYDVB/Auditing">
  <ActivityRecord>
    <time>2015-09-16T04:13:20Z</time>
    <oper>Create_Product</oper>
    <pkgEid>10</pkgEid>
    <pkgName>BBCWRL</pkgName>
  </ActivityRecord>
  <ActivityRecord>
    <time>2015-09-16T04:13:20Z</time>
    <oper>Create_Product</oper>
    <pkgEid>18</pkgEid>
    <pkgName>CNNINT</pkgName>
  </ActivityRecord>
</Activity_Logs>
The following Python code parses the XML file above and converts it to CSV:
import csv
import xml.etree.cElementTree as ET

tree = ET.parse('file.xml')
root = tree.getroot()

data_to_csv = open('output.csv', 'w')
list_head = []
Csv_writer = csv.writer(data_to_csv)
count = 0
for elements in root.findall('ActivityRecord'):
    List_node = []
    if count == 0:
        time = elements.find('time').tag
        list_head.append(time)
        oper = elements.find('oper').tag
        list_head.append(oper)
        pkgEid = elements.find('pkgEid').tag
        list_head.append(pkgEid)
        pkgName = elements.find('pkgName').tag
        list_head.append(pkgName)
        Csv_writer.writerow(list_head)
        count = +1
    time = elements.find('time').text
    List_node.append(time)
    oper = elements.find('oper').text
    List_node.append(oper)
    pkgEid = elements.find('pkgEid').text
    List_node.append(pkgEid)
    pkgName = elements.find('pkgName').text
    List_node.append(pkgName)
    Csv_writer.writerow(List_node)
data_to_csv.close()
The code I am using is not giving me any data in the CSV. Could someone tell me where exactly I am going wrong?
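A likely cause, for anyone hitting the same symptom: the document declares a default namespace (xmlns="http://www.cisco.com/PowerKEYDVB/Auditing"), so root.findall('ActivityRecord') matches nothing and the loop body never runs. A minimal sketch of the namespace-aware lookup:

import xml.etree.ElementTree as ET

tree = ET.parse('file.xml')
root = tree.getroot()

# Map a prefix to the document's default namespace so findall can match.
ns = {'a': 'http://www.cisco.com/PowerKEYDVB/Auditing'}

# 'a:ActivityRecord' now resolves to '{http://...}ActivityRecord'.
for record in root.findall('a:ActivityRecord', ns):
    print(record.find('a:time', ns).text)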
Using Pandas, parsing all XML fields:
import xml.etree.ElementTree as ET
import pandas as pd
tree = ET.parse("file.xml")
root = tree.getroot()
get_range = lambda col: range(len(col))
l = [{r[i].tag:r[i].text for i in get_range(r)} for r in root]
df = pd.DataFrame.from_dict(l)
df.to_csv('file.csv')
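One caveat: because of the default namespace, each tag comes back as {http://www.cisco.com/PowerKEYDVB/Auditing}time, so the CSV headers carry the namespace URI. A small sketch (an addition to the above, same approach) that strips the prefix before building the dicts:

import xml.etree.ElementTree as ET
import pandas as pd

tree = ET.parse("file.xml")
root = tree.getroot()

# rpartition('}') drops the '{namespace}' prefix that ElementTree adds to tags.
l = [{child.tag.rpartition('}')[2]: child.text for child in record} for record in root]
df = pd.DataFrame.from_dict(l)
df.to_csv('file.csv', index=False)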
Using pandas and BeautifulSoup you can achieve your expected output easily:
Code:

import pandas as pd
import itertools
from bs4 import BeautifulSoup as b

with open("file.xml", "r") as f:  # opening xml file
    content = f.read()

soup = b(content, "lxml")

pkgeid = [values.text for values in soup.findAll("pkgeid")]
pkgname = [values.text for values in soup.findAll("pkgname")]
time = [values.text for values in soup.findAll("time")]
oper = [values.text for values in soup.findAll("oper")]

# For python-3.x use `zip_longest`
# For python-2.x use `izip_longest`
data = [item for item in itertools.zip_longest(time, oper, pkgeid, pkgname)]

df = pd.DataFrame(data=data)
df.to_csv("sample.csv", index=False, header=None)
Output in the `sample.csv` file will be as follows (the third record has no pkgEid/pkgName, so zip_longest leaves those fields empty):
2015-09-16T04:13:20Z,Create_Product,10,BBCWRL
2015-09-16T04:13:20Z,Create_Product,18,CNNINT
2018-04-01T03:30:28Z,Deactivate_Dhct,,
Use pyxmlparser if it is a one-time operation.
Disclaimer: I am the author of the library, and it is fairly new. Any feedback is appreciated. It is a command-line utility.
https://pypi.org/project/pyxmlparser/
Answer for 2021:
You can use Pandas (1.3+) to read XML and write CSV:
https://pandas.pydata.org/pandas-docs/dev/whatsnew/v1.3.0.html#read-and-write-xml-documents
import pandas as pd
df = pd.read_xml(<xml_or_xml_filepath>)
# ...
df.to_csv(<csv_filepath>)
For more details on usage, see the official documentation:
https://pandas.pydata.org/pandas-docs/dev/reference/api/pandas.read_xml.html
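Applied to the file in the question, a sketch (assuming pandas 1.3+; the namespaces mapping is needed because the document declares a default namespace):

import pandas as pd

# The prefix 'a' is arbitrary; it maps to the document's default namespace.
df = pd.read_xml(
    "file.xml",
    xpath="//a:ActivityRecord",
    namespaces={"a": "http://www.cisco.com/PowerKEYDVB/Auditing"},
)
df.to_csv("file.csv", index=False)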
Found the most appropriate way of doing this:

import glob

import pandas as pd
from bs4 import BeautifulSoup as b

files_xlm = glob.glob("*.xml")  # the XML files to convert

list1 = []
for each_file in files_xlm:
    with open(each_file, "r") as f:  # opening xml file
        content = f.read()
    soup = b(content, "lxml")
    for values in soup.findAll("activityrecord"):
        if values.find("time") is None:
            time = ""
        else:
            time = values.find("time").text
        if values.find("oper") is None:
            oper = ""
        else:
            oper = values.find("oper").text
        if values.find("pkgeid") is None:
            pkgeid = ""
        else:
            pkgeid = values.find("pkgeid").text
        if values.find("pkgname") is None:
            pkgname = ""
        else:
            pkgname = values.find("pkgname").text
        if values.find("dhct") is None:
            dhct = ""
        else:
            dhct = values.find("dhct").text
        if values.find("sourceid") is None:
            sourceid = ""
        else:
            sourceid = values.find("sourceid").text
        # Append the fields as a list so commas inside values cannot break rows apart.
        list1.append([time, oper, pkgeid, pkgname, dhct, sourceid])

df = pd.DataFrame(list1, columns=['Time', 'Oper', 'PkgEid', 'PkgName', 'dhct', 'sourceid'])
df.to_csv("new.csv", index=False)
I want to find a string that starts with "section_" and add it as the value of a keys attribute on the tag in the same line.
Example: the following is the input, in a file of type ditamap.
<topicref href="xyz/debug_logging_in_xyz-section_i_y_mn.dita"/>
<topicref href="xyz/workflows_id-section_exf_zaz_lo.dita"/>
<topicref href="xyz/images_id-section_ekl_bbz_lo.dita"/>
Desired output:
<topicref href="xyz/debug_logging_in_xyz-section_i_y_mn.dita" keys="section_i_y_mn"/>
<topicref href="xyz/workflows_id-section_exf_zaz_lo.dita" keys="section_exf_zaz_lo"/>
<topicref href="xyz/images_id-section_ekl_bbz_lo.dita" keys="section_ekl_bbz_lo"/>
I understand BeautifulSoup can be used to achieve this, but I am new to it and do not know the syntax. Can anyone help?
Here is the code I am trying to use:
import os
from bs4 import BeautifulSoup as bs

globpath = "C:/DATA"  # add your directory path here

def main(path):
    with open(path, encoding="utf-8") as f:
        s = f.read()
    s = bs(s, "xml")
    imgs = s.find_all("topicref")
    for i in imgs:
        if "section" in i["href"]:
            i["keys"] = i["href"].replace("*-", "").replace(".dita*", "")
    s = str(s)
    with open(path, "w", encoding="utf-8") as f:
        f.write(s)

for dirpath, directories, files in os.walk(globpath):
    for fname in files:
        if fname.endswith(".ditamap"):
            path = os.path.join(dirpath, fname)
            main(path)
But it's adding the entire href value to the keys attribute. I need only the portion that starts with "section" and ends before ".dita".
Regex worked. Here is the final code:

import os
import re
from bs4 import BeautifulSoup as bs

globpath = "C:/DATA"  # add your directory path here

def main(path):
    with open(path, encoding="utf-8") as f:
        s = f.read()
    s = bs(s, "xml")
    imgs = s.find_all("topicref")
    for i in imgs:
        if "section" in i["href"]:
            try:
                # Grab everything from "section" up to the ".dita" extension.
                i["keys"] = re.findall(r"section[^.]*", i["href"])[0]
            except IndexError:
                print("Could not replace")
    s = str(s)
    with open(path, "w", encoding="utf-8") as f:
        f.write(s)

for dirpath, directories, files in os.walk(globpath):
    for fname in files:
        if fname.endswith(".ditamap"):
            path = os.path.join(dirpath, fname)
            main(path)
I think it should be done with regex (that's the most I can do):

from bs4 import BeautifulSoup
import re

soup = BeautifulSoup('your-string-input-of-tags-goes-here', 'html.parser')
soup.find_all('topicref', {'keys': re.compile(r'(section_([^ "])+)')})

Returns a list of matched tags. Check whether this code works for you.
I'm trying to download and iterate over a CSV file, but I'm only reading the headers and no lines after them.
I tried using this answer, but with no luck.
This is my code:
from datetime import datetime
import requests
import csv
def main():
    print("python main function")
    datetime_object = datetime.now().date()
    url = f'https://markets.cboe.com/us/equities/market_statistics/volume_reports/day/{datetime_object}/csv/?mkt=bzx'
    print(url)
    response = requests.get(url, stream=True)
    csv_content = response.content.decode('utf-8')
    print(csv_content)
    cr = csv.reader(csv_content.splitlines(), delimiter='~')
    my_list = list(cr)
    for row in my_list:
        print(row)

if __name__ == '__main__':
    main()
Change

cr = csv.reader(csv_content.splitlines(), delimiter='~')

to

cr = csv.reader(csv_content.splitlines(), delimiter=',')

Also, check whether you downloaded the full file or only the header by opening the URL in a browser ;)
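If the delimiter is ever in doubt, the standard library's csv.Sniffer can detect it from a sample of the text (a side note, using a hypothetical sample string):

import csv

sample = "Name,Volume,Count\r\nBZX,100,2\r\n"  # hypothetical downloaded text

# Sniffer inspects the sample and guesses the dialect, including the delimiter.
dialect = csv.Sniffer().sniff(sample)
print(dialect.delimiter)  # ','

for row in csv.reader(sample.splitlines(), dialect):
    print(row)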
I have crawler code as follows:
import requests
import json
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from datetime import datetime
def crawl(id):
    try:
        url = 'https://www.china0001.com.cn/project/{0:06d}.html'.format(id)
        print(url)
        content = requests.get(url).text
        soup = BeautifulSoup(content, 'lxml')
        tbody = soup.find("table", attrs={"id": "mse_new"}).find("tbody", attrs={"class": "jg"})
        tr = tbody.find_all("tr")
        rows = []
        for i in tr[1:]:
            rows.append([j.text.strip() for j in i.findAll("td")])
        out = dict([map(str.strip, y.split(':')) for x in rows for y in x])
        return out
    except AttributeError:
        return False

data = list()
for id in range(699998, 700010):
    print(id)
    res = crawl(id)
    if res:
        data.append(res)

if len(data) > 0:
    df = pd.DataFrame(data)
    df.to_excel('test.xlsx', index=False)
In this code, the resulting dataframe df is written to an Excel file only after the whole scraping process has finished.
Now I want to save the scraping results one by one into an Excel or CSV file during the scraping process. How could I modify the code above?
Thanks.
Update:

from concurrent import futures

MAX_WORKERS = 30
ids = range(700000, 700050)
workers = min(MAX_WORKERS, len(ids))

with futures.ThreadPoolExecutor(workers) as executor:
    res = executor.map(crawl, sorted(ids))

data = list(res)
if len(data) > 0:
    df = pd.DataFrame(data)
    df.to_csv('test.csv', mode='a', header=True, index=False)
Try using to_csv with header=False, index=False.
Ex:

for id in range(699998, 700010):
    res = crawl(id)
    if res:
        df = pd.DataFrame([res])
        df.to_csv('test.csv', mode='a', header=False, index=False)
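With header=False the output file never gets column names. A small variation (an addition, assuming the same crawl function) writes the header only on the first append:

import os

for id in range(699998, 700010):
    res = crawl(id)
    if res:
        df = pd.DataFrame([res])
        # Write column names only when the file does not exist yet.
        df.to_csv('test.csv', mode='a', index=False,
                  header=not os.path.exists('test.csv'))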
I'd recommend looking at my question on here: What is the problem with the pandas to csv in my code?.
Look at the answers for the daily sheets, then apply and modify them to fit your program.
I am working on a web scraper for class. I basically have to compile all of the http links from a website and write them to a CSV. They also need to be de-duplicated, which is why I'm using a set. I have all the parts complete except that when it writes to the CSV, the entire set of links is written to a single row rather than one link per row. Can someone review my code and tell me what I'm missing? I cannot find a solution anywhere.
My code is below:
from bs4 import BeautifulSoup
import requests
import csv
import urllib.parse
base_url = 'https://www.census.gov'
l = set()
r = requests.get("https://www.census.gov/programs-surveys/popest.html")
c = r.content
soup = BeautifulSoup(c, 'html.parser')
file = open('c996webscraper_writer.csv', 'w', newline="")
for link in soup.findAll('a'):
    output = link.get('href')
    abs_url = urllib.parse.urljoin(base_url, output)
    l.add(abs_url)

with file:
    write = csv.writer(file, delimiter=',', lineterminator='\r')
    write.writerow(['List of Links'])
    write.writerows([l])

file.close()
This is a printout of what's happening:
[CSV screenshot: every link appears in a single row]
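The culprit is the final write.writerows([l]) call: writerows expects a list of rows, so wrapping the whole set in a one-element list emits it as a single row. Write each link as its own one-element row instead: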
from bs4 import BeautifulSoup
import requests
import csv
import urllib.parse

base_url = 'https://www.census.gov'
l = set()  # keep the set so links stay de-duplicated

r = requests.get("https://www.census.gov/programs-surveys/popest.html")
c = r.content
soup = BeautifulSoup(c, 'html.parser')

file = open('c996webscraper_writer.csv', 'w', newline="")

for link in soup.findAll('a'):
    output = link.get('href')
    abs_url = urllib.parse.urljoin(base_url, output)
    l.add(abs_url)

with file:
    write = csv.writer(file)
    write.writerow(['List of Links'])
    for x in l:
        write.writerow([x])  # one link per row
I need to scrape the job descriptions on the page () for every job title, like section (accounting), job title (staff accountant), and the description text under the title, into different columns of a CSV file, using Python's Beautiful Soup module.
I'm new to Beautiful Soup. I tried some ways of doing it, but it's not working. Can you please help with the code?
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

start = time.time()
url = ""
data = []
while True:
    resp = requests.get(url)
    soup = BeautifulSoup(resp.content, 'lxml')
    jobdesc = soup.find("li", {'class': 'col-xs-12 col-sm-4'})
    section = soup.find("h4")
    jd = {"jobdescription": jobdesc.text, "topic": section.text}
    data.append(jd)
df = pd.DataFrame(data)
df.to_csv("JD.csv")
Here is one way, leveraging :has in bs4 4.7.1+ to isolate the sections for looping over. zip_longest is used so we can join the section title onto each job.
import requests, csv
from bs4 import BeautifulSoup as bs
from itertools import zip_longest

r = requests.get('https://resources.workable.com/job-descriptions/#', headers={'User-Agent': 'Mozilla/5.0'})
soup = bs(r.content, 'lxml')

with open("data.csv", "w", encoding="utf-8-sig", newline='') as csv_file:
    w = csv.writer(csv_file, delimiter=",", quoting=csv.QUOTE_MINIMAL)
    w.writerow(['Section', 'Job Title'])
    for section in soup.select('section:has(.job)'):
        title = section.select_one('a').text.strip()
        jobs = [job.text for job in section.select('li a')]
        rows = list(zip_longest([title], jobs, fillvalue=title))
        for row in rows:
            w.writerow(row)
I had a 403 Forbidden error using the requests package, so I decided to use Selenium.
You can try this:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
from selenium import webdriver

url = "https://resources.workable.com/job-descriptions/#"
data = []
#resp = requests.get(url)
#soup = BeautifulSoup(resp.text, 'html.parser')
driver = webdriver.Firefox()
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
section = soup.find_all('section', {'class': 'box-white'})
for s in section:
    title = s.find('h4').text
    lis = soup.find_all("li", {'class': 'col-xs-12 col-sm-4'})
    for li in lis:
        jd = {"jobdescription": li.text, "topic": title}
        data.append(jd)
df = pd.DataFrame(data)
df.to_csv("JD.csv")
EDIT: To get the description for all jobs:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
from selenium import webdriver

url = "https://resources.workable.com/job-descriptions/#"
data = []
#resp = requests.get(url)
#soup = BeautifulSoup(resp.text, 'html.parser')
driver = webdriver.Firefox()
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
section = soup.find_all('section', {'class': 'box-white'})
for s in section:
    title = s.find('h4').text
    lis = s.find_all("li", {'class': 'col-xs-12 col-sm-4'})
    for li in lis:
        job = li.text
        driver.get(li.find('a').get('href'))
        soup2 = BeautifulSoup(driver.page_source, 'html.parser')
        jd = {"job": job, "topic": title, "description": soup2.find('div', {'class': 'entry-content article-content'}).text}
        data.append(jd)
df = pd.DataFrame(data)
df.to_csv("JD.csv")
Scraping data from Monster jobs and uploading it to MongoDB.
from time import sleep
from selenium import webdriver
import pymongo

client = pymongo.MongoClient()
mydb = client['jobs']
collection = mydb['med_title']

driver = webdriver.Chrome("C:/Users/91798/Desktop/pythn_files/chromedriver.exe")
driver.get("https://www.monsterindia.com/")
driver.implicitly_wait(9)
driver.find_element_by_id("SE_home_autocomplete").send_keys("nursing , Therapist , docter , medical ,nurse , hospital")
# for normal search use this
driver.find_element_by_xpath("//body/div[@id='themeDefault']/section[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/form[1]/div[1]/div[2]/input[1]").click()
driver.implicitly_wait(20)

temp = 1
while True:
    if temp == 5:
        break
    all_jobs = driver.find_elements_by_class_name("card-apply-content")
    link_list = []
    for job in all_jobs:
        try:
            company = ""
            com_name = job.find_elements_by_class_name("job-tittle")
            driver.implicitly_wait(1)
            for ele in com_name:
                company = ele.find_element_by_class_name('company-name').text
            job_title = ""
            for ele in com_name:
                job_title = ele.find_element_by_class_name('medium').text
            location = job.find_element_by_class_name("loc").text
            driver.implicitly_wait(1)
            lnks = job.find_elements_by_tag_name("a")
            for lnk in lnks:
                link_list.append(lnk.get_attribute('href'))
                break
            driver.implicitly_wait(1)
            desc = job.find_element_by_class_name("job-descrip").text
            driver.implicitly_wait(1)
            skills = job.find_element_by_class_name("descrip-skills").text
        except:
            desc = 'desc Not Specified'
            skills = 'skills Not Specified'
            location = ' location Not Specified'
            company = 'company Not Specified'
            job_title = 'job_title not specified'
        s = skills.split(' ')
        for i in s:
            if i == ',':
                s.remove(',')
        data = {"job_title": job_title, "comapany_name": company, "job_location": location,
                "job_desc": desc, "skills": s[2::], "card_link": link_list[0]}
        link_list.clear()
        y = collection.insert_one(data)
        print(y.inserted_id)
    driver.find_element_by_xpath("//button[contains(text(),'Next')]").click()
    sleep(25)
    temp = temp + 1
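A side note: Selenium 4 removed the find_element_by_* helpers used above, so this script raises AttributeError on current installs. A minimal sketch of the same lookups with By-based locators (Selenium 4.6+ can also locate the chromedriver binary itself):

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()  # Selenium Manager finds chromedriver automatically
driver.get("https://www.monsterindia.com/")

# Old: driver.find_element_by_id("SE_home_autocomplete")
search_box = driver.find_element(By.ID, "SE_home_autocomplete")
# Old: driver.find_elements_by_class_name("card-apply-content")
all_jobs = driver.find_elements(By.CLASS_NAME, "card-apply-content")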