How to scrape data that cannot be inspected and sits inside an <svg> tag - python-3.x

I'm unable to scrape some of the data from the webpage Partywise Result. I want to scrape the party-wise {vote%, vote count} from that page.
The code I have tried so far:
import urllib
import urllib.request
from bs4 import BeautifulSoup
import os

def soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

# Chhattisgarh
edatas = ""
edata1 = ""
codes = ["S26"]
for code in codes:
    soup3 = "http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2018
    #soup3 = "https://web.archive.org/web/20140613012440/http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2014
    soup2 = soup(soup3)
    for records2 in soup2.findAll("div", {"id": "piecharts26"}):
        print(records2.table)
        for records in records2.findAll("table"):
            print(records)
            edata = ""
            for data in records.findAll('td'):
                edata = edata + "," + data.text
            edatas = edatas + "\n" + edata[1:] + "," + code

header = "Party,Won,Leading,Total,State code"
file = open(os.path.expanduser("per2014_result.csv"), "wb")  # 2018
#file = open(os.path.expanduser("per2014_result.csv"), "wb")  # 2014
file.write(bytes(header, encoding="ascii", errors="ignore"))
file.write(bytes(edatas, encoding="ascii", errors="ignore"))
file.write(bytes(edata1, encoding="ascii", errors="ignore"))
The result I am expecting is the % vote share. I want the output to be in CSV format like this:
INC,43.0%,6144192
and so on for every party, from page one and page two.

The data is loaded directly by JavaScript inside your div:
if(document.getElementById('piecharts26')!=null)
So you either have to use a browser-automation tool such as selenium (link here), or use a regex:
import urllib
import urllib.request
from bs4 import BeautifulSoup
import os
import re
import json

def get_data(html_page):
    s = str(html_page)
    r = re.compile(r'data.addRows\((.*?)\);')
    m = r.search(s)
    if m:
        result = m.group(1)
        return json.loads(result.replace("'", '"'))

def soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

# Chhattisgarh
edatas = ""
edata1 = ""
codes = ["S26"]
for code in codes:
    soup3 = "http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2018
    #soup3 = "https://web.archive.org/web/20140613012440/http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2014
    soup2 = soup(soup3)
    result = get_data(soup2)
    print(result)

header = "Party,Won,Leading,Total,State code"
file = open(os.path.expanduser("per2014_result.csv"), "wb")  # 2018
#file = open(os.path.expanduser("per2014_result.csv"), "wb")  # 2014
file.write(bytes(header, encoding="ascii", errors="ignore"))
file.write(bytes(edatas, encoding="ascii", errors="ignore"))
file.write(bytes(edata1, encoding="ascii", errors="ignore"))
OUTPUT:
[['INC {43.0%,6144192}', 6144192],
['BJP {33.0%,4707141}', 4707141],
['JCCJ {7.6%,1086581}', 1086581],
['IND {5.9%,839053}', 839053],
['BSP {3.9%,552313}', 552313],
['GGP {1.7%,247459}', 247459],
['AAAP {0.9%,123526}', 123526],
['CPI {0.3%,48255}', 48255],
['APoI {0.3%,42013}', 42013],
['SHS {0.2%,34678}', 34678],
['NCP {0.2%,28983}', 28983],
['SP {0.2%,21969}', 21969],
['BYPP {0.1%,8425}', 8425],
['CPM {0.1%,8348}', 8348],
['JD(U) {0.1%,8285}', 8285],
['CSM {0.1%,7783}', 7783],
['BMUP {0.1%,7419}', 7419],
['BSCP {0.0%,5546}', 5546],
['BTP {0.0%,5498}', 5498],
['RJsbhP {0.0%,5141}', 5141],
['RGOP {0.0%,5040}', 5040],
['IPBP {0.0%,4982}', 4982],
['NINSHAD {0.0%,4586}', 4586],
['PSPU {0.0%,4309}', 4309],
['BHBHP {0.0%,3780}', 3780],
['RPI(A) {0.0%,3257}', 3257],
['JAC {0.0%,3034}', 3034],
['CPIM {0.0%,3017}', 3017],
['NDPF {0.0%,2912}', 2912],
['AASPP {0.0%,2474}', 2474],
['BBC {0.0%,2089}', 2089],
['SWAP {0.0%,2023}', 2023],
['cvgrp {0.0%,1582}', 1582],
['bhmm {0.0%,1474}', 1474],
['AVVP {0.0%,1407}', 1407],
['LSWP {0.0%,1399}', 1399],
['CSP {0.0%,1232}', 1232],
['BPSGKD {0.0%,1093}', 1093],
['BKNP {0.0%,1085}', 1085],
['CGVP {0.0%,1053}', 1053],
['SUCI {0.0%,1048}', 1048],
['SUSP {0.0%,988}', 988],
['DPI {0.0%,970}', 970],
['RJBP {0.0%,717}', 717],
['ASSP {0.0%,701}', 701],
['BLRP {0.0%,570}', 570],
['BSHSP {0.0%,562}', 562],
['ABHM {0.0%,549}', 549],
['SSBD {0.0%,468}', 468],
['ABSSP {0.0%,436}', 436],
['BRSP {0.0%,429}', 429],
['ABSKP {0.0%,389}', 389],
['BSSP {0.0%,279}', 279],
['BNIP {0.0%,267}', 267],
['RMGP {0.0%,258}', 258],
['KMSP {0.0%,241}', 241],
['BHBP {0.0%,224}', 224],
['RP(K) {0.0%,202}', 202],
['CMM {0.0%,192}', 192],
['CHSJP {0.0%,183}', 183],
['RSSM {0.0%,72}', 72],
['AnAP {0.0%,66}', 66],
['NOTA {2.0%,282744}', 282744]]
Then you can loop over the result and save it into the CSV file.
EDIT:
See this edit to save it into a CSV file:
import urllib
import urllib.request
from bs4 import BeautifulSoup
import os
import re
import json
import csv

def get_data(html_page):
    s = str(html_page)
    r = re.compile(r'data.addRows\((.*?)\);')
    m = r.search(s)
    if m:
        result = m.group(1)
        return json.loads(result.replace("'", '"'))

def soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

codes = ["S26"]
for code in codes:
    soup3 = "http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2018
    #soup3 = "https://web.archive.org/web/20140613012440/http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2014
    soup2 = soup(soup3)
    result = get_data(soup2)
    header = ["Party", "Vote%", "Count", "State code"]
    results_export = []
    results_export.append(header)
    for r in result:
        export = []
        party = r[0].split(' {')[0]
        percent = r[0].split(' {')[1].split(',')[0]
        count = r[1]
        export.append(str(party))
        export.append(str(percent))
        export.append(str(count))
        export.append(code)
        results_export.append(export)
    file = open(os.path.expanduser("per2014_result.csv"), "w")  # 2018
    writer = csv.writer(file)
    writer.writerows(results_export)
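As a small hardening of the last three lines (a sketch, not part of the original answer): opening the file with a context manager and newline="" lets the csv module control line endings (avoiding blank rows on Windows) and guarantees the file is flushed and closed:

with open(os.path.expanduser("per2014_result.csv"), "w", newline="") as f:
    # csv.writer handles quoting/escaping; writerows dumps the whole list at once
    csv.writer(f).writerows(results_export)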
EDIT2:
def get_data(html_page):
    s = str(html_page)
    r = re.compile(r'data.addRows\((.*?)\);')
    # handle pages with several data.addRows(...) calls, some of them empty,
    # by keeping the last non-empty match
    ms = r.findall(s)
    result = '[]'
    if ms:
        for m in ms:
            if m != '[]':
                result = m
    return json.loads(result.replace("'", '"'))
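For completeness, here is roughly what the selenium route mentioned at the top could look like. This is only a sketch: it assumes chromedriver is installed and on the PATH, uses the Selenium 3 style API seen elsewhere on this page, and simply reads whatever text the chart script renders inside the piecharts26 div, which you would still have to parse yourself.

from selenium import webdriver

driver = webdriver.Chrome()  # assumes chromedriver is on the PATH
driver.implicitly_wait(10)   # give the chart script time to render
driver.get("http://eciresults.nic.in/PartyWiseResultS26.htm")
chart_div = driver.find_element_by_id("piecharts26")
print(chart_div.text)        # rendered labels/legend text, if any
driver.quit()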

Related

Loop and crawl items and save elements in JSON format in Python

Given a link from here:
I would like to loop over all the counties and then all the commercial districts, then save them as a txt file in JSON format as follows:
{"\u5317\u8521": "/ershoufang/beicai/", "\u78a7\u4e91": "/ershoufang/biyun/", "\u66f9\u8def": "/ershoufang/caolu/", "\u5ddd\u6c99": "/ershoufang/chuansha/", "\u5927\u56e2\u9547": "/ershoufang/datuanzhen/", ...}
How could I do that? Thanks in advance.
Code:
from bs4 import BeautifulSoup
import requests
import os
from urllib.parse import urlparse
url = 'https://wh.lianjia.com/ershoufang/jiangan/'
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
text = soup.find_all(text=True)
# xpath for counties
# counties: /html/body/div[3]/div/div[1]/dl[2]/dd/div[1]/div[1]/a[1]
# /html/body/div[3]/div/div[1]/dl[2]/dd/div[1]/div[1]/a[2]
# /html/body/div[3]/div/div[1]/dl[2]/dd/div[1]/div[1]/a[3]
# xpath for commercial districts
# /html/body/div[3]/div/div[1]/dl[2]/dd/div[1]/div[2]/a[1]
# /html/body/div[3]/div/div[1]/dl[2]/dd/div[1]/div[2]/a[2]
Is this what you want?
import json
import requests
from bs4 import BeautifulSoup

base_url = "https://wh.lianjia.com"
start_url = f"{base_url}/ershoufang/jiangan/"

def get_page(url: str) -> bytes:
    return requests.get(url).content

def make_soup(page: bytes) -> BeautifulSoup:
    return BeautifulSoup(
        page, "html.parser",
    ).find(
        "div",
        {"data-role": "ershoufang"},
    )

def find_anchors(soup: BeautifulSoup, div_num: int) -> list:
    return soup.find_all("div")[div_num].find_all("a")

countries = [
    f"{base_url}{a['href']}" for a
    in find_anchors(make_soup(get_page(start_url)), div_num=0)
]

districts = {}
for country in countries:
    print(f"Fetching data for {country}")
    districts.update(
        {
            a.getText(): a["href"]
            for a in find_anchors(make_soup(get_page(country)), div_num=1)
        }
    )

with open("all_districts.json", "w") as jf:
    json.dump(districts, jf, indent=4, sort_keys=True)
Output:
{
"CBD\u897f\u5317\u6e56": "/ershoufang/cbdxibeihu/",
"\u4e03\u91cc\u5e99": "/ershoufang/qilimiao/",
"\u4e09\u73af\u5357": "/ershoufang/sanhuannan/",
"\u4e09\u9633\u8def": "/ershoufang/sanyanglu/",
"\u4e1c\u6e56\u4e1c\u4ead": "/ershoufang/donghudongting/",
"\u4e1c\u897f\u6e56\u5176\u5b83": "/ershoufang/dongxihuqita/",
"\u4e2d\u5317\u8def": "/ershoufang/zhongbeilu/",
"\u4e2d\u5357\u4e01\u5b57\u6865": "/ershoufang/zhongnandingziqiao/",
"\u4e2d\u6cd5\u751f\u6001\u57ce": "/ershoufang/zhongfashengtaicheng/",
"\u4e8c\u4e03": "/ershoufang/erqi2/",
"\u5149\u8c37\u4e1c": "/ershoufang/guanggudong/",
"\u5149\u8c37\u5357": "/ershoufang/guanggunan/",
"\u5149\u8c37\u5e7f\u573a": "/ershoufang/guangguguangchang/",
"\u5173\u5c71\u5927\u9053": "/ershoufang/guanshandadao/",
"\u5173\u897f\u957f\u804c": "/ershoufang/guanxichangzhi/",
"\u524d\u5ddd": "/ershoufang/qianchuan/",
"\u524d\u8fdb\u6c5f\u6c49": "/ershoufang/qianjinjianghan/",
"\u534e\u79d1\u5927": "/ershoufang/huakeda/",
"\u5353\u5200\u6cc9": "/ershoufang/zhuodaoquan/",
"\u5357\u6e56\u6c83\u5c14\u739b": "/ershoufang/nanhuwoerma/",
"\u53e4\u7530": "/ershoufang/gutian/",
"\u53f0\u5317\u9999\u6e2f\u8def": "/ershoufang/taibeixiangganglu/",
"\u540e\u5b98\u6e56": "/ershoufang/houguanhu/",
"\u540e\u6e56": "/ershoufang/houhu/",
"\u5434\u5bb6\u5c71": "/ershoufang/wujiashan/",
"\u5510\u5bb6\u58a9": "/ershoufang/tangjiadun/",
"\u56db\u65b0": "/ershoufang/sixin/",
"\u56e2\u7ed3\u5927\u9053": "/ershoufang/tuanjiedadao/",
"\u5824\u89d2": "/ershoufang/dijiao/",
"\u5854\u5b50\u6e56": "/ershoufang/tazihu/",
"\u5927\u667a\u8def": "/ershoufang/dazhilu/",
"\u5b97\u5173": "/ershoufang/zongguan/",
"\u5b9d\u4e30\u5d07\u4ec1": "/ershoufang/baofengchongren/",
"\u5c06\u519b\u8def": "/ershoufang/jiangjunlu/",
"\u5e38\u9752\u82b1\u56ed": "/ershoufang/changqinghuayuan/",
"\u5e38\u9752\u8def": "/ershoufang/changqinglu/",
"\u5e99\u5c71": "/ershoufang/miaoshan/",
"\u5f90\u4e1c": "/ershoufang/xudong/",
"\u6587\u5316\u5927\u9053": "/ershoufang/wenhuadadao/",
"\u65b0\u534e\u8def\u4e07\u8fbe": "/ershoufang/xinhualuwanda/",
"\u65b0\u5357\u6e56": "/ershoufang/xinnanhu/",
"\u65b0\u6d32\u5176\u5b83": "/ershoufang/xinzhouqita/",
"\u6768\u56ed": "/ershoufang/yangyuan/",
"\u6768\u6c4a\u6e56": "/ershoufang/yangchahu/",
"\u695a\u6cb3\u6c49\u8857": "/ershoufang/chuhehanjie/",
"\u6b66\u5e7f\u4e07\u677e\u56ed": "/ershoufang/wuguangwansongyuan/",
"\u6b66\u660c\u706b\u8f66\u7ad9": "/ershoufang/wuchanghuochezhan/",
"\u6b66\u6e56": "/ershoufang/wuhu/",
"\u6c11\u65cf\u5927\u9053": "/ershoufang/minzudadao/",
"\u6c34\u679c\u6e56": "/ershoufang/shuiguohu/",
"\u6c49\u5357\u5176\u5b83": "/ershoufang/hannanqita/",
"\u6c49\u53e3\u5317": "/ershoufang/hankoubei/",
"\u6c49\u6b63\u8857": "/ershoufang/hanzhengjie/",
"\u6c5f\u590f\u5176\u5b83": "/ershoufang/jiangxiaqita/",
"\u6c8c\u53e3": "/ershoufang/dunkou/",
"\u6c99\u6e56": "/ershoufang/shahu/",
"\u6d2a\u5c71\u5176\u5b83": "/ershoufang/hongshanqita/",
"\u738b\u5bb6\u6e7e": "/ershoufang/wangjiawan/",
"\u73de\u72ee\u5357\u8def": "/ershoufang/luoshinanlu/",
"\u767d\u6c99\u6d32": "/ershoufang/baishazhou/",
"\u767e\u6b65\u4ead": "/ershoufang/baibuting/",
"\u76d8\u9f99\u57ce": "/ershoufang/panlongcheng/",
"\u79ef\u7389\u6865": "/ershoufang/jiyuqiao/",
"\u7eb8\u574a": "/ershoufang/zhifang/",
"\u8001\u5357\u6e56": "/ershoufang/laonanhu/",
"\u80b2\u624d\u82b1\u6865": "/ershoufang/yucaihuaqiao/",
"\u8521\u7538\u5176\u5b83": "/ershoufang/caidianqita/",
"\u8521\u7538\u57ce\u533a": "/ershoufang/caidianchengqu/",
"\u85cf\u9f99\u5c9b": "/ershoufang/canglongdao/",
"\u864e\u6cc9\u6768\u5bb6\u6e7e": "/ershoufang/huquanyangjiawan/",
"\u8857\u9053\u53e3": "/ershoufang/jiedaokou/",
"\u91d1\u878d\u6e2f": "/ershoufang/jinronggang/",
"\u91d1\u94f6\u6e56": "/ershoufang/jinyinhu/",
"\u949f\u5bb6\u6751": "/ershoufang/zhongjiacun/",
"\u957f\u4e30\u5e38\u7801\u5934": "/ershoufang/changfengchangmatou/",
"\u957f\u6e2f\u8def": "/ershoufang/changganglu/",
"\u9633\u903b": "/ershoufang/yangluo/",
"\u96c6\u8d24": "/ershoufang/jixian2/",
"\u9752\u5c71": "/ershoufang/qingshan1/",
"\u9996\u4e49": "/ershoufang/shouyi/",
"\u9ec4\u57d4\u6c38\u6e05": "/ershoufang/huangpuyongqing/",
"\u9ec4\u9642\u5176\u5b83": "/ershoufang/huangbeiqita/"
}

Unable to scrape all data

from bs4 import BeautifulSoup
import requests, sys, os
import pandas as pd

URL = r"https://www.vault.com/best-companies-to-work-for/law/top-100-law-firms-rankings/year/"
My_list = ['2007','2008','2009','2010','2011','2012','2013','2014','2015','2016','2017','2018','2019','2020']
Year = []
CompanyName = []
Rank = []
Score = []
print('\n>>Process started please wait\n\n')
for I, Page in enumerate(My_list, start=1):
    url = r'https://www.vault.com/best-companies-to-work-for/law/top-100-law-firms-rankings/year/{}'.format(Page)
    print('\nData fetching from : ', url)
    Res = requests.get(url)
    soup = BeautifulSoup(Res.content, 'html.parser')
    data = soup.find('section', {'class': 'search-result CompanyWorkfor RankingMain FindSchools school-results contrastSection d-flex justify-content-center min-height Rankings CompRank'})
    if len(soup) > 0:
        print("\n>>Getting page source for :", url)
    else:
        print("Please Check url :", url)
    for i, item in enumerate(data.find_all("div", {"class": "RankItem"})):
        year = item.find("i", {"class": "fa-stack fa-2x"})
        Year.append(year)
        title = item.find("h3", {"class": "MainLink"}).get_text().strip()
        CompanyName.append(title)
        rank = item.find("div", {"class": "RankNumber"}).get_text().strip()
        Rank.append(rank)
        score = item.find("div", {"class": "score"}).get_text().strip()
        Score.append(score)

Data = pd.DataFrame({"Year": Year, "CompanyName": CompanyName, "Rank": Rank, "Score": Score})
Data[['First', 'Score']] = Data.Score.str.split(" ", expand=True)
Data[['hash', 'Rank']] = Data.Rank.str.split("#", expand=True)
Data.drop(columns=['hash', 'First'], inplace=True)
Data.to_csv('Vault_scrap.csv', index=False)
For each URL, the expected output for year, rank, title and score is 100 rows, but I'm getting only 10.
You can iterate through the years and pages like this.
import requests
import pandas as pd

url = 'https://www.vault.com/vault/api/Rankings/LoadMoreCompanyRanksJSON'

def page_loop(year, url):
    tableReturn = pd.DataFrame()
    for page in range(1, 101):
        payload = {
            'rank': '2',
            'year': year,
            'category': 'LBACCompany',
            'pg': page}
        jsonData = requests.get(url, params=payload).json()
        if jsonData == []:
            return tableReturn
        else:
            print('page: %s' % page)
            tableReturn = tableReturn.append(pd.DataFrame(jsonData), sort=True).reset_index(drop=True)
    return tableReturn

results = pd.DataFrame()
for year in range(2007, 2021):
    print("\n>>Getting page source for :", year)
    jsonData = page_loop(year, url)
    results = results.append(pd.DataFrame(jsonData), sort=True).reset_index(drop=True)
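If you want the same Vault_scrap.csv output as in the question, you can then write the combined frame to disk. Note that DataFrame.append was removed in pandas 2.0, so on a recent pandas you would collect the per-year frames and concatenate them once instead; a minimal sketch reusing the page_loop and url defined above:

# collect one DataFrame per year, then concatenate once at the end
frames = [page_loop(year, url) for year in range(2007, 2021)]
results = pd.concat(frames, sort=True).reset_index(drop=True)
results.to_csv('Vault_scrap.csv', index=False)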

Having an error while concatenating data from multiple pages in Python

I am facing an error while concatenating the data of multiple pages and exporting it to a single CSV file. According to my code, the data is exported up to page 10, but after page 10 it stops working.
import urllib.request
from bs4 import BeautifulSoup
import csv
import os
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd

url = 'http://www.igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName=Buldhana'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'
d = webdriver.Chrome(executable_path=chrome_path)
d.implicitly_wait(10)
d.get(url)
Select(d.find_element_by_name('ctl00$ContentPlaceHolder5$ddlTaluka')).select_by_value('7')
Select(d.find_element_by_name('ctl00$ContentPlaceHolder5$ddlVillage')).select_by_value('1464')

tableElement = d.find_element_by_id('ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate')
table = pd.read_html(tableElement.get_attribute('outerHTML'))[0]
#print(table)
table.columns = table.iloc[0]
table = table.iloc[1:]
#print(type(table))
table = table[table.Select == 'SurveyNo']
#print(table)  # assumption: SurveyNo exists for all wanted rows

surveyNo_scripts = [item.get_attribute('href') for item in
                    d.find_elements_by_css_selector("#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate [href*='Select$']")]
#print(surveyNo_scripts)
i = 0
for script in surveyNo_scripts:
    d.execute_script(script)
    surveys = d.find_element_by_css_selector('textarea').text
    table.iloc[[i], table.columns.get_loc('Select')] = surveys
    i += 1
print(table)

j = 2
while True:
    if len(d.find_elements_by_css_selector("#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate a[href*='Page${}']".format(j))) > 0:
        #print(d.find_elements_by_css_selector("#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate a[href*='Page${}']".format(i))[0].get_attribute('href'))
        d.find_elements_by_css_selector("#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate a[href*='Page${}']".format(j))[0].click()
        tableElement = d.find_element_by_css_selector("#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate")
        table1 = pd.read_html(tableElement.get_attribute('outerHTML'))[0]
        table1.columns = table1.iloc[0]
        table1 = table1.iloc[1:]
        #print(type(table))
        table1 = table1[table1.Select == 'SurveyNo']
        #print(table)  # assumption: SurveyNo exists for all wanted rows
        surveyNo_scripts = [item.get_attribute('href') for item in
                            d.find_elements_by_css_selector("#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate [href*='Select$']")]
        #print(surveyNo_scripts)
        i = 0
        for script in surveyNo_scripts:
            d.execute_script(script)
            surveys = d.find_element_by_css_selector('textarea').text
            table1.iloc[[i], table1.columns.get_loc('Select')] = surveys
            i += 1
        #print(table1)
        #table = table.append(table1.reindex(columns=table.columns))
        table1.columns = table.columns
        table = pd.concat([table, table1], ignore_index=True)
        print(table)
        j += 1
    else:
        break

table.to_csv(r"C:\Users\Guest\Desktop\Sample_buldhana.csv", sep=',', encoding='utf-8-sig', index=False)

Python 3 code stops at HTTP error and I can't figure out how to handle it

I'm trying to scrape links from the website https://www.usyouthsoccer.org/clubs/club-directory/. Initially the code broke at the 30th link, so I tried to handle the exception with urllib's HTTPError. Now the script just stops running at the 30th link. I checked that specific URL and it is a bad link. I just want to move past it in the loop, but I'm having trouble with the workaround. Any suggestions would be greatly appreciated...
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser
import pandas as pd
from urllib.request import Request, urlopen
from urllib.error import HTTPError

executable_path = {"executable_path": "chromedriver"}
browser = Browser("chrome", **executable_path, headless=True)

url = 'https://www.usyouthsoccer.org/clubs/club-directory/'
zipcode_input = 'CT_Main_0$txtLocation'
search_button = '//*[@id="CT_Main_0_btnSearch"]'
dropdown = '//*[@id="CT_Main_0_drpMiles"]/option[5]'
zip_codes = [64015]
team_df = pd.DataFrame()

for x in zip_codes:
    try:
        print(f'\n{x}\n')
        url = 'https://www.usyouthsoccer.org/clubs/club-directory/'
        browser.visit(url)
        browser.fill(zipcode_input, x)
        browser.find_by_xpath(dropdown).click()
        browser.find_by_xpath(search_button).click()
        html = browser.html
        soup = bs(html, 'html.parser')
        dallas_urls = soup.find_all(class_="more")
        counter = 1
        for url in dallas_urls:
            print(f'Link {counter} of {len((dallas_urls))}')
            counter += 1
            back_url = url['href']
            front_url = 'https://www.usyouthsoccer.org'
            total_url = front_url + back_url
            browser.visit(total_url)
            my_html = pd.read_html(total_url)
            details_pd = pd.DataFrame(my_html[0])
            details_pd.columns = ['Cols', 'Vals']
            df = details_pd.T
            df.columns = df.iloc[0]
            df.drop('Cols', inplace=True)
            contacts_pd = pd.DataFrame(my_html[1])
            if len(contacts_pd.index) == 1:
                df['Contact_Title'] = contacts_pd.iloc[0, 0]
                df['Contact_Name'] = contacts_pd.iloc[0, 1]
                df['Contact_Email'] = contacts_pd.iloc[0, 2]
            elif len(contacts_pd.index) == 2:
                df['Contact_Title'] = contacts_pd.iloc[0, 0]
                df['Contact_Name'] = contacts_pd.iloc[0, 1]
                df['Contact_Email'] = contacts_pd.iloc[0, 2]
                df['Contact_Title2'] = contacts_pd.iloc[1, 0]
                df['Contact_Name2'] = contacts_pd.iloc[1, 1]
                df['Contact_Email2'] = contacts_pd.iloc[1, 2]
            elif len(contacts_pd.index) == 3:
                df['Contact_Title'] = contacts_pd.iloc[0, 0]
                df['Contact_Name'] = contacts_pd.iloc[0, 1]
                df['Contact_Email'] = contacts_pd.iloc[0, 2]
                df['Contact_Title2'] = contacts_pd.iloc[1, 0]
                df['Contact_Name2'] = contacts_pd.iloc[1, 1]
                df['Contact_Email2'] = contacts_pd.iloc[1, 2]
                df['Contact_Title3'] = contacts_pd.iloc[2, 0]
                df['Contact_Name3'] = contacts_pd.iloc[2, 1]
                df['Contact_Email3'] = contacts_pd.iloc[2, 2]
            team_df = pd.concat([team_df, df])
    except HTTPError as err:
        continue
Put your try statement inside the nested for loop. Right now, if you hit an HTTP error it stops the entire for loop instead of continuing through it.
for url in dallas_urls:
    try:
        print(f'Link {counter} of {len((dallas_urls))}')
        counter += 1
        back_url = url['href']
        front_url = 'https://www.usyouthsoccer.org'
        total_url = front_url + back_url
        urllib.request.urlretrieve(total_url)
    except urllib.error.HTTPError:
        print('Error')
        continue
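The same pattern applied directly to the pd.read_html call from your code, as a rough sketch (it assumes the variables from the question are in scope and that a dead link raises HTTPError):

for url in dallas_urls:
    try:
        total_url = 'https://www.usyouthsoccer.org' + url['href']
        my_html = pd.read_html(total_url)  # a dead link raises HTTPError here
    except HTTPError:
        print(f'Skipping bad link: {total_url}')
        continue
    # ...build df from my_html and concat into team_df as before...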

I'm stumped at looping through a returned list of URLs

This is my first Python project. I'm trying to scrape restaurant inspections. One site has summaries that offer keys to the detailed reports I want to scrape. I'm stumped at looping through the keyed list of URLs to get the details.
import pandas as pd
import bs4
import datetime
import re
import lxml
from urllib.request import urlopen
from urllib.error import HTTPError

try:
    insp = pd.read_csv("ftp://dbprftp.state.fl.us/pub/llweb/5fdinspi.csv",
                       usecols=[2, 14, 18, 80, 81])
except IOError:
    print("The file is not accessible.")

insp.columns = ["CountyName", "InspectDate",
                "NumHighVio", "LicenseID", "VisitID"]
# filter for alachua county restaurants
alachua = insp[insp.CountyName == 'Alachua']
# filter for restaurants that had at least one serious violation
alachua = alachua[alachua.NumHighVio > 0]
# change date string to date object
alachua['InspectDate'] = pd.to_datetime(alachua['InspectDate'])
# sort most recent
alachua = alachua.sort_values('InspectDate', ascending=False)
# prefer to have user set timedelta below:
today = pd.to_datetime('today')
startDay = datetime.date.today() - datetime.timedelta(days=30)
alachua = alachua[(alachua['InspectDate'] > startDay) &
                  (alachua['InspectDate'] < today)]

# takes LicenseID and VisitID, passes it into the urls for detailed reports
for index, rows in alachua.iterrows():
    visitID = rows['VisitID']
    licID = rows['LicenseID']
    urls = "https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID= %s &licid= %s" % (visitID, licID)
    urls = urls.replace(' ', '')
    print(urls)

## here's my problem:
for url in urls:
    def get_inspect_detail():
        html = urlopen(url)
        soup = bs4.BeautifulSoup(html.read(), 'lxml')
        details = soup.find_all('font', {'face': 'verdana'})[10:]
        for detail in details:
            siteName = details[0].text
            licNum = details[2].text
            siteRank = details[4].text
            expDate = details[6].text
            primeStatus = details[8].text
            secStatus = details[10].text
            siteAddress = details[12].text
            inspectResult = details[20].text
            observed1 = details[34].get_text
            observed2 = details[36].text
            observed3 = details[38].text
            observed4 = details[40].text
            observed5 = details[42].text
            observed6 = details[44].text
            observed7 = details[46].text
            observed8 = details[48].text
            observed9 = details[50].text
            observed10 = details[52].text
            detailsLib = {
                'Restaurant': siteName,
                'License': licNum,
                'Rank': siteRank,
                'Expires': expDate,
                'Primary': primeStatus,
                'Secondary': secStatus,
                'Address': siteAddress,
                'Result': inspectResult,
                'Observed1': observed1,
                'Observed2': observed2,
                'Observed3': observed3,
                'Observed4': observed4,
                'Observed5': observed5,
                'Observed6': observed6,
                'Observed7': observed7,
                'Observed8': observed8,
                'Observed9': observed9,
                'Observed10': observed10
            }
    repr(get_inspect_detail())
Probably an obvious mistake or lack of knowledge, but I can get the unscrubbed data for one URL, but not for all.
I don't see a reason to define your function inside the loop; you would end up with a lot of redundant definitions that way. Second, you could just define a result list and accumulate the detailsLib objects inside it.
def get_inspect_detail(url):
    html = urlopen(url)
    soup = bs4.BeautifulSoup(html.read(), 'lxml')
    details = soup.find_all('font', {'face': 'verdana'})[10:]
    result = []
    for detail in details:
        siteName = details[0].text
        licNum = details[2].text
        siteRank = details[4].text
        expDate = details[6].text
        primeStatus = details[8].text
        secStatus = details[10].text
        siteAddress = details[12].text
        inspectResult = details[20].text
        observed1 = details[34].get_text
        observed2 = details[36].text
        observed3 = details[38].text
        observed4 = details[40].text
        observed5 = details[42].text
        observed6 = details[44].text
        observed7 = details[46].text
        observed8 = details[48].text
        observed9 = details[50].text
        observed10 = details[52].text
        detailsLib = {
            'Restaurant': siteName,
            'License': licNum,
            'Rank': siteRank,
            'Expires': expDate,
            'Primary': primeStatus,
            'Secondary': secStatus,
            'Address': siteAddress,
            'Result': inspectResult,
            'Observed1': observed1,
            'Observed2': observed2,
            'Observed3': observed3,
            'Observed4': observed4,
            'Observed5': observed5,
            'Observed6': observed6,
            'Observed7': observed7,
            'Observed8': observed8,
            'Observed9': observed9,
            'Observed10': observed10
        }
        result.append(detailsLib)
    return result

for url in urls:
    repr(get_inspect_detail(url))
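One thing the answer above does not touch: in the question's code, urls is a single string that is overwritten on every pass of the iterrows loop, so for url in urls iterates over its characters. A small sketch of one way to fix that (the names detail_urls and all_details are just illustrative):

detail_urls = []
for index, rows in alachua.iterrows():
    u = ("https://www.myfloridalicense.com/inspectionDetail.asp"
         "?InspVisitID=%s&licid=%s" % (rows['VisitID'], rows['LicenseID']))
    detail_urls.append(u)

all_details = []
for url in detail_urls:
    all_details.extend(get_inspect_detail(url))  # one detailsLib dict per report row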
