How to scrape data that cannot be inspected and is inside an <svg> tag - python-3.x
I'm unable to scrape some of the data from the Partywise Result webpage. I want to scrape the party-wise {vote%, vote count} from that page.
The code I have tried so far:
```python
import urllib
import urllib.request
from bs4 import BeautifulSoup
import os

def soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

# Chhattisgarh
edatas = ""
edata1 = ""
codes = ["S26"]
for code in codes:
    soup3 = "http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2018
    #soup3 = "https://web.archive.org/web/20140613012440/http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2014
    soup2 = soup(soup3)
    for records2 in soup2.findAll("div", {"id": "piecharts26"}):
        print(records2.table)
        for records in records2.findAll("table"):
            print(records)
            edata = ""
            for data in records.findAll('td'):
                edata = edata + "," + data.text
            edatas = edatas + "\n" + edata[1:] + "," + code

header = "Party,Won,Leading,Total,State code"
file = open(os.path.expanduser("per2014_result.csv"), "wb")  # 2018
#file = open(os.path.expanduser("per2014_result.csv"), "wb")  # 2014
file.write(bytes(header, encoding="ascii", errors="ignore"))
file.write(bytes(edatas, encoding="ascii", errors="ignore"))
file.write(bytes(edata1, encoding="ascii", errors="ignore"))
```
The result I'm expecting is the % vote share. I want the output in CSV format like this:

INC,43.0%,6144192

and so on, for every row from pages one and two.
The data is loaded into that div directly by JavaScript on the page:

```javascript
if(document.getElementById('piecharts26')!=null)
```

So you have to either drive a real browser that executes the JavaScript, such as Selenium, or pull the data out of the inline script with a regex.
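For the Selenium route, a minimal sketch might look like this (my own illustration, not from the original answer; it assumes chromedriver is installed and on PATH):

```python
# Render the page in a real browser so the inline chart script runs,
# then parse the resulting HTML with BeautifulSoup as usual.
from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome()  # assumes chromedriver is on PATH
driver.get("http://eciresults.nic.in/PartyWiseResultS26.htm")
soup = BeautifulSoup(driver.page_source, "html.parser")
div = soup.find("div", {"id": "piecharts26"})
print(div.get_text() if div else "chart div not rendered")
driver.quit()
```

The regex approach below avoids the browser dependency entirely: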
```python
import urllib
import urllib.request
from bs4 import BeautifulSoup
import os
import re
import json

def get_data(html_page):
    s = str(html_page)
    # The chart data is embedded in an inline script as data.addRows([...]);
    r = re.compile(r'data\.addRows\((.*?)\);')
    m = r.search(s)
    if m:
        result = m.group(1)
        return json.loads(result.replace("'", '"'))

def soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

# Chhattisgarh
codes = ["S26"]
for code in codes:
    soup3 = "http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2018
    #soup3 = "https://web.archive.org/web/20140613012440/http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2014
    soup2 = soup(soup3)
    result = get_data(soup2)
    print(result)
```
OUTPUT:
```
[['INC {43.0%,6144192}', 6144192],
['BJP {33.0%,4707141}', 4707141],
['JCCJ {7.6%,1086581}', 1086581],
['IND {5.9%,839053}', 839053],
['BSP {3.9%,552313}', 552313],
['GGP {1.7%,247459}', 247459],
['AAAP {0.9%,123526}', 123526],
['CPI {0.3%,48255}', 48255],
['APoI {0.3%,42013}', 42013],
['SHS {0.2%,34678}', 34678],
['NCP {0.2%,28983}', 28983],
['SP {0.2%,21969}', 21969],
['BYPP {0.1%,8425}', 8425],
['CPM {0.1%,8348}', 8348],
['JD(U) {0.1%,8285}', 8285],
['CSM {0.1%,7783}', 7783],
['BMUP {0.1%,7419}', 7419],
['BSCP {0.0%,5546}', 5546],
['BTP {0.0%,5498}', 5498],
['RJsbhP {0.0%,5141}', 5141],
['RGOP {0.0%,5040}', 5040],
['IPBP {0.0%,4982}', 4982],
['NINSHAD {0.0%,4586}', 4586],
['PSPU {0.0%,4309}', 4309],
['BHBHP {0.0%,3780}', 3780],
['RPI(A) {0.0%,3257}', 3257],
['JAC {0.0%,3034}', 3034],
['CPIM {0.0%,3017}', 3017],
['NDPF {0.0%,2912}', 2912],
['AASPP {0.0%,2474}', 2474],
['BBC {0.0%,2089}', 2089],
['SWAP {0.0%,2023}', 2023],
['cvgrp {0.0%,1582}', 1582],
['bhmm {0.0%,1474}', 1474],
['AVVP {0.0%,1407}', 1407],
['LSWP {0.0%,1399}', 1399],
['CSP {0.0%,1232}', 1232],
['BPSGKD {0.0%,1093}', 1093],
['BKNP {0.0%,1085}', 1085],
['CGVP {0.0%,1053}', 1053],
['SUCI {0.0%,1048}', 1048],
['SUSP {0.0%,988}', 988],
['DPI {0.0%,970}', 970],
['RJBP {0.0%,717}', 717],
['ASSP {0.0%,701}', 701],
['BLRP {0.0%,570}', 570],
['BSHSP {0.0%,562}', 562],
['ABHM {0.0%,549}', 549],
['SSBD {0.0%,468}', 468],
['ABSSP {0.0%,436}', 436],
['BRSP {0.0%,429}', 429],
['ABSKP {0.0%,389}', 389],
['BSSP {0.0%,279}', 279],
['BNIP {0.0%,267}', 267],
['RMGP {0.0%,258}', 258],
['KMSP {0.0%,241}', 241],
['BHBP {0.0%,224}', 224],
['RP(K) {0.0%,202}', 202],
['CMM {0.0%,192}', 192],
['CHSJP {0.0%,183}', 183],
['RSSM {0.0%,72}', 72],
['AnAP {0.0%,66}', 66],
['NOTA {2.0%,282744}', 282744]]
```
You can then loop over the result and save it to a CSV file.
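Each entry packs the party, percentage, and count into its first string, so a single row can be unpacked like this (a quick check against the first entry of the output above):

```python
row = ['INC {43.0%,6144192}', 6144192]
party = row[0].split(' {')[0]                  # 'INC'
percent = row[0].split(' {')[1].split(',')[0]  # '43.0%'
count = row[1]                                 # 6144192
print(party, percent, count)                   # INC 43.0% 6144192
```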
EDIT:

Here is how to save it to a CSV file:

```python
import urllib
import urllib.request
from bs4 import BeautifulSoup
import os
import re
import json
import csv

def get_data(html_page):
    s = str(html_page)
    r = re.compile(r'data\.addRows\((.*?)\);')
    m = r.search(s)
    if m:
        result = m.group(1)
        return json.loads(result.replace("'", '"'))

def soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

codes = ["S26"]
for code in codes:
    soup3 = "http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2018
    #soup3 = "https://web.archive.org/web/20140613012440/http://eciresults.nic.in/PartyWiseResult" + code + ".htm"  # 2014
    soup2 = soup(soup3)
    result = get_data(soup2)

    header = ["Party", "Vote%", "Count", "State code"]
    results_export = [header]
    for r in result:
        # Each row looks like ['INC {43.0%,6144192}', 6144192]
        party = r[0].split(' {')[0]
        percent = r[0].split(' {')[1].split(',')[0]
        count = r[1]
        results_export.append([str(party), str(percent), str(count), code])

    # newline="" prevents blank lines between rows on Windows
    with open(os.path.expanduser("per2014_result.csv"), "w", newline="") as file:
        writer = csv.writer(file)
        writer.writerows(results_export)
```
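To sanity-check the export, you could read the file back (a small optional snippet of my own; assumes pandas is installed):

```python
import os
import pandas as pd

# Read the exported CSV back and inspect the first few rows
print(pd.read_csv(os.path.expanduser("per2014_result.csv")).head())
```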
EDIT2:

Some pages contain several data.addRows(...) calls, some of them empty, so search for all matches and keep the last non-empty one:

```python
def get_data(html_page):
    s = str(html_page)
    r = re.compile(r'data\.addRows\((.*?)\);')
    ms = r.findall(s)
    result = '[]'
    if ms:
        for m in ms:
            if m != '[]':
                result = m
    return json.loads(result.replace("'", '"'))
```
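A quick illustration of the difference, on a made-up script fragment (hypothetical input, just to show the behaviour):

```python
import re

s = "data.addRows([]); data.addRows([['INC {43.0%,6144192}', 6144192]]);"
ms = re.findall(r'data\.addRows\((.*?)\);', s)
print(ms[0])   # '[]' -- search() alone would have stopped here
print(ms[-1])  # "[['INC {43.0%,6144192}', 6144192]]" -- the rows we want
```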
Related
Loop and crawl items and save elements in JSON format in Python
Given a link from here: I would like to loop over all the counties and then all commercial districts, then save them as a txt file in JSON format as follows:

```json
{"\u5317\u8521": "/ershoufang/beicai/", "\u78a7\u4e91": "/ershoufang/biyun/", "\u66f9\u8def": "/ershoufang/caolu/", "\u5ddd\u6c99": "/ershoufang/chuansha/", "\u5927\u56e2\u9547": "/ershoufang/datuanzhen/", ...}
```

How could I do that? Thanks in advance. Code:

```python
from bs4 import BeautifulSoup
import requests
import os
from urllib.parse import urlparse

url = 'https://wh.lianjia.com/ershoufang/jiangan/'
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
text = soup.find_all(text=True)

# xpath for counties
# counties: /html/body/div[3]/div/div[1]/dl[2]/dd/div[1]/div[1]/a[1]
#           /html/body/div[3]/div/div[1]/dl[2]/dd/div[1]/div[1]/a[2]
#           /html/body/div[3]/div/div[1]/dl[2]/dd/div[1]/div[1]/a[3]

# xpath for commercial districts
# /html/body/div[3]/div/div[1]/dl[2]/dd/div[1]/div[2]/a[1]
# /html/body/div[3]/div/div[1]/dl[2]/dd/div[1]/div[2]/a[2]
```
Is this what you want?

```python
import json
import requests
from bs4 import BeautifulSoup

base_url = "https://wh.lianjia.com"
start_url = f"{base_url}/ershoufang/jiangan/"

def get_page(url: str) -> bytes:
    return requests.get(url).content

def make_soup(page: bytes) -> BeautifulSoup:
    return BeautifulSoup(
        page,
        "html.parser",
    ).find(
        "div",
        {"data-role": "ershoufang"},
    )

def find_anchors(soup: BeautifulSoup, div_num: int) -> list:
    return soup.find_all("div")[div_num].find_all("a")

countries = [
    f"{base_url}{a['href']}"
    for a in find_anchors(make_soup(get_page(start_url)), div_num=0)
]

districts = {}
for country in countries:
    print(f"Fetching data for {country}")
    districts.update(
        {
            a.getText(): a["href"]
            for a in find_anchors(make_soup(get_page(country)), div_num=1)
        }
    )

with open("all_districts.json", "w") as jf:
    json.dump(districts, jf, indent=4, sort_keys=True)
```

Output:

```json
{
    "CBD\u897f\u5317\u6e56": "/ershoufang/cbdxibeihu/",
    "\u4e03\u91cc\u5e99": "/ershoufang/qilimiao/",
    "\u4e09\u73af\u5357": "/ershoufang/sanhuannan/",
    "\u4e09\u9633\u8def": "/ershoufang/sanyanglu/",
    "\u4e1c\u6e56\u4e1c\u4ead": "/ershoufang/donghudongting/",
    "\u4e1c\u897f\u6e56\u5176\u5b83": "/ershoufang/dongxihuqita/",
    "\u4e2d\u5317\u8def": "/ershoufang/zhongbeilu/",
    "\u4e2d\u5357\u4e01\u5b57\u6865": "/ershoufang/zhongnandingziqiao/",
    "\u4e2d\u6cd5\u751f\u6001\u57ce": "/ershoufang/zhongfashengtaicheng/",
    "\u4e8c\u4e03": "/ershoufang/erqi2/",
    "\u5149\u8c37\u4e1c": "/ershoufang/guanggudong/",
    "\u5149\u8c37\u5357": "/ershoufang/guanggunan/",
    "\u5149\u8c37\u5e7f\u573a": "/ershoufang/guangguguangchang/",
    "\u5173\u5c71\u5927\u9053": "/ershoufang/guanshandadao/",
    "\u5173\u897f\u957f\u804c": "/ershoufang/guanxichangzhi/",
    "\u524d\u5ddd": "/ershoufang/qianchuan/",
    "\u524d\u8fdb\u6c5f\u6c49": "/ershoufang/qianjinjianghan/",
    "\u534e\u79d1\u5927": "/ershoufang/huakeda/",
    "\u5353\u5200\u6cc9": "/ershoufang/zhuodaoquan/",
    "\u5357\u6e56\u6c83\u5c14\u739b": "/ershoufang/nanhuwoerma/",
    "\u53e4\u7530": "/ershoufang/gutian/",
    "\u53f0\u5317\u9999\u6e2f\u8def": "/ershoufang/taibeixiangganglu/",
    "\u540e\u5b98\u6e56": "/ershoufang/houguanhu/",
    "\u540e\u6e56": "/ershoufang/houhu/",
    "\u5434\u5bb6\u5c71": "/ershoufang/wujiashan/",
    "\u5510\u5bb6\u58a9": "/ershoufang/tangjiadun/",
    "\u56db\u65b0": "/ershoufang/sixin/",
    "\u56e2\u7ed3\u5927\u9053": "/ershoufang/tuanjiedadao/",
    "\u5824\u89d2": "/ershoufang/dijiao/",
    "\u5854\u5b50\u6e56": "/ershoufang/tazihu/",
    "\u5927\u667a\u8def": "/ershoufang/dazhilu/",
    "\u5b97\u5173": "/ershoufang/zongguan/",
    "\u5b9d\u4e30\u5d07\u4ec1": "/ershoufang/baofengchongren/",
    "\u5c06\u519b\u8def": "/ershoufang/jiangjunlu/",
    "\u5e38\u9752\u82b1\u56ed": "/ershoufang/changqinghuayuan/",
    "\u5e38\u9752\u8def": "/ershoufang/changqinglu/",
    "\u5e99\u5c71": "/ershoufang/miaoshan/",
    "\u5f90\u4e1c": "/ershoufang/xudong/",
    "\u6587\u5316\u5927\u9053": "/ershoufang/wenhuadadao/",
    "\u65b0\u534e\u8def\u4e07\u8fbe": "/ershoufang/xinhualuwanda/",
    "\u65b0\u5357\u6e56": "/ershoufang/xinnanhu/",
    "\u65b0\u6d32\u5176\u5b83": "/ershoufang/xinzhouqita/",
    "\u6768\u56ed": "/ershoufang/yangyuan/",
    "\u6768\u6c4a\u6e56": "/ershoufang/yangchahu/",
    "\u695a\u6cb3\u6c49\u8857": "/ershoufang/chuhehanjie/",
    "\u6b66\u5e7f\u4e07\u677e\u56ed": "/ershoufang/wuguangwansongyuan/",
    "\u6b66\u660c\u706b\u8f66\u7ad9": "/ershoufang/wuchanghuochezhan/",
    "\u6b66\u6e56": "/ershoufang/wuhu/",
    "\u6c11\u65cf\u5927\u9053": "/ershoufang/minzudadao/",
    "\u6c34\u679c\u6e56": "/ershoufang/shuiguohu/",
    "\u6c49\u5357\u5176\u5b83": "/ershoufang/hannanqita/",
    "\u6c49\u53e3\u5317": "/ershoufang/hankoubei/",
    "\u6c49\u6b63\u8857": "/ershoufang/hanzhengjie/",
    "\u6c5f\u590f\u5176\u5b83": "/ershoufang/jiangxiaqita/",
    "\u6c8c\u53e3": "/ershoufang/dunkou/",
    "\u6c99\u6e56": "/ershoufang/shahu/",
    "\u6d2a\u5c71\u5176\u5b83": "/ershoufang/hongshanqita/",
    "\u738b\u5bb6\u6e7e": "/ershoufang/wangjiawan/",
    "\u73de\u72ee\u5357\u8def": "/ershoufang/luoshinanlu/",
    "\u767d\u6c99\u6d32": "/ershoufang/baishazhou/",
    "\u767e\u6b65\u4ead": "/ershoufang/baibuting/",
    "\u76d8\u9f99\u57ce": "/ershoufang/panlongcheng/",
    "\u79ef\u7389\u6865": "/ershoufang/jiyuqiao/",
    "\u7eb8\u574a": "/ershoufang/zhifang/",
    "\u8001\u5357\u6e56": "/ershoufang/laonanhu/",
    "\u80b2\u624d\u82b1\u6865": "/ershoufang/yucaihuaqiao/",
    "\u8521\u7538\u5176\u5b83": "/ershoufang/caidianqita/",
    "\u8521\u7538\u57ce\u533a": "/ershoufang/caidianchengqu/",
    "\u85cf\u9f99\u5c9b": "/ershoufang/canglongdao/",
    "\u864e\u6cc9\u6768\u5bb6\u6e7e": "/ershoufang/huquanyangjiawan/",
    "\u8857\u9053\u53e3": "/ershoufang/jiedaokou/",
    "\u91d1\u878d\u6e2f": "/ershoufang/jinronggang/",
    "\u91d1\u94f6\u6e56": "/ershoufang/jinyinhu/",
    "\u949f\u5bb6\u6751": "/ershoufang/zhongjiacun/",
    "\u957f\u4e30\u5e38\u7801\u5934": "/ershoufang/changfengchangmatou/",
    "\u957f\u6e2f\u8def": "/ershoufang/changganglu/",
    "\u9633\u903b": "/ershoufang/yangluo/",
    "\u96c6\u8d24": "/ershoufang/jixian2/",
    "\u9752\u5c71": "/ershoufang/qingshan1/",
    "\u9996\u4e49": "/ershoufang/shouyi/",
    "\u9ec4\u57d4\u6c38\u6e05": "/ershoufang/huangpuyongqing/",
    "\u9ec4\u9642\u5176\u5b83": "/ershoufang/huangbeiqita/"
}
```
Unable to scrape all data
```python
from bs4 import BeautifulSoup
import requests, sys, os
import pandas as pd

URL = r"https://www.vault.com/best-companies-to-work-for/law/top-100-law-firms-rankings/year/"
My_list = ['2007','2008','2009','2010','2011','2012','2013','2014','2015','2016','2017','2018','2019','2020']
Year = []
CompanyName = []
Rank = []
Score = []
print('\n>>Process started please wait\n\n')
for I, Page in enumerate(My_list, start=1):
    url = r'https://www.vault.com/best-companies-to-work-for/law/top-100-law-firms-rankings/year/{}'.format(Page)
    print('\nData fetching from : ', url)
    Res = requests.get(url)
    soup = BeautifulSoup(Res.content, 'html.parser')
    data = soup.find('section', {'class': 'search-result CompanyWorkfor RankingMain FindSchools school-results contrastSection d-flex justify-content-center min-height Rankings CompRank'})
    if len(soup) > 0:
        print("\n>>Getting page source for :", url)
    else:
        print("Please Check url :", url)
    for i, item in enumerate(data.find_all("div", {"class": "RankItem"})):
        year = item.find("i", {"class": "fa-stack fa-2x"})
        Year.append(year)
        title = item.find("h3", {"class": "MainLink"}).get_text().strip()
        CompanyName.append(title)
        rank = item.find("div", {"class": "RankNumber"}).get_text().strip()
        Rank.append(rank)
        score = item.find("div", {"class": "score"}).get_text().strip()
        Score.append(score)

Data = pd.DataFrame({"Year": Year, "CompanyName": CompanyName, "Rank": Rank, "Score": Score})
Data[['First', 'Score']] = Data.Score.str.split(" ", expand=True)
Data[['hash', 'Rank']] = Data.Rank.str.split("#", expand=True)
Data.drop(columns=['hash', 'First'], inplace=True)
Data.to_csv('Vault_scrap.csv', index=False)
```

For each URL the expected output (year, rank, title, and score) is 100 rows, but I'm getting only 10.
You can iterate through the years and pages like this:

```python
import requests
import pandas as pd

url = 'https://www.vault.com/vault/api/Rankings/LoadMoreCompanyRanksJSON'

def page_loop(year, url):
    tableReturn = pd.DataFrame()
    for page in range(1, 101):
        payload = {
            'rank': '2',
            'year': year,
            'category': 'LBACCompany',
            'pg': page}
        jsonData = requests.get(url, params=payload).json()
        if jsonData == []:
            return tableReturn
        else:
            print('page: %s' % page)
            # note: DataFrame.append is deprecated in recent pandas; pd.concat is the modern equivalent
            tableReturn = tableReturn.append(pd.DataFrame(jsonData), sort=True).reset_index(drop=True)
    return tableReturn

results = pd.DataFrame()
for year in range(2007, 2021):
    print("\n>>Getting page source for :", year)
    jsonData = page_loop(year, url)
    results = results.append(pd.DataFrame(jsonData), sort=True).reset_index(drop=True)
```
Error while concatenating data from multiple pages in Python
I am facing an error while concatenating the data from multiple pages and exporting it to a single CSV file. My code exports the data up to page 10, but after page 10 it is not working.

```python
import urllib.request
from bs4 import BeautifulSoup
import csv
import os
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd

url = 'http://www.igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName=Buldhana'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'
d = webdriver.Chrome(executable_path=chrome_path)
d.implicitly_wait(10)
d.get(url)
Select(d.find_element_by_name('ctl00$ContentPlaceHolder5$ddlTaluka')).select_by_value('7')
Select(d.find_element_by_name('ctl00$ContentPlaceHolder5$ddlVillage')).select_by_value('1464')

tableElement = d.find_element_by_id('ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate')
table = pd.read_html(tableElement.get_attribute('outerHTML'))[0]
table.columns = table.iloc[0]
table = table.iloc[1:]
table = table[table.Select == 'SurveyNo']  # assumption: SurveyNo exists for all wanted rows
surveyNo_scripts = [item.get_attribute('href') for item in d.find_elements_by_css_selector(
    "#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate [href*='Select$']")]
i = 0
for script in surveyNo_scripts:
    d.execute_script(script)
    surveys = d.find_element_by_css_selector('textarea').text
    table.iloc[[i], table.columns.get_loc('Select')] = surveys
    i += 1
print(table)

j = 2
while True:
    if len(d.find_elements_by_css_selector(
            "#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate a[href*='Page${}']".format(j))) > 0:
        d.find_elements_by_css_selector(
            "#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate a[href*='Page${}']".format(j))[0].click()
        tableElement = d.find_element_by_css_selector("#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate")
        table1 = pd.read_html(tableElement.get_attribute('outerHTML'))[0]
        table1.columns = table1.iloc[0]
        table1 = table1.iloc[1:]
        table1 = table1[table1.Select == 'SurveyNo']
        surveyNo_scripts = [item.get_attribute('href') for item in d.find_elements_by_css_selector(
            "#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate [href*='Select$']")]
        i = 0
        for script in surveyNo_scripts:
            d.execute_script(script)
            surveys = d.find_element_by_css_selector('textarea').text
            table1.iloc[[i], table1.columns.get_loc('Select')] = surveys
            i += 1
        table1.columns = table.columns
        table = pd.concat([table, table1], ignore_index=True)
        print(table)
        j += 1
    else:
        break

table.to_csv(r"C:\Users\Guest\Desktop\Sample_buldhana.csv", sep=',', encoding='utf-8-sig', index=False)
```
Python 3 code stops at HTTP error and I can't figure out how to handle it
I'm trying to scrape links from the website https://www.usyouthsoccer.org/clubs/club-directory/. Initially, the code broke at the 30th link, so I tried to handle the exception with urllib's HTTPError. Now the script just stops running at the 30th link. I checked that specific URL and it is a bad link. I just want to move past it in the loop, but I'm having trouble with the workaround. Any suggestions would be greatly appreciated...

```python
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser
import pandas as pd
from urllib.request import Request, urlopen
from urllib.error import HTTPError

executable_path = {"executable_path": "chromedriver"}
browser = Browser("chrome", **executable_path, headless=True)

url = 'https://www.usyouthsoccer.org/clubs/club-directory/'
zipcode_input = 'CT_Main_0$txtLocation'
search_button = '//*[@id="CT_Main_0_btnSearch"]'
dropdown = '//*[@id="CT_Main_0_drpMiles"]/option[5]'
zip_codes = [64015]
team_df = pd.DataFrame()
for x in zip_codes:
    try:
        print(f'\n{x}\n')
        url = 'https://www.usyouthsoccer.org/clubs/club-directory/'
        browser.visit(url)
        browser.fill(zipcode_input, x)
        browser.find_by_xpath(dropdown).click()
        browser.find_by_xpath(search_button).click()
        html = browser.html
        soup = bs(html, 'html.parser')
        dallas_urls = soup.find_all(class_="more")
        counter = 1
        for url in dallas_urls:
            print(f'Link {counter} of {len(dallas_urls)}')
            counter += 1
            back_url = url['href']
            front_url = 'https://www.usyouthsoccer.org'
            total_url = front_url + back_url
            browser.visit(total_url)
            my_html = pd.read_html(total_url)
            details_pd = pd.DataFrame(my_html[0])
            details_pd.columns = ['Cols', 'Vals']
            df = details_pd.T
            df.columns = df.iloc[0]
            df.drop('Cols', inplace=True)
            contacts_pd = pd.DataFrame(my_html[1])
            if len(contacts_pd.index) == 1:
                df['Contact_Title'] = contacts_pd.iloc[0, 0]
                df['Contact_Name'] = contacts_pd.iloc[0, 1]
                df['Contact_Email'] = contacts_pd.iloc[0, 2]
            elif len(contacts_pd.index) == 2:
                df['Contact_Title'] = contacts_pd.iloc[0, 0]
                df['Contact_Name'] = contacts_pd.iloc[0, 1]
                df['Contact_Email'] = contacts_pd.iloc[0, 2]
                df['Contact_Title2'] = contacts_pd.iloc[1, 0]
                df['Contact_Name2'] = contacts_pd.iloc[1, 1]
                df['Contact_Email2'] = contacts_pd.iloc[1, 2]
            elif len(contacts_pd.index) == 3:
                df['Contact_Title'] = contacts_pd.iloc[0, 0]
                df['Contact_Name'] = contacts_pd.iloc[0, 1]
                df['Contact_Email'] = contacts_pd.iloc[0, 2]
                df['Contact_Title2'] = contacts_pd.iloc[1, 0]
                df['Contact_Name2'] = contacts_pd.iloc[1, 1]
                df['Contact_Email2'] = contacts_pd.iloc[1, 2]
                df['Contact_Title3'] = contacts_pd.iloc[2, 0]
                df['Contact_Name3'] = contacts_pd.iloc[2, 1]
                df['Contact_Email3'] = contacts_pd.iloc[2, 2]
            team_df = pd.concat([team_df, df])
    except HTTPError as err:
        continue
```
Put your try statement inside the nested for loop. Right now, an HTTP error stops the entire for loop instead of letting it continue to the next link:

```python
for url in dallas_urls:
    try:
        print(f'Link {counter} of {len(dallas_urls)}')
        counter += 1
        back_url = url['href']
        front_url = 'https://www.usyouthsoccer.org'
        total_url = front_url + back_url
        urllib.request.urlretrieve(total_url)
    except urllib.error.HTTPError:
        print('Error')
        continue
```
I'm stumped at looping through a returned list of URLs
This is my first Python project. I'm trying to scrape restaurant inspections. One site has summaries that offer keys to the detailed reports that I want to scrape. I'm stumped at looping through the keyed list of URLs to get the details.

```python
import pandas as pd
import bs4
import datetime
import re
import lxml
from urllib.request import urlopen
from urllib.error import HTTPError

try:
    insp = pd.read_csv("ftp://dbprftp.state.fl.us/pub/llweb/5fdinspi.csv",
                       usecols=[2, 14, 18, 80, 81])
except IOError:
    print("The file is not accessible.")

insp.columns = ["CountyName", "InspectDate", "NumHighVio", "LicenseID", "VisitID"]
# filter for Alachua county restaurants
alachua = insp[insp.CountyName == 'Alachua']
# filter for restaurants that had at least one serious violation
alachua = alachua[alachua.NumHighVio > 0]
# change date string to date object
alachua['InspectDate'] = pd.to_datetime(alachua['InspectDate'])
# sort most recent
alachua = alachua.sort_values('InspectDate', ascending=False)
# prefer to have user set timedelta below:
today = pd.to_datetime('today')
startDay = datetime.date.today() - datetime.timedelta(days=30)
alachua = alachua[(alachua['InspectDate'] > startDay) & (alachua['InspectDate'] < today)]

# takes LicenseID and VisitID, passes it into the urls for detailed reports
for index, rows in alachua.iterrows():
    visitID = rows['VisitID']
    licID = rows['LicenseID']
    urls = "https://www.myfloridalicense.com/inspectionDetail.asp?InspVisitID= %s &licid= %s" % (visitID, licID)
    urls = urls.replace(' ', '')
    print(urls)

## here's my problem:
for url in urls:
    def get_inspect_detail():
        html = urlopen(url)
        soup = bs4.BeautifulSoup(html.read(), 'lxml')
        details = soup.find_all('font', {'face': 'verdana'})[10:]
        for detail in details:
            siteName = details[0].text
            licNum = details[2].text
            siteRank = details[4].text
            expDate = details[6].text
            primeStatus = details[8].text
            secStatus = details[10].text
            siteAddress = details[12].text
            inspectResult = details[20].text
            observed1 = details[34].get_text
            observed2 = details[36].text
            observed3 = details[38].text
            observed4 = details[40].text
            observed5 = details[42].text
            observed6 = details[44].text
            observed7 = details[46].text
            observed8 = details[48].text
            observed9 = details[50].text
            observed10 = details[52].text
            detailsLib = {
                'Restaurant': siteName,
                'License': licNum,
                'Rank': siteRank,
                'Expires': expDate,
                'Primary': primeStatus,
                'Secondary': secStatus,
                'Address': siteAddress,
                'Result': inspectResult,
                'Observed1': observed1,
                'Observed2': observed2,
                'Observed3': observed3,
                'Observed4': observed4,
                'Observed5': observed5,
                'Observed6': observed6,
                'Observed7': observed7,
                'Observed8': observed8,
                'Observed9': observed9,
                'Observed10': observed10
            }
    repr(get_inspect_detail())
```

Probably an obvious mistake or lack of knowledge, but I can get the unscrubbed data for one URL, but not for all.
I don't see a reason to define your function inside the loop; you would end up with a lot of redundant definitions that way. Second, you can just define a result list and accumulate the detailsLib objects inside it:

```python
def get_inspect_detail(url):
    html = urlopen(url)
    soup = bs4.BeautifulSoup(html.read(), 'lxml')
    details = soup.find_all('font', {'face': 'verdana'})[10:]
    result = []
    for detail in details:
        siteName = details[0].text
        licNum = details[2].text
        siteRank = details[4].text
        expDate = details[6].text
        primeStatus = details[8].text
        secStatus = details[10].text
        siteAddress = details[12].text
        inspectResult = details[20].text
        observed1 = details[34].get_text()  # get_text needs to be called
        observed2 = details[36].text
        observed3 = details[38].text
        observed4 = details[40].text
        observed5 = details[42].text
        observed6 = details[44].text
        observed7 = details[46].text
        observed8 = details[48].text
        observed9 = details[50].text
        observed10 = details[52].text
        detailsLib = {
            'Restaurant': siteName,
            'License': licNum,
            'Rank': siteRank,
            'Expires': expDate,
            'Primary': primeStatus,
            'Secondary': secStatus,
            'Address': siteAddress,
            'Result': inspectResult,
            'Observed1': observed1,
            'Observed2': observed2,
            'Observed3': observed3,
            'Observed4': observed4,
            'Observed5': observed5,
            'Observed6': observed6,
            'Observed7': observed7,
            'Observed8': observed8,
            'Observed9': observed9,
            'Observed10': observed10
        }
        result.append(detailsLib)
    return result

for url in urls:
    repr(get_inspect_detail(url))
```