So far I can scrape the initial page and save it. What I'm trying to do is use the page count shown on the site to determine the number of loops. The page count is grabbed at the 'count =' line in my code, and in this case it is 18. How can I loop my code to scrape and save each page?
Secondly, my code scrapes each URL 3 times. Is there a way to avoid the duplicates?
Lastly, I'm using 'strip' to get the dynamic integer for the loop. The element returns the text: Viewing page 1 of 18. Using 'strip' returns the correct number when the last number is a single digit; in this case, since there are two digits (18), it only returns the 8. Can't figure that one out for the life of me.
Appreciate the help.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import re
import csv
chrome_driver = "C:/chromedriver.exe"
Chrome_options = Options()
Chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9015")
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(chrome_driver, options=Chrome_options)
source = driver.page_source
soup = BeautifulSoup(source, "html.parser")
### set zipcode and search length ###
zipcode = "84105"
search = "1yr" #search option: 1mo 3mo 6mo 1yr 2yr 3yr All
url = 'https://www.redfin.com/zipcode/' + zipcode + '/filter/include=sold-' + search
https = "https://www.redfin.com"
driver.get(url)
#####################################
### get page count ###
count = soup.find('span', class_='pageText').get_text() #grabs total pages to grab
pages = count.strip('Viewing page 1 of') #gives a number of pages to paginate
print("This search has " + pages + " pages" + ": " + zipcode)
print(url)
########################
data = []
for url in soup.find_all('a', attrs={'href': re.compile("^/UT/")}):
    print(https + url['href'])
    data.append(https + url['href'])

with open("links.csv", 'a') as csvfile:
    write = csv.writer(csvfile, delimiter=' ')
    write.writerows(data)
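As a side note on the 'strip' issue above: str.strip treats its argument as a set of characters to remove from both ends, not as a prefix string, so the '1' in 'Viewing page 1 of' also eats the leading 1 of 18. Splitting on whitespace (or a regex capture) avoids this; a minimal sketch:
import re

text = "Viewing page 1 of 18"
pages = int(text.split()[-1])                          # take the last whitespace-separated token -> 18
pages = int(re.search(r"of\s+(\d+)", text).group(1))   # or capture the number after "of" -> 18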
Just noticed that you want to loop without duplicates:
import requests
from bs4 import BeautifulSoup
import csv
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'
}


def main(url):
    with requests.Session() as req:
        print("Extracting Page# 1")
        r = req.get(url.format("1"), headers=headers)
        soup = BeautifulSoup(r.content, 'html.parser')
        # the last token of "Viewing page 1 of 18" is the total page count
        total = int(soup.select_one("span.pageText").text.split(" ")[-1]) + 1
        urls = [f'{url[:22]}{a.get("href")}' for a in soup.select("a.slider-item")]
        for page in range(2, total):
            print(f"Extracting Page# {page}")
            r = req.get(url.format(page), headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            links = [f'{url[:22]}{a.get("href")}' for a in soup.select("a.slider-item")]
            urls.extend(links)
        mylist = list(dict.fromkeys(urls))  # drop duplicate links, keep order
        with open("links.csv", 'w', newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["Links"])
            writer.writerows(zip(mylist))


main("https://www.redfin.com/zipcode/84105/filter/include=sold-1yr/page-{}")
I am currently web scraping and would like to get the specifications on the same row. When I print it, column 2 currently looks like this:
text
text
text
text
text
I would like to get it all on the same row, like this:
text text text text text
so I can later chop it up into different columns in Excel.
Is there maybe a transposing command I could use, or something else?
Code:
import requests
from bs4 import BeautifulSoup
import csv
with open('Oslo.csv', 'w', newline='') as f:
    fieldnames = ['column1', 'column2']
    skriver = csv.DictWriter(f, fieldnames=fieldnames)
    skriver.writeheader()

    def data(page_number):
        URL = 'https://www.url.com/' + str(page_number) + '&sort=PUBLISHED_DESC'
        page = requests.get(URL)
        soup = BeautifulSoup(page.content, 'html.parser')
        ads = soup.findAll('h2', class_="ads__unit__content__title ads__unit__content__title--fav-placeholder")
        for data in ads:
            id = data.find('a')
            link = (id['id'])
            url = 'https://www.url.com/' + str(link)
            page = requests.get(url)
            soup = BeautifulSoup(page.content, 'html.parser')
            ads = soup.findAll('div', class_="u-word-break")
            for stats in ads:
                address = stats.find('p', class_="u-caption")
                specs = stats.find('dl', class_="definition-list definition-list--cols1to2")
                skriver.writerow({'column1': address.text.strip(), 'column2': specs.text})

    for x in range(1, 2):
        data(x)
    print('Ferdig, du kan åpne oslo.csv')
EDIT: Scraping from the website is illegal, so I removed the URL.
Your specs.text is a string that contains \n newlines. You can split it, then join it back with just a space, i.e. ' '.join(specs.text.split()).
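A quick demonstration of the split-then-join idea:
text = "text\ntext\ntext\ntext\ntext"
print(' '.join(text.split()))  # text text text text text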
import requests
from bs4 import BeautifulSoup
import csv
with open('Oslo.csv', 'w', newline='') as f:
    fieldnames = ['column1', 'column2']
    skriver = csv.DictWriter(f, fieldnames=fieldnames)
    skriver.writeheader()

    def data(page_number):
        URL = 'https://www.url.com/' + str(page_number) + '&sort=PUBLISHED_DESC'
        page = requests.get(URL)
        soup = BeautifulSoup(page.content, 'html.parser')
        ads = soup.findAll('h2', class_="ads__unit__content__title ads__unit__content__title--fav-placeholder")
        for data in ads:
            id = data.find('a')
            link = (id['id'])
            url = 'https://www.url.com/' + str(link)
            page = requests.get(url)
            soup = BeautifulSoup(page.content, 'html.parser')
            ads = soup.findAll('div', class_="u-word-break")
            for stats in ads:
                address = stats.find('p', class_="u-caption")
                specs = stats.find('dl', class_="definition-list definition-list--cols1to2")
                address = ' '.join(address.text.split())
                specs = ' '.join(specs.text.split())  # <-- changed here
                skriver.writerow({'column1': address, 'column2': specs})

    for x in range(1, 2):
        data(x)
    print('Ferdig, du kan åpne oslo.csv')  # "Done, you can open oslo.csv"
I am working on web scraping: I take names line by line from a text file, search each one on Google, and scrape addresses from the results. I want to add each result next to its respective name. This is my text file a.txt:
0.5BN FINHEALTH PRIVATE LIMITED
01 SYNERGY CO.
1 BY 0 SOLUTIONS
and this is my code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
out_fl = open('a.txt', 'r')
for line in out_fl:
    query = line
    query = query.replace(' ', '+')
    # print(line)
    URL = f"https://google.com/search?q={query}"
    # print(URL)
    headers = {"user-agent": USER_AGENT}
    resp = requests.get(URL, headers=headers)
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.content, "html.parser")
        results = []
        for g in soup.find_all('div', class_="i4J0ge"):
            address = soup.find('span', class_="LrzXr")
            if address:
                address = (address.text)
            else:
                print("Not found")
            phone = soup.find('span', class_="LrzXr zdqRlf kno-fv")
            if phone:
                phone = (phone.text)
            else:
                print("None")
            company = line
            item = {"company": line.replace('\n', ''), "Address": address, "Phone": phone}
            # print(item)
            results.append(item)
            print(results)
        df = pd.DataFrame(results, columns=["company", "Address", "Phone"])
        df.to_excel('filename.xlsx', sheet_name='sheet name', index=False)
out_fl.close()
And I don't know where it gets overwritten; please help me out. Thanks.
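The overwrite happens because results = [] is re-created and df.to_excel(...) is re-run for every line of a.txt, so the spreadsheet only ever holds the last company. A minimal sketch of a structure that avoids this, with placeholder values standing in for the Google scrape above, building the list once and writing once at the end:

import pandas as pd

results = []  # build the list once, outside the loop
with open('a.txt', 'r') as out_fl:
    for line in out_fl:
        company = line.strip()
        # placeholder values; the real address/phone would come from the Google scrape above
        address, phone = "address here", "phone here"
        results.append({"company": company, "Address": address, "Phone": phone})

# write a single Excel file after all lines have been processed
df = pd.DataFrame(results, columns=["company", "Address", "Phone"])
df.to_excel('filename.xlsx', sheet_name='results', index=False)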
Try this, but you will have to adjust how the results are stored to suit your needs. The idea is to create the DataFrame once, before the loop, and add one row per query:
import requests
from bs4 import BeautifulSoup
import pandas as pd
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
df = pd.DataFrame(columns=["company", "result"])  # start with an empty frame
out_fl = open('a.txt', 'r')
for line in out_fl:
    query = line
    query = query.replace(' ', '+')
    # print(line)
    URL = f"https://google.com/search?q={query}"
    # print(URL)
    headers = {"user-agent": USER_AGENT}
    resp = requests.get(URL, headers=headers)
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.content, "html.parser")
        results = []
        for g in soup.find_all('div', class_="i4J0ge"):
            address = soup.find('span', class_="LrzXr")
            if address:
                address = (address.text)
            else:
                print("Not found")
            phone = soup.find('span', class_="LrzXr zdqRlf kno-fv")
            if phone:
                phone = (phone.text)
            else:
                print("None")
            company = line
            item = {"company": line.replace('\n', ''), "Address": address, "Phone": phone}
            # print(item)
            results.append(item)
        print(results)
        df.loc[query] = [query, results]  # one row per company

df.to_excel("results.xlsx", sheet_name="result", index=False)
This script will produce a CSV with companies/phones from your input file a.txt:
import requests
import pandas as pd
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0'}
with open('a.txt', 'r') as f_in:
    companies = [line.strip() for line in f_in if line.strip()]

all_data = []
for company in companies:
    print(company)
    soup = BeautifulSoup(requests.get('https://google.com/search', params={'q': company, 'hl': 'en'}, headers=headers).content, 'html.parser')

    address = soup.select_one('.LrzXr')
    if address:
        address = address.text
    else:
        address = 'Not Found'

    phone = soup.select_one('.LrzXr.zdqRlf.kno-fv')
    if phone:
        phone = phone.text
    else:
        phone = 'Not Found'

    all_data.append({"Company": company, "Address": address, "Phone": phone})

df = pd.DataFrame(all_data)
df.to_csv('data.csv')
Produces data.csv (screenshot from LibreOffice not reproduced here).
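If the extra unnamed index column in data.csv is not wanted, index=False drops it:
df.to_csv('data.csv', index=False)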
Problem:
Don't know if my google-fu is failing me again, but I am unable to download CSVs from a list of URLs. I have used requests and bs4 to gather the URLs (the final list is correct) - see the process below for more info.
I then followed one of the answers given here, using urllib to download: Trying to download data from URL with CSV File, as well as a number of other Stack Overflow Python answers for downloading CSVs.
Currently I am stuck with an
HTTP Error 404: Not Found
(the stack trace below is from the last attempt, where a User-Agent was passed)
----> 9 f = urllib.request.urlopen(req)
10 print(f.read().decode('utf-8'))
#other lines
--> 650 raise HTTPError(req.full_url, code, msg, hdrs, fp)
651
652 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 404: Not Found
I tried the solution here of adding a User-Agent: Web Scraping using Python giving HTTP Error 404: Not Found - though I would have expected a 403, not a 404 error code, it seems to have worked for a number of OPs.
This still failed with the same error. I am pretty sure I can solve this by simply using selenium and passing the CSV URLs to .get, but I want to know if I can solve this with requests alone.
Outline:
I visit this page:
https://digital.nhs.uk/data-and-information/publications/statistical/patients-registered-at-a-gp-practice
I grab all the monthly version links, e.g. Patients Registered at a GP Practice May 2019, then visit each of those pages and grab all the CSV links within.
I loop over the final dictionary of filename:download_url pairs, attempting to download the files.
Question:
Can anyone see what I am doing wrong, or how to fix this so I can download the files without resorting to selenium? I'm also unsure of the most efficient way to accomplish this - perhaps urllib is not actually required at all and requests alone will suffice?
Python:
Without user-agent:
import requests
from bs4 import BeautifulSoup as bs
import urllib.request

base = 'https://digital.nhs.uk/'
all_files = []

with requests.Session() as s:
    r = s.get('https://digital.nhs.uk/data-and-information/publications/statistical/patients-registered-at-a-gp-practice')
    soup = bs(r.content, 'lxml')
    links = [base + item['href'] for item in soup.select('.cta__button')]

    for link in links:
        r = s.get(link)
        soup = bs(r.content, 'lxml')
        file_links = {item.text.strip().split('\n')[0]: base + item['href'] for item in soup.select('[href$=".csv"]')}
        if file_links:
            all_files.append(file_links)  # ignore empty dicts as for some months there is no data yet
        else:
            print('no data : ' + link)

all_files = {k: v for d in all_files for k, v in d.items()}  # flatten list of dicts to single dict
path = r'C:\Users\User\Desktop'

for k, v in all_files.items():
    # print(k, v)
    print(v)
    response = urllib.request.urlopen(v)
    html = response.read()
    with open(path + '\\' + k + '.csv', 'wb') as f:
        f.write(html)
    break  # as only need one test case
Test with adding User-Agent:
req = urllib.request.Request(
    v,
    data=None,
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
    }
)
f = urllib.request.urlopen(req)
print(f.read().decode('utf-8'))
Looking at the values, your links are coming out like this:
https://digital.nhs.uk/https://files.digital.nhs.uk/publicationimport/pub13xxx/pub13932/gp-reg-patients-04-2014-lsoa.csv
The CSV href values on the monthly pages are already absolute URLs, so prefixing them with base produces an invalid address, which is what triggers the 404. I think you want to drop the base +, so use this:
file_links = {item.text.strip().split('\n')[0]:item['href'] for item in soup.select('[href$=".csv"]')}
instead of:
file_links = {item.text.strip().split('\n')[0]:base + item['href'] for item in soup.select('[href$=".csv"]')}
Edit: Full Code:
import requests
from bs4 import BeautifulSoup as bs
base = 'https://digital.nhs.uk/'
all_files = []
with requests.Session() as s:
    r = s.get('https://digital.nhs.uk/data-and-information/publications/statistical/patients-registered-at-a-gp-practice')
    soup = bs(r.content, 'lxml')
    links = [base + item['href'] for item in soup.select('.cta__button')]

    for link in links:
        r = s.get(link)
        soup = bs(r.content, 'lxml')
        file_links = {item.text.strip().split('\n')[0]: item['href'] for item in soup.select('[href$=".csv"]')}
        if file_links:
            all_files.append(file_links)  # ignore empty dicts as for some months there is no data yet
        else:
            print('no data : ' + link)

all_files = {k: v for d in all_files for k, v in d.items()}  # flatten list of dicts to single dict
path = 'C:/Users/User/Desktop/'

for k, v in all_files.items():
    # print(k, v)
    print(v)
    response = requests.get(v)
    html = response.content
    k = k.replace(':', ' -')
    file = path + k + '.csv'
    with open(file, 'wb') as f:
        f.write(html)
    break  # as only need one test case
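If some href values were relative while others were absolute, urllib.parse.urljoin would handle both cases without having to decide whether to prepend base by hand; a small sketch (the href values here are just illustrations taken from above):

from urllib.parse import urljoin

base = 'https://digital.nhs.uk/'
hrefs = [
    '/data-and-information/publications/statistical/patients-registered-at-a-gp-practice',  # relative
    'https://files.digital.nhs.uk/publicationimport/pub13xxx/pub13932/gp-reg-patients-04-2014-lsoa.csv',  # already absolute
]
for href in hrefs:
    print(urljoin(base, href))  # relative paths get the base prepended; absolute URLs pass through unchanged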
I have been using Beautiful Soup for parsing webpages for some data extraction. It has worked perfectly well for me so far, for other webpages. However, I'm trying to count the number of <a> tags in this page:
from bs4 import BeautifulSoup
import requests
catsection = "cricket"
url_base = "http://www.dnaindia.com/"
i = 89
url = url_base + catsection + "?page=" + str(i)
print(url)
#This is the page I'm trying to parse and also the one in the hyperlink
#I get the correct url i'm looking for at this stage
r = requests.get(url)
data = r.text
soup = BeautifulSoup(data, 'html.parser')
j = 0
for num in soup.find_all('a'):
    j = j + 1
print(j)
I'm getting the output as 0. This makes me think that the two lines after r = requests.get(url) are probably not working (there's obviously no chance there are zero <a> tags on the page), and I'm not sure what alternative solution I can use here. Does anybody have a solution, or has anyone faced a similar kind of problem before?
Thanks in advance.
You need to pass some information along with the request to the server.
The following code should work; you can play around with the other parameters as well:
from bs4 import BeautifulSoup
import requests
catsection = "cricket"
url_base = "http://www.dnaindia.com/"
i = 89
url = url_base + catsection + "?page=" + str(i)
print(url)
headers = {
    'User-agent': 'Mozilla/5.0'
}
#This is the page I'm trying to parse and also the one in the hyperlink
#I get the correct url i'm looking for at this stage
r = requests.get(url, headers=headers)
data = r.text
soup = BeautifulSoup(data, 'html.parser')
j = 0
for num in soup.find_all('a'):
    j = j + 1
print(j)
Put any url in the parser and check the number of "a" tags available on that page:
from bs4 import BeautifulSoup
import requests
url_base = "http://www.dnaindia.com/cricket?page=1"
res = requests.get(url_base, headers={'User-agent': 'Existed'})
soup = BeautifulSoup(res.text, 'html.parser')
a_tag = soup.select('a')
print(len(a_tag))
So I'm trying to get all the statistics from the statistics box on the page for each team. An example of what the page looks like is at the hyperlink below. I'm trying to have it print out:
month : win %
month : win %
All time: win%
But I am not too sure how to write that code, since the last piece of code I wrote in main was giving me an error.
http://www.gosugamers.net/counterstrike/teams/16448-nasty-gravy-runners
import time
import requests
from bs4 import BeautifulSoup
def get_all(url, base):  # when called, it will generate all the team links
    r = requests.get(url)
    page = r.text
    soup = BeautifulSoup(page, 'html.parser')
    for team_links in soup.select('div.details h3 a'):
        members = int(team_links.find_next('th', text='Members:').find_next_sibling('td').text.strip().split()[0])
        if members < 5:
            continue
        yield base + team_links['href']

    next_page = soup.find('div', {'class': 'pages'}).find('span', text='Next')
    while next_page:
        # give the server a break
        time.sleep(0.2)
        r = requests.get(BASE_URL + next_page.find_previous('a')['href'])
        page = r.text
        soup = BeautifulSoup(page)
        for team_links in soup.select('div.details h3 a'):
            yield BASE_URL + team_links['href']
        next_page = soup.find('div', {'class': 'pages'}).find('span', text='Next')


if __name__ == '__main__':
    BASE_URL = 'http://www.gosugamers.net'
    URL = 'http://www.gosugamers.net/counterstrike/teams'
    for links in get_all(URL, BASE_URL):  # when run it will generate all the links for all the teams
        r = requests.get(links)
        page = r.content
        soup = BeautifulSoup(page)
        for statistics in soup.select('div.statistics tr'):
            win_rate = int(statistics.find('th', text='Winrate:').find_next_sibling('td'))
            print(win_rate)
Not sure exactly what you want but this will get all the team stats:
from bs4 import BeautifulSoup, Tag
import requests
soup = BeautifulSoup(requests.get("http://www.gosugamers.net/counterstrike/teams/16448-nasty-gravy-runners").content)
table = soup.select_one("table.stats-table")
head1 = [th.text.strip() for th in table.select("tr.header th") if th.text]
head2 = [th.text.strip() for th in table.select_one("tr + tr") if isinstance(th, Tag)]
scores = [th.text.strip() for th in table.select_one("tr + tr + tr") if isinstance(th, Tag)]
print(head1, head2, scores)
Output:
([u'Jun', u'May', u'All time'], [u'Winrate:', u'0%', u'0%', u'0%'], [u'Matches played:', u'0 / 0 / 0', u'0 / 0 / 0', u'0 / 0 / 0'])
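To get the month : win % lines the question asks for, the month headers and the Winrate row from the output above can be zipped together; a sketch that reuses head1 and head2 from the snippet above and assumes the same page structure:

months = head1        # e.g. ['Jun', 'May', 'All time']
winrates = head2[1:]  # drop the leading 'Winrate:' label
for month, rate in zip(months, winrates):
    print(f"{month}: {rate}")  # Jun: 0%, May: 0%, All time: 0%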