Python Web Scrape Unknown Number of Pages - python-3.x

I have working code that scrapes a single Craigslist page for specific information, but what would I need to add in order to grab the data from ALL of the pages (not knowing how many pages ahead of time)?
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url="https://portland.craigslist.org/search/sss?query=electronics&sort=date"
uClient=uReq(my_url) #sends GET request to URL
page_html=uClient.read() #reads returned data and puts it in a variable
uClient.close() #close the connection
#create a file that we will want later to write parsed data to
filename="ScrapedData.csv"
f=open(filename, 'w')
headers="date, location, title, price\n"
f.write(headers)
#use BS to parse the webpage
page_soup=soup(page_html,'html.parser') #applying BS to the obtained html
containers=page_soup.findAll('p',{'class','result-info'})
for container in containers:
container_date=container.findAll('time',{'class','result-date'})
date=container_date[0].text
try:
container_location=container.findAll('span',{'class','result-hood'})
location=container_location[0].text
except:
try:
container_location=container.findAll('span',{'class','nearby'})
location=container_location[0].text
except:
location='NULL'
container_title=container.findAll('a',{'class','result-title'})
title=container_title[0].text
try:
container_price=container.findAll('span',{'class','result-price'})
price=container_price[0].text
except:
price='NULL'
#to print to screen
print('date:'+date)
print('location:'+location)
print('title:'+title)
print('price:'+price)
#to write to csv
f.write(date+','+location.replace(",","-")+','+title.replace(","," ")+','+price+'\n')
f.close()

Apart from what sir Andersson has already shown, you can do that as well for this site:
import requests
from bs4 import BeautifulSoup
import csv
page_link = "https://portland.craigslist.org/search/sss?s={}&query=electronics&sort=date"
for link in [page_link.format(page) for page in range(0,1147,120)]: #this is the fix
res = requests.get(link)
soup = BeautifulSoup(res.text,'lxml')
for container in soup.select('.result-info'):
try:
date = container.select('.result-date')[0].text
except IndexError:
date = ""
try:
title = container.select('.result-title')[0].text
except IndexError:
title = ""
try:
price = container.select('.result-price')[0].text
except IndexError:
price = ""
print(date,title,price)
with open("craigs_item.csv","a",newline="",encoding="utf-8") as outfile:
writer = csv.writer(outfile)
writer.writerow([date,title,price])

You can try to loop through all pages by handling "s" parameter in URL until you find page with no results (page with text "search and you will find"):
import requests
results_counter = 0
while True:
my_url="https://portland.craigslist.org/search/sss?query=electronics&sort=date&s=%d" % results_counter
page_html = requests.get(my_url).text
if "search and you will find" in page_html:
break
else:
results_counter += 120
filename="ScrapedData.csv"
f=open(filename, 'w')
headers="date, location, title, price\n"
f.write(headers)
page_soup=soup(page_html,'html.parser') #applying BS to the obtained html
containers=page_soup.findAll('p',{'class','result-info'})
...

Related

Getting incorrect link on parsing web page in BeautifulSoup

I'm trying to get the download link from the button in this page. But when I open the download link that I get from my code I get this message
I noticed that if I manually click the button and open the link in a new page the csrfKey part of the link is always same whereas when I run the code I get a different key every time. Here's my code
from bs4 import BeautifulSoup
import requests
import re
def GetPage(link):
source_new = requests.get(link).text
soup_new = BeautifulSoup(source_new, 'lxml')
container_new = soup_new.find_all(class_='ipsButton')
for data_new in container_new:
#print(data_new)
headline = data_new # Display text
match = re.findall('download', str(data_new), re.IGNORECASE)
if(match):
print(f'{headline["href"]}\n')
if __name__ == '__main__':
link = 'https://eci.gov.in/files/file/10985-5-number-and-types-of-constituencies/'
GetPage(link)
Before you get to the actual download links of the files, you need to agree to Terms and Conditions. So, you need to fake this with requests and then parse the next page you get.
Here's how:
import requests
from bs4 import BeautifulSoup
if __name__ == '__main__':
link = 'https://eci.gov.in/files/file/10985-5-number-and-types-of-constituencies/'
with requests.Session() as connection:
r = connection.get("https://eci.gov.in/")
confirmation_url = BeautifulSoup(
connection.get(link).text, 'lxml'
).select_one(".ipsApp .ipsButton_fullWidth")["href"]
fake_agree_to_continue = connection.get(
confirmation_url.replace("?do=download", "?do=download&confirm=1")
).text
download_links = [
a["href"] for a in
BeautifulSoup(
fake_agree_to_continue, "lxml"
).select(".ipsApp .ipsButton_small")[1:]]
for download_link in download_links:
response = connection.get(download_link)
file_name = (
response
.headers["Content-Disposition"]
.replace('"', "")
.split(" - ")[-1]
)
print(f"Downloading: {file_name}")
with open(file_name, "wb") as f:
f.write(response.content)
This should output:
Downloading: Number And Types Of Constituencies.pdf
Downloading: Number And Types Of Constituencies.xls
And save two files: a .pdf and a .xls. The later one looks like this:

How to scrape multiple pages with requests in python

recently started getting into web scraping and i have managed ok but now im stuck and i cant find the answer or figure it out.
Here is my code for scraping and exporting info from a single page
import requests
page = requests.get("https://www.example.com/page.aspx?sign=1")
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')
#finds the right heading to grab
box = soup.find('h1').text
heading = box.split()[0]
#finds the right paragraph to grab
reading = soup.find_all('p')[0].text
print (heading, reading)
import csv
from datetime import datetime
# open a csv file with append, so old data will not be erased
with open('index.csv', 'a') as csv_file:
writer = csv.writer(csv_file)
writer.writerow([heading, reading, datetime.now()])
Problem occurs when i try to scrape multiple pages at the same time.
They are all the same just pagination changes eg
https://www.example.com/page.aspx?sign=1
https://www.example.com/page.aspx?sign=2
https://www.example.com/page.aspx?sign=3
https://www.example.com/page.aspx?sign=4 etc
Instead of writing the same code 20 times how do i stick all the data in a tuple or an array and export to csv.
Many thanks in advance.
Just try it out with a loop, until you got no page available (request is not OK). Should be easy to get.
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime
results = []
page_number = 1
while True:
response = requests.get(f"https://www.example.com/page.aspx?sign={page_number}")
if response.status_code != 200:
break
soup = BeautifulSoup(page.content, 'html.parser')
#finds the right heading to grab
box = soup.find('h1').text
heading = box.split()[0]
#finds the right paragraph to grab
reading = soup.find_all('p')[0].text
# write a list
# results.append([heading, reading, datetime.now()])
# or tuple.. your call
results.append((heading, reading, datetime.now()))
page_number = page_number + 1
with open('index.csv', 'a') as csv_file:
writer = csv.writer(csv_file)
for result in results:
writer.writerow(result)

Writing multiple files as output when webscraping - python bs4

to preface - I am quite new to python and my HTML skills are kindergarten level.
So I am trying to save the quotes from this website which has many links in it for each member of the US Election candidates.
I have managed to get the actual code to extract the quotes (with the help of soem stackoverflow users), but am lost on how to write these quotes in to separate text files for each candidate.
For example, the first page, with all of Justin Amash's quotes should be written to a file: JustinAmash.txt.
The second page, with all of Michael Bennet's quotes should be written to MichaelBennet.txt (or something in that form). and so on.. Is there a way to do this?
For reference, to scrape the pages, the following code works:
import bs4
from urllib.request import Request,urlopen as uReq, HTTPError
#Import HTTPError in order to avoid the links with no content/resource of interest
from bs4 import BeautifulSoup as soup_
import re
#define url of interest
my_url = 'http://archive.ontheissues.org/Free_Trade.htm'
def make_soup(url):
# set up known browser user agent for the request to bypass HTMLError
req=Request(url,headers={'User-Agent': 'Mozilla/5.0'})
#opening up connection, grabbing the page
uClient = uReq(req)
page_html = uClient.read()
uClient.close()
#html is jumbled at the moment, so call html using soup function
soup = soup_(page_html, "lxml")
return soup
# Test: print title of page
#soup.title
soup = make_soup(my_url)
tags = soup.findAll("a" , href=re.compile("javascript:pop\("))
#print(tags)
# open a text file and write it if it doesn't exist
file1 = open("Quotefile.txt","w")
# get list of all URLS
for links in tags:
link = links.get('href')
if "java" in link:
print("http://archive.ontheissues.org" + link[18:len(link)-3])
main_url = "http://archive.ontheissues.org" + link[18:len(link)-3]
try:
sub_soup = make_soup(main_url)
content_collexn = sub_soup.body.contents #Splitting up the page into contents for iterative access
#text_data = [] #This list can be used to store data related to every person
for item in content_collexn:
#Accept an item if it belongs to the following classes
if(type(item) == str):
print(item.get_text())
elif(item.name == "h3"):
#Note that over here, every h3 tagged title has a string following it
print(item.get_text())
#Hence, grab that string too
print(item.next_sibling)
elif(item.name in ["p", "ul", "ol"]):
print(item.get_text())
except HTTPError: #Takes care of missing pages and related HTTP exception
print("[INFO] Resource not found. Skipping to next link.")
#print(text_data)
You can store that text data into the list you had started with text_data. Join all those items and then write to file:
So something like:
import bs4
from urllib.request import Request,urlopen as uReq, HTTPError
#Import HTTPError in order to avoid the links with no content/resource of interest
from bs4 import BeautifulSoup as soup_
import re
#define url of interest
my_url = 'http://archive.ontheissues.org/Free_Trade.htm'
def make_soup(url):
# set up known browser user agent for the request to bypass HTMLError
req=Request(url,headers={'User-Agent': 'Mozilla/5.0'})
#opening up connection, grabbing the page
uClient = uReq(req)
page_html = uClient.read()
uClient.close()
#html is jumbled at the moment, so call html using soup function
soup = soup_(page_html, "lxml")
return soup
# Test: print title of page
#soup.title
soup = make_soup(my_url)
tags = soup.findAll("a" , href=re.compile("javascript:pop\("))
#print(tags)
# open a text file and write it if it doesn't exist
#file1 = open("Quotefile.txt","w")
# get list of all URLS
candidates = []
for links in tags:
link = links.get('href')
if "java" in link:
#print("http://archive.ontheissues.org" + link[18:len(link)-3])
main_url = "http://archive.ontheissues.org" + link[18:len(link)-3]
candidate = link.split('/')[-1].split('_Free_Trade')[0]
if candidate in candidates:
continue
else:
candidates.append(candidate)
try:
sub_soup = make_soup(main_url)
content_collexn = sub_soup.body.contents #Splitting up the page into contents for iterative access
text_data = [] #This list can be used to store data related to every person
for item in content_collexn:
#Accept an item if it belongs to the following classes
if(type(item) == str):
#print(item.get_text())
text_data.append(item.get_text())
elif(item.name == "h3"):
#Note that over here, every h3 tagged title has a string following it
#print(item.get_text())
text_data.append(item.get_text())
#Hence, grab that string too
#print(item.next_sibling)
text_data.append(item.next_sibling)
elif(item.name in ["p", "ul", "ol"]):
#print(item.get_text())
text_data.append(item.get_text())
except HTTPError: #Takes care of missing pages and related HTTP exception
print("[INFO] Resource not found. Skipping to next link.")
candidates.remove(candidate)
continue
text_data = '\n'.join(text_data)
with open("C:/%s.txt" %(candidate), "w") as text_file:
text_file.write(text_data)
print('Aquired: %s' %(candidate))

How to scrape all the test match details in cricinfo

I am trying to scrape all the test match details but it is showing HTTP Error 504: Gateway Timeout I am getting the details of test matches but it is not showing this is my code i have used bs4 to scrape the test match details from cricinfo
I need to scrape the details of 2000 test matches this is my code
import urllib.request as req
BASE_URL = 'http://www.espncricinfo.com'
if not os.path.exists('./espncricinfo-fc'):
os.mkdir('./espncricinfo-fc')
for i in range(0, 2000):
soupy = BeautifulSoup(urllib2.urlopen('http://search.espncricinfo.com/ci/content/match/search.html?search=test;all=1;page=' + str(i)).read())
time.sleep(1)
for new_host in soupy.findAll('a', {'class' : 'srchPlyrNmTxt'}):
try:
new_host = new_host['href']
except:
continue
odiurl =BASE_URL + urljoin(BASE_URL,new_host)
new_host = unicodedata.normalize('NFKD', new_host).encode('ascii','ignore')
print(new_host)
html = req.urlopen(odiurl).read()
if html:
with open('espncricinfo-fc/{0!s}'.format(str.split(new_host, "/")[4]), "wb") as f:
f.write(html)
print(html)
else:
print("no html")
this is usually happen when doing multiple request too fast, it can be the server is down or your connection blocked by server firewall, try increase your sleep() or add random sleep.
import random
.....
for i in range(0, 2000):
soupy = BeautifulSoup(....)
time.sleep(random.randint(2,6))
not sure why, seems to be working for me.
I made a few changes in the loop through the links. I'm not sure how you're wanting the output to look in terms of writing it to your file, so I left that part alone. But like I said, seems to be working ok on my end.
import bs4
import requests
import os
import time
import urllib.request as req
BASE_URL = 'http://www.espncricinfo.com'
if not os.path.exists('C:/espncricinfo-fc'):
os.mkdir('C:/espncricinfo-fc')
for i in range(0, 2000):
i=0
url = 'http://search.espncricinfo.com/ci/content/match/search.html?search=test;all=1;page=%s' %i
html = requests.get(url)
print ('Checking page %s of 2000' %(i+1))
soupy = bs4.BeautifulSoup(html.text, 'html.parser')
time.sleep(1)
for new_host in soupy.findAll('a', {'class' : 'srchPlyrNmTxt'}):
try:
new_host = new_host['href']
except:
continue
odiurl = BASE_URL + new_host
new_host = odiurl
print(new_host)
html = req.urlopen(odiurl).read()
if html:
with open('C:/espncricinfo-fc/{0!s}'.format('_'.join(str.split(new_host, "/")[4:])), "wb") as f:
f.write(html)
#print(html)
else:
print("no html")

Python Web Scraping using BS

I have a web scraping program that gets multiple pages, but I have to set the while loop to a number. I want to make a condition that stops the loop once it reaches the last page or recognizes there are no more items to scrape. Assume I don't know how many pages exist. How do I change the while loop condition to make it stop without putting a random number?
import requests
from bs4 import BeautifulSoup
import csv
filename="output.csv"
f=open(filename, 'w', newline="",encoding='utf-8')
headers="Date, Location, Title, Price\n"
f.write(headers)
i=0
while i<5000:
if i==0:
page_link="https://portland.craigslist.org/search/sss?query=xbox&sort=date"
else:
page_link="https://portland.craigslist.org/search/sss?s={}&query=xbox&sort=date".format(i)
res=requests.get(page_link)
soup=BeautifulSoup(res.text,'html.parser')
for container in soup.select('.result-info'):
date=container.select('.result-date')[0].text
try:
location=container.select('.result-hood')[0].text
except:
try:
location=container.select('.nearby')[0].text
except:
location='NULL'
title=container.select('.result-title')[0].text
try:
price=container.select('.result-price')[0].text
except:
price="NULL"
print(date,location,title,price)
f.write(date+','+location.replace(","," ")+','+title.replace(","," ")+','+price+'\n')
i+=120
f.close()
I use while True to run endless loop and break to exit when there is no data
data = soup.select('.result-info')
if not data:
print('END: no data:')
break
I use module csv to save data so I don't have to use replace(","," ").
It will put text in " " if there is , in text.
s={} can be in any place after ? so I put it at the end to make code more readable.
Portal gives first page even if you use s=0 so I don't have to check i == 0
(BTW: in my code it has more readable name offset)
Full code.
import requests
from bs4 import BeautifulSoup
import csv
filename = "output.csv"
f = open(filename, 'w', newline="", encoding='utf-8')
csvwriter = csv.writer(f)
csvwriter.writerow( ["Date", "Location", "Title", "Price"] )
offset = 0
while True:
print('offset:', offset)
url = "https://portland.craigslist.org/search/sss?query=xbox&sort=date&s={}".format(offset)
response = requests.get(url)
if response.status_code != 200:
print('END: request status:', response.status)
break
soup = BeautifulSoup(response.text, 'html.parser')
data = soup.select('.result-info')
if not data:
print('END: no data:')
break
for container in data:
date = container.select('.result-date')[0].text
try:
location = container.select('.result-hood')[0].text
except:
try:
location = container.select('.nearby')[0].text
except:
location = 'NULL'
#location = location.replace(","," ") # don't need it with `csvwriter`
title = container.select('.result-title')[0].text
try:
price = container.select('.result-price')[0].text
except:
price = "NULL"
#title.replace(",", " ") # don't need it with `csvwriter`
print(date, location, title, price)
csvwriter.writerow( [date, location, title, price] )
offset += 120
f.close()

Resources