Python Web Scraping using BS - python-3.x

I have a web scraping program that gets multiple pages, but I have to cap the while loop at a fixed number. I want a condition that stops the loop once it reaches the last page or recognizes there are no more items to scrape. Assume I don't know how many pages exist. How do I change the while loop condition so it stops without hard-coding an arbitrary number?
import requests
from bs4 import BeautifulSoup
import csv

filename = "output.csv"
f = open(filename, 'w', newline="", encoding='utf-8')
headers = "Date, Location, Title, Price\n"
f.write(headers)

i = 0
while i < 5000:
    if i == 0:
        page_link = "https://portland.craigslist.org/search/sss?query=xbox&sort=date"
    else:
        page_link = "https://portland.craigslist.org/search/sss?s={}&query=xbox&sort=date".format(i)
    res = requests.get(page_link)
    soup = BeautifulSoup(res.text, 'html.parser')
    for container in soup.select('.result-info'):
        date = container.select('.result-date')[0].text
        try:
            location = container.select('.result-hood')[0].text
        except:
            try:
                location = container.select('.nearby')[0].text
            except:
                location = 'NULL'
        title = container.select('.result-title')[0].text
        try:
            price = container.select('.result-price')[0].text
        except:
            price = "NULL"
        print(date, location, title, price)
        f.write(date + ',' + location.replace(",", " ") + ',' + title.replace(",", " ") + ',' + price + '\n')
    i += 120
f.close()

I use while True to run an endless loop and break to exit when there is no data:
data = soup.select('.result-info')
if not data:
    print('END: no data:')
    break
I use the csv module to save the data, so I don't have to use replace(",", " ").
It will put the text in " " if there is a , in the text.
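For example, a minimal sketch of that quoting behaviour (the sample row here is made up):
import csv, io

buf = io.StringIO()
writer = csv.writer(buf)
# a field containing a comma gets wrapped in quotes automatically
writer.writerow(["2019-04-01", "SE Portland, Hawthorne", "Xbox One, boxed", "$150"])
print(buf.getvalue())
# 2019-04-01,"SE Portland, Hawthorne","Xbox One, boxed",$150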
The s={} parameter can go anywhere after the ?, so I put it at the end to make the code more readable.
The portal gives the first page even if you use s=0, so I don't have to check i == 0.
(BTW: in my code it has the more readable name offset.)
Full code:
import requests
from bs4 import BeautifulSoup
import csv

filename = "output.csv"
f = open(filename, 'w', newline="", encoding='utf-8')
csvwriter = csv.writer(f)
csvwriter.writerow(["Date", "Location", "Title", "Price"])

offset = 0
while True:
    print('offset:', offset)
    url = "https://portland.craigslist.org/search/sss?query=xbox&sort=date&s={}".format(offset)
    response = requests.get(url)
    if response.status_code != 200:
        print('END: request status:', response.status_code)
        break
    soup = BeautifulSoup(response.text, 'html.parser')
    data = soup.select('.result-info')
    if not data:
        print('END: no data:')
        break
    for container in data:
        date = container.select('.result-date')[0].text
        try:
            location = container.select('.result-hood')[0].text
        except IndexError:
            try:
                location = container.select('.nearby')[0].text
            except IndexError:
                location = 'NULL'
        # location = location.replace(",", " ")  # don't need it with `csvwriter`
        title = container.select('.result-title')[0].text
        try:
            price = container.select('.result-price')[0].text
        except IndexError:
            price = "NULL"
        # title = title.replace(",", " ")  # don't need it with `csvwriter`
        print(date, location, title, price)
        csvwriter.writerow([date, location, title, price])
    offset += 120
f.close()

Related

Sending GET requests to amazon.in but the web server responds with a 503 response code, what to do?

This whole script worked fine the first 2-3 times, but now it constantly gets 503 responses. I checked my internet connection multiple times and there was no problem with it. Here is my code:
from bs4 import BeautifulSoup
import requests, sys, os, json

def get_amazon_search_page(search):
    search = search.strip().replace(" ", "+")
    for i in range(3):  # tries to connect and get request the amazon 3 times
        try:
            print("Searching...")
            # search string will be manipulated by replacing all spaces with "+" in order to search from the website itself
            response = requests.get("https://www.amazon.in/s?k={}&ref=nb_sb_noss".format(search))
            print(response.status_code)
            if response.status_code == 200:
                return response.content, search
        except Exception:
            pass
    print("Is the search valid for the site: https://www.amazon.in/s?k={}&ref=nb_sb_noss".format(search))
    sys.exit(1)

def get_items_from_page(page_content):
    print(page_content)
    soup = BeautifulSoup(page_content, "html.parser")  # soup for extracting information
    items = soup.find_all("span", class_="a-size-medium a-color-base a-text-normal")
    prices = soup.find_all("span", class_="a-price-whole")
    item_list = []
    total_price_of_all = 0
    for item, price in zip(items, prices):
        dict = {}
        dict["Name"] = item.text
        dict["Price"] = int(price.text)
        total_price_of_all += int(price.text.replace(",", ""))
        item_list.append(dict)
    average_price = total_price_of_all / len(item_list)
    file = open("items.json", "w")
    json.dump(item_list, file, indent=4)
    print("Your search results are available in the items.json file")
    print("Average prices for the search: {}".format(average_price))
    file.close()

def main():
    os.system("clear")
    print("Note: Sometimes amazon site misbehaves by sending 503 responses, this can be due to heavy traffic on that site, please cooperate\n\n")
    search = input("Enter product name: ").strip()
    page_content = get_amazon_search_page(search)
    get_items_from_page(page_content)

if __name__ == "__main__":
    while True:
        main()
Please help!
The server is blocking you from scraping it.
If you check the robots.txt, you can see that the link you are trying to request is disallowed:
Disallow: */s?k=*&rh=n*p_*p_*p_
However, a simple way to bypass this blocking would be to change your User-Agent (see here). By default, requests sends something like "python-requests/2.22.0". Changing it to something more browser-like will work, at least temporarily.
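For example, a minimal sketch of setting the header with requests (the exact User-Agent string is just an illustrative browser value, not a required one):
import requests

headers = {
    "User-Agent": ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/90.0 Safari/537.36")  # example browser string
}
response = requests.get("https://www.amazon.in/s?k=laptop&ref=nb_sb_noss", headers=headers)
print(response.status_code)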

How to keep these parsed elements from repeating - BeautifulSoup

Which function (or other approach) is best so that these nicknames do not repeat in my parser? I don't know how to do that. I'll be very grateful if you help me.
Source:
from urllib.request import urlopen as uReq
from urllib.request import Request
from bs4 import BeautifulSoup as soup

# save all the nicknames to 'CSV' file format
filename = "BattlePassNicknames.csv"
f = open(filename, "a", encoding="utf-8")
headers1 = "Member of JAZE Battle Pass 2019\n"
b = 1
if b < 2:
    f.write(headers1)
    b += 1

# start page
i = 1
while True:
    # disable jaze guard. turn off html 'mod_security'
    link = 'https://jaze.ru/forum/topic?id=50&page=' + str(i)
    my_url = Request(
        link,
        headers={'User-Agent': 'Mozilla/5.0'}
    )
    i += 1  # increment page no for next run
    uClient = uReq(my_url)
    # Check if there was a redirect
    if uClient.url != link:
        break
    page_html = uClient.read()
    uClient.close()
    # html parsing
    page_soup = soup(page_html, "html.parser")
    # grabs each name of player
    containers = page_soup.findAll("div", {"class": "top-area"})
    for container in containers:
        playerName = container.div.a.text.strip()
        print("BattlePass PlayerName: " + playerName)
        f.write(playerName + "\n")
You can add all the names to a set.
A set object is an unordered collection of distinct hashable objects. Common uses include membership testing, removing duplicates from a sequence, and computing mathematical operations such as intersection, union, difference, and symmetric difference.
my_set = set()
# Let's add some elements to a set
my_set.add('a')
my_set.add('b')
print(my_set) # prints {'a', 'b'}
# Add one more 'a'
my_set.add('a')
print(my_set) # still prints {'a', 'b'} !
In your case, let's add all the names to a set and then write to the file after the for loop.
from urllib.request import urlopen as uReq
from urllib.request import Request
from bs4 import BeautifulSoup as soup

# save all the nicknames to 'CSV' file format
filename = "BattlePassNicknames.csv"
f = open(filename, "a", encoding="utf-8")
headers1 = "Member of JAZE Battle Pass 2019\n"
b = 1
if b < 2:
    f.write(headers1)
    b += 1

# start page
i = 1
names = set()
while True:
    # disable jaze guard. turn off html 'mod_security'
    link = 'https://jaze.ru/forum/topic?id=50&page=' + str(i)
    my_url = Request(
        link,
        headers={'User-Agent': 'Mozilla/5.0'}
    )
    i += 1  # increment page no for next run
    uClient = uReq(my_url)
    # Check if there was a redirect
    if uClient.url != link:
        break
    page_html = uClient.read()
    uClient.close()
    # html parsing
    page_soup = soup(page_html, "html.parser")
    # grabs each name of player
    containers = page_soup.findAll("div", {"class": "top-area"})
    for container in containers:
        playerName = container.div.a.text.strip()
        names.add(playerName)

for name in names:
    f.write(name + "\n")
f.close()
EDIT
Sets do not preserve the order. If you want to retain the order, just use lists.
...
names = []
while True:
    ...
    for container in containers:
        playerName = container.div.a.text.strip()
        if playerName not in names:
            names.append(playerName)

for name in names:
    f.write(name + "\n")
f.close()
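As a side note (my own addition, not part of the answer above): on Python 3.7+ regular dicts preserve insertion order, so dict.fromkeys() removes duplicates and keeps the order in one step:
# assumes Python 3.7+, where dicts keep insertion order
names = ["alpha", "bravo", "alpha", "charlie"]
unique_in_order = list(dict.fromkeys(names))
print(unique_in_order)  # ['alpha', 'bravo', 'charlie']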

How to scrape all the test match details in cricinfo

I am trying to scrape all the test match details, but it is showing HTTP Error 504: Gateway Timeout. I was getting the details of test matches, but now it is not showing them. I have used bs4 to scrape the test match details from cricinfo.
I need to scrape the details of 2000 test matches. This is my code:
import os
import time
import unicodedata
import urllib.request as req
from urllib.parse import urljoin
from bs4 import BeautifulSoup

BASE_URL = 'http://www.espncricinfo.com'
if not os.path.exists('./espncricinfo-fc'):
    os.mkdir('./espncricinfo-fc')

for i in range(0, 2000):
    soupy = BeautifulSoup(req.urlopen('http://search.espncricinfo.com/ci/content/match/search.html?search=test;all=1;page=' + str(i)).read())
    time.sleep(1)
    for new_host in soupy.findAll('a', {'class': 'srchPlyrNmTxt'}):
        try:
            new_host = new_host['href']
        except:
            continue
        odiurl = BASE_URL + urljoin(BASE_URL, new_host)
        new_host = unicodedata.normalize('NFKD', new_host).encode('ascii', 'ignore')
        print(new_host)
        html = req.urlopen(odiurl).read()
        if html:
            with open('espncricinfo-fc/{0!s}'.format(str.split(new_host, "/")[4]), "wb") as f:
                f.write(html)
            print(html)
        else:
            print("no html")
This usually happens when you make multiple requests too fast. It can mean the server is down or your connection is blocked by the server's firewall; try increasing your sleep() or adding a random sleep.
import random
.....
for i in range(0, 2000):
    soupy = BeautifulSoup(....)
    time.sleep(random.randint(2, 6))
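Another option (my own suggestion, not part of the original answer) is to switch to the requests library and let it retry failed requests with increasing delays via urllib3's Retry:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# retry up to 3 times on 503/504 responses, waiting progressively longer between attempts
retries = Retry(total=3, backoff_factor=1, status_forcelist=[503, 504])
session.mount("http://", HTTPAdapter(max_retries=retries))
session.mount("https://", HTTPAdapter(max_retries=retries))

response = session.get("http://search.espncricinfo.com/ci/content/match/search.html?search=test;all=1;page=0")
print(response.status_code)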
Not sure why, but it seems to be working for me.
I made a few changes in the loop through the links. I'm not sure how you want the output to look when writing it to your file, so I left that part alone. But like I said, it seems to be working OK on my end.
import bs4
import requests
import os
import time
import urllib.request as req

BASE_URL = 'http://www.espncricinfo.com'
if not os.path.exists('C:/espncricinfo-fc'):
    os.mkdir('C:/espncricinfo-fc')

for i in range(0, 2000):
    url = 'http://search.espncricinfo.com/ci/content/match/search.html?search=test;all=1;page=%s' % i
    html = requests.get(url)
    print('Checking page %s of 2000' % (i + 1))
    soupy = bs4.BeautifulSoup(html.text, 'html.parser')
    time.sleep(1)
    for new_host in soupy.findAll('a', {'class': 'srchPlyrNmTxt'}):
        try:
            new_host = new_host['href']
        except:
            continue
        odiurl = BASE_URL + new_host
        new_host = odiurl
        print(new_host)
        html = req.urlopen(odiurl).read()
        if html:
            with open('C:/espncricinfo-fc/{0!s}'.format('_'.join(str.split(new_host, "/")[4:])), "wb") as f:
                f.write(html)
            # print(html)
        else:
            print("no html")

Complex python3 csv scraper

I've got the code below working great when pulling data from a row, in my case row[0]. I'm wondering how to tweak it to pull data from multiple rows?
Also, I would love to be able to specify which divTag class (see the code below) to use for a specific column.
Something like for row[1,2] use:
divTag = soup.find("div", {"class": "productsPicture"})
and for row[4,5] use:
divTag = soup.find("div", {"class": "product_content"})
If that makes sense to you guys.
from bs4 import BeautifulSoup
import requests
import csv

with open('urls.csv', 'r') as csvFile, open('results.csv', 'w', newline='') as results:
    reader = csv.reader(csvFile, delimiter=';')
    writer = csv.writer(results)
    for row in reader:
        # get the url
        url = row[0]
        print(url)
        # fetch content from server
        try:
            html = requests.get(url).content
        except requests.exceptions.ConnectionError as e:
            writer.writerow([url, '', 'bad url'])
            continue
        # soup fetched content
        soup = BeautifulSoup(html, 'html.parser')
        divTag = soup.find("div", {"class": "productsPicture"})
        if divTag:
            # Return all 'a' tags that contain an href
            for a in divTag.find_all("a", href=True):
                url_sub = a['href']
                # Test that link is valid
                try:
                    r = requests.get(url_sub)
                    writer.writerow([url, url_sub, 'ok'])
                except requests.exceptions.ConnectionError as e:
                    writer.writerow([url, url_sub, 'bad link'])
        else:
            writer.writerow([url, '', 'no results'])
urls.csv sample:
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E705Y-0193;
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E703Y-0193;
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E702Y-4589;
https://www.tennis-point.com/index.php?stoken=737F2976&lang=1&cl=search&searchparam=E706Y-9093;
Example classes to search for:
To add a per-column find parameter, you could create a dictionary mapping the column index to the required find parameters, as follows:
from bs4 import BeautifulSoup
import requests
import csv

class_1 = {"class": "productsPicture"}
class_2 = {"class": "product_content"}
class_3 = {"class": "id-fix"}

# map a column number to the required find parameters
class_to_find = {
    0: class_3,  # Not defined in question
    1: class_1,
    2: class_1,
    3: class_3,  # Not defined in question
    4: class_2,
    5: class_2,
}

with open('urls.csv', 'r') as csvFile, open('results.csv', 'w', newline='') as results:
    reader = csv.reader(csvFile)
    writer = csv.writer(results)
    for row in reader:
        # get the url
        output_row = []
        for index, url in enumerate(row):
            url = url.strip()
            # Skip any empty URLs
            if len(url):
                #print('col: {}\nurl: {}\nclass: {}\n\n'.format(index, url, class_to_find[index]))
                # fetch content from server
                try:
                    html = requests.get(url).content
                except requests.exceptions.ConnectionError as e:
                    output_row.extend([url, '', 'bad url'])
                    continue
                except requests.exceptions.MissingSchema as e:
                    output_row.extend([url, '', 'missing http...'])
                    continue
                # soup fetched content
                soup = BeautifulSoup(html, 'html.parser')
                divTag = soup.find("div", class_to_find[index])
                if divTag:
                    # Return all 'a' tags that contain an href
                    for a in divTag.find_all("a", href=True):
                        url_sub = a['href']
                        # Test that link is valid
                        try:
                            r = requests.get(url_sub)
                            output_row.extend([url, url_sub, 'ok'])
                        except requests.exceptions.ConnectionError as e:
                            output_row.extend([url, url_sub, 'bad link'])
                else:
                    output_row.extend([url, '', 'no results'])
        writer.writerow(output_row)
The enumerate() function returns a counter whilst iterating over a list, so index will be 0 for the first URL and 1 for the next. This can then be used with the class_to_find dictionary to get the required parameters to search on.
Each URL results in 3 columns being created: the URL, the sub-URL if successful, and the result. These can be removed if not needed.
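A small standalone illustration of that lookup (the row values here are made up):
# hypothetical example: enumerate() pairs each column index with its URL,
# and the index selects the matching find parameters
class_to_find = {0: {"class": "productsPicture"}, 1: {"class": "product_content"}}
row = ["https://example.com/a", "https://example.com/b"]  # placeholder URLs
for index, url in enumerate(row):
    print(index, url, class_to_find[index])
# 0 https://example.com/a {'class': 'productsPicture'}
# 1 https://example.com/b {'class': 'product_content'}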

Python Web Scrape Unknown Number of Pages

I have working code that scrapes a single Craigslist page for specific information, but what would I need to add in order to grab the data from ALL of the pages (not knowing how many pages ahead of time)?
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = "https://portland.craigslist.org/search/sss?query=electronics&sort=date"
uClient = uReq(my_url)      # sends GET request to URL
page_html = uClient.read()  # reads returned data and puts it in a variable
uClient.close()             # close the connection

# create a file that we will want later to write parsed data to
filename = "ScrapedData.csv"
f = open(filename, 'w')
headers = "date, location, title, price\n"
f.write(headers)

# use BS to parse the webpage
page_soup = soup(page_html, 'html.parser')  # applying BS to the obtained html
containers = page_soup.findAll('p', {'class', 'result-info'})
for container in containers:
    container_date = container.findAll('time', {'class', 'result-date'})
    date = container_date[0].text
    try:
        container_location = container.findAll('span', {'class', 'result-hood'})
        location = container_location[0].text
    except:
        try:
            container_location = container.findAll('span', {'class', 'nearby'})
            location = container_location[0].text
        except:
            location = 'NULL'
    container_title = container.findAll('a', {'class', 'result-title'})
    title = container_title[0].text
    try:
        container_price = container.findAll('span', {'class', 'result-price'})
        price = container_price[0].text
    except:
        price = 'NULL'
    # to print to screen
    print('date:' + date)
    print('location:' + location)
    print('title:' + title)
    print('price:' + price)
    # to write to csv
    f.write(date + ',' + location.replace(",", "-") + ',' + title.replace(",", " ") + ',' + price + '\n')
f.close()
Apart from what sir Andersson has already shown, you can do that as well for this site:
import requests
from bs4 import BeautifulSoup
import csv

page_link = "https://portland.craigslist.org/search/sss?s={}&query=electronics&sort=date"

for link in [page_link.format(page) for page in range(0, 1147, 120)]:  # this is the fix
    res = requests.get(link)
    soup = BeautifulSoup(res.text, 'lxml')
    for container in soup.select('.result-info'):
        try:
            date = container.select('.result-date')[0].text
        except IndexError:
            date = ""
        try:
            title = container.select('.result-title')[0].text
        except IndexError:
            title = ""
        try:
            price = container.select('.result-price')[0].text
        except IndexError:
            price = ""
        print(date, title, price)
        with open("craigs_item.csv", "a", newline="", encoding="utf-8") as outfile:
            writer = csv.writer(outfile)
            writer.writerow([date, title, price])
You can try to loop through all the pages by handling the "s" parameter in the URL until you find a page with no results (a page with the text "search and you will find"):
import requests
from bs4 import BeautifulSoup as soup

# open the output file once, before the loop
filename = "ScrapedData.csv"
f = open(filename, 'w')
headers = "date, location, title, price\n"
f.write(headers)

results_counter = 0
while True:
    my_url = "https://portland.craigslist.org/search/sss?query=electronics&sort=date&s=%d" % results_counter
    page_html = requests.get(my_url).text
    if "search and you will find" in page_html:
        break
    else:
        results_counter += 120

    page_soup = soup(page_html, 'html.parser')  # applying BS to the obtained html
    containers = page_soup.findAll('p', {'class', 'result-info'})
    ...
