Web scraping multiple pages with BeautifulSoup

Web scraping multiple pages with BeautifulSoup - python-3.x

I collected all the necessary information from the first page, but don’t know how to collect information from all pages of the site. I try to find my solution in other stackoverflow topics but didn't understand anything. I will be very grateful if you help me with this.
my parsing site: https://jaze.ru/forum/topic?id=50&page=1
source:
from urllib.request import urlopen as uReq
from urllib.request import Request
from bs4 import BeautifulSoup as soup
# my_url and cutoff mod_security
my_url = Request('http://jaze.ru/forum/topic?id=50&page=1', headers={'User-Agent': 'Mozilla/5.0'})
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
# html parsing
page_soup = soup(page_html, "html.parser")
# grabs each name of player
containers = page_soup.findAll("div", {"class":"top-area"})
for container in containers:
playerName = container.div.a.text.strip()
print("BattlePass PlayerName: " + playerName)
source2
from urllib.request import urlopen as uReq
from urllib.request import Request
from bs4 import BeautifulSoup as soup
# start page
i = 1
while True:
link = 'https://jaze.ru/forum/topic?id=50&page='+str(i)
my_url = Request(
link,
headers={'User-Agent': 'Mozilla/5.0'}
)
i += 1 # increment page no for next run
uClient = uReq(my_url)
if uClient.url != link:
break
page_html = uClient.read()
# Check if there was a redirect
uClient.close()
# html parsing
page_soup = soup(page_html, "html.parser")
# grabs each name of player
containers = page_soup.findAll("div", {"class": "top-area"})
# save all info to csv file
filename = "BattlePassNicknames.csv"
f = open(filename, "w", encoding="utf-8")
headers1 = "Member of JAZE Battle Pass 2019\n"
f.write(headers1)
for container in containers:
playerName = container.div.a.text.strip()
print("BattlePass PlayerName: " + playerName)
f.write(playerName + "\n")
f.close()

The website redirects you to a different page if the page query parameter is greater than the last available page, you can use this to increment page till you are redirected. This is applicable if you already know the topic id (50 in this case).
from urllib.request import urlopen as uReq
from urllib.request import Request
from bs4 import BeautifulSoup as soup
# start page
i = 1
while True:
link = 'https://jaze.ru/forum/topic?id=50&page='+str(i)
my_url = Request(
link,
headers={'User-Agent': 'Mozilla/5.0'}
)
i += 1 # increment page no for next run
uClient = uReq(my_url)
if uClient.url != link:
break
page_html = uClient.read()
# Check if there was a redirect
uClient.close()
# html parsing
page_soup = soup(page_html, "html.parser")
# grabs each name of player
containers = page_soup.findAll("div", {"class": "top-area"})
for container in containers:
playerName = container.div.a.text.strip()
print("BattlePass PlayerName: " + playerName)
Output
BattlePass PlayerName: VANTY3
BattlePass PlayerName: VANTY3
BattlePass PlayerName: KK#キング
BattlePass PlayerName: memories
BattlePass PlayerName: Waffel
BattlePass PlayerName: CynoBap
...
BattlePass PlayerName: Switchback
If you also want to try this out with random topic ids then you have to handle urllib.error.HTTPError somewhere in you code to deal with all the 404s etc..

Related

How to scrape if the whole page changes?

with this can reached the link 3 days ago:
import requests
from bs4 import BeautifulSoup
html_url23 = "http://streamstat.net/videoplayer.cgi?sid=148177550&ext=.m3u8"
html_response = requests.get(html_url23)
soup = BeautifulSoup(html_response.text, 'html.parser')
for vid in soup.find_all('source'):
FIXTV = vid['src']
now there is only one "text",
what can be done at this time?
No listen URL! SID not found!
it wouldn't bother me so much if the link didn't work, but when printing, if there is a "broken" link in the links, the whole thing doesn't work
in this example the CINELIFEHD works, as soon as I add FIXTV the print no longer works because of the changed page
import requests
from bs4 import BeautifulSoup
html_url55 = "http://streamstat.net/videoplayer.cgi?sid=14358315&ext=.m3u8"
html_response = requests.get(html_url55)
soup = BeautifulSoup(html_response.text, 'html.parser')
for vid in soup.find_all('source'):
CINELIFEHD = vid['src']
html_url23 = "http://streamstat.net/videoplayer.cgi?sid=148177550&ext=.m3u8"
html_response = requests.get(html_url23)
soup = BeautifulSoup(html_response.text, 'html.parser')
for vid in soup.find_all('source'):
FIXTV = vid['src']
print(
"#EXTM3U"
+ '\n' +"#EXTINF:0,tvg-logo=https://cinelife.com/wp-content/uploads/2020/04/cinelife_logo.png, CINE LIFE HD" + '\n' +
CINELIFEHD
+ '\n' + "#EXTINF:0,tvg-logo=http://1241.hu/userfiles/image/tvcsatornak/pic_atkoto_55_fix_tv.png, Fix" + '\n' +
FIXTV
)

from bs4 import BeautifulSoup
import requests
keys = [148177550, 14358315]
params = {
'ext': '.m3u8'
}
def get_soup(content):
return BeautifulSoup(content, 'lxml')
def main(url):
with requests.Session() as req:
for k in keys:
params['sid'] = k
r = req.get(url, params=params)
soup = get_soup(r.text)
try:
goal = soup.select_one('source')['src']
except TypeError:
goal = "N/A"
print("Key: {:10}, Result: {}".format(k, goal))
main('http://streamstat.net/videoplayer.cgi')
Output:
Key: 148177550, Result: N/A
Key: 14358315, Result: https://magselect-stirr.amagi.tv/playlist1080p.m3u8

Function for these parse elements will do not repeat. BeautifulSoup

Which function(or etc) is ideal so that these nicknames do not repeat on my parser. Dont know how to do that. I'l be very grateful if you help me.
Source:
from urllib.request import urlopen as uReq
from urllib.request import Request
from bs4 import BeautifulSoup as soup
# save all the nicknames to 'CSV' file format
filename = "BattlePassNicknames.csv"
f = open(filename, "a", encoding="utf-8")
headers1 = "Member of JAZE Battle Pass 2019\n"
b = 1
if b < 2:
f.write(headers1)
b += 1
# start page
i = 1
while True:
# disable jaze guard. turn off html 'mod_security'
link = 'https://jaze.ru/forum/topic?id=50&page='+str(i)
my_url = Request(
link,
headers={'User-Agent': 'Mozilla/5.0'}
)
i += 1 # increment page no for next run
uClient = uReq(my_url)
if uClient.url != link:
break
page_html = uClient.read()
# Check if there was a redirect
uClient.close()
# html parsing
page_soup = soup(page_html, "html.parser")
# grabs each name of player
containers = page_soup.findAll("div", {"class": "top-area"})
for container in containers:
playerName = container.div.a.text.strip()
print("BattlePass PlayerName: " + playerName)
f.write(playerName + "\n")

You can add all the names to a set.
A set object is an unordered collection of distinct hashable objects. Common uses include membership testing, removing duplicates from a sequence, and computing mathematical operations such as intersection, union, difference, and symmetric difference.
my_set = set()
# Lets add some elements to a set
my_set.add('a')
my_set.add('b')
print(my_set) # prints {'a', 'b'}
# Add one more 'a'
my_set.add('a')
print(my_set) # still prints {'a', 'b'} !
In your case, let's add all the names to a set and then write to the file after the for loop.
from urllib.request import urlopen as uReq
from urllib.request import Request
from bs4 import BeautifulSoup as soup
# save all the nicknames to 'CSV' file format
filename = "BattlePassNicknames.csv"
f = open(filename, "a", encoding="utf-8")
headers1 = "Member of JAZE Battle Pass 2019\n"
b = 1
if b < 2:
f.write(headers1)
b += 1
# start page
i = 1
names = set()
while True:
# disable jaze guard. turn off html 'mod_security'
link = 'https://jaze.ru/forum/topic?id=50&page='+str(i)
my_url = Request(
link,
headers={'User-Agent': 'Mozilla/5.0'}
)
i += 1 # increment page no for next run
uClient = uReq(my_url)
if uClient.url != link:
break
page_html = uClient.read()
# Check if there was a redirect
uClient.close()
# html parsing
page_soup = soup(page_html, "html.parser")
# grabs each name of player
containers = page_soup.findAll("div", {"class": "top-area"})
for container in containers:
playerName = container.div.a.text.strip()
names.add(playerName)
for name in names:
f.write(name + "\n")
f.close()
EDIT
Sets do not preserve the order. If you want to retain the order, just use lists.
...
names = []
while True:
...
for container in containers:
playerName = container.div.a.text.strip()
if playerName not in names:
names.append(playerName)
for name in names:
f.write(name + "\n")
f.close()

Not showing full HTML links where are letters ćčžšđ

I'm scraping data from the phonebook, and the problem is that it's not showing full html link. It stops before the čćžšđ letters and my question is how to include čćžšđ letters in the html link.
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = 'http://www.imenik.hr/imenik/trazi/1/Osobe/sve' \
'/sve/vaznost/mjesto:vi%C5%A1njan.html'
r = urlopen(url).read()
page_soup = BeautifulSoup(r, 'html.parser')
for container in page_soup.find_all('div', class_ = 'list_naslov'):
base_url = 'http://www.imenik.hr'
for link in container.find_all('a', href=True):
print('href:',link['href'])
request_href = base_url + link['href'].replace(' ', '%20')
print(request_href)

'list' object has no attribute 'timeout' and only prints first item in the table

I am trying to pull a table from a list of URL's. When I only input one URL it only prints out the first items in the table and when I add more URL's to the list I get the error message 'list' object has no attribute 'timeout'. What is the best way to get the rest of the items and adding more URL's?
Below is the code I am running.
import time, random, csv, bs4, requests, io
import pandas as pd
timeDelay = random.randrange(5, 20)
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_urls = [
"https://www.lonza.com/products-services/bio-research/electrophoresis-of-nucleic-acids-and-proteins/nucleic-acid-electrophoresis/precast-gels-for-dna-and-rna-analysis/truband-gel-anchors.aspx",
"https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-epithelial-cells/nucleofector-kits-for-human-mammary-epithelial-cells-hmec.aspx",
"https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-neural-cells/nucleofector-kits-for-mammalian-glial-cells.aspx",
]
uClient = uReq(my_urls)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll('tbody')
product_name_list =[]
cat_no_list = []
size_list = []
price_list =[]
for container in containers:
if (len(container) > 0):
#try:
title_container = container.findAll('td')
Product_name = title_container[0].text.strip()
product_name_list.append(Product_name)
CatNo_container = container.findAll('td')
CatNo = CatNo_container[1].text.strip()
cat_no_list.append(CatNo)
#Size_container = container.findAll('div',{'class':'col-xs-2 noPadding'})
#Size = Size_container[0].text.strip()
#size_list.append(Size)
Price_container = container.findAll('td')
Price = Price_container[4].text.strip()
price_list.append(Price)
print('Product_name: '+ Product_name)
print('CatNo: ' + CatNo)
print('Size: ' + 'N/A')
print('Price: ' + Price)
print(" ")
time.sleep(timeDelay)

You are passing a list here, uClient = uReq(my_urls) as my_urls where a string is required.
You need to pass the individual element of the list i.e. the strings.
Here is the edited code that works for multiple urls.
UPDATED CODE (to get all items):
import time, random, csv, bs4, requests, io
import pandas as pd
timeDelay = random.randrange(5, 20)
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_urls = [
"https://www.lonza.com/products-services/bio-research/electrophoresis-of-nucleic-acids-and-proteins/nucleic-acid-electrophoresis/precast-gels-for-dna-and-rna-analysis/truband-gel-anchors.aspx",
"https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-epithelial-cells/nucleofector-kits-for-human-mammary-epithelial-cells-hmec.aspx",
"https://www.lonza.com/products-services/bio-research/transfection/nucleofector-kits-for-primary-cells/nucleofector-kits-for-primary-neural-cells/nucleofector-kits-for-mammalian-glial-cells.aspx",
]
for url in my_urls:
print("URL using: ", url)
uClient = uReq(url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll('tbody')
product_name_list =[]
cat_no_list = []
size_list = []
price_list =[]
for container in containers:
if (len(container) > 0):
#try:
items = container.findAll('tr')
for item in items:
item = item.text.split('\n')
Product_name = item[1]
product_name_list.append(Product_name)
CatNo = item[2]
cat_no_list.append(CatNo)
#Size_container = container.findAll('div',{'class':'col-xs-2 noPadding'})
#Size = Size_container[0].text.strip()
#size_list.append(Size)
Price = item[6]
price_list.append(Price)
print('Product_name: '+ Product_name)
print('CatNo: ' + CatNo)
print('Size: ' + 'N/A')
print('Price: ' + Price)
print(" ")
time.sleep(timeDelay)

Python :Page Navigator Maximum Value Scraper - Only getting the output of last value

This is the program that I have created to extract the maximum page value from each category section from the list.I am unable to fetch all the value,I am just getting the value of the last value in the list.What changes do I need to make in order to get all the outputs.
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
#List for extended links to the base url
links = ['Link_1/','Link_2/','Link_3/']
#Function to find out the biggest number present in the page navigation
#section.Every element before 'Next→' is consist of the upper limit
def page_no():
bs = soup(page_html, "html.parser")
max_page = bs.find('a',{'class':'next page-numbers'}).findPrevious().text
print(max_page)
#url loop
for url in links:
my_urls ='http://example.com/category/{}/'.format(url)
# opening up connection,grabbing the page
uClient = uReq(my_urls)
page_html = uClient.read()
uClient.close()
page_no()
Page Navigator Example:
1 2 3 … 15 Next →
Thanks in Advance

You need to put page_html inside the function and indent the last 4 lines. Also it would be better to return the max_page value so you can use it ojtside the function.
def page_no(page_html):
bs = soup(page_html, "html.parser")
max_page = bs.find('a',{'class':'next page-numbers'}).findPrevious().text
return max_page
#url loop
for url in links:
my_urls='http://example.com/category/{}/'.format(url)
# opening up connection,grabbing the page
uClient = uReq(my_urls)
page_html = uClient.read()
uClient.close()
max_page = page_no(page_html)
print(max_page)

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

Web scraping multiple pages with BeautifulSoup - python-3.x

Related

How to scrape if the whole page changes?

Function for these parse elements will do not repeat. BeautifulSoup

Not showing full HTML links where are letters ćčžšđ

'list' object has no attribute 'timeout' and only prints first item in the table

Python :Page Navigator Maximum Value Scraper - Only getting the output of last value

Categories

Resources