How to scrape if the whole page changes? - python-3.x

With this I could still reach the link 3 days ago:
import requests
from bs4 import BeautifulSoup
html_url23 = "http://streamstat.net/videoplayer.cgi?sid=148177550&ext=.m3u8"
html_response = requests.get(html_url23)
soup = BeautifulSoup(html_response.text, 'html.parser')
for vid in soup.find_all('source'):
    FIXTV = vid['src']
Now the page returns only a single piece of text:
No listen URL! SID not found!
What can be done in this case? The broken link itself wouldn't bother me so much, but when printing, if one of the links is "broken", the whole thing fails.
In this example CINELIFEHD works, but as soon as I add FIXTV the print no longer works because of the changed page:
import requests
from bs4 import BeautifulSoup
html_url55 = "http://streamstat.net/videoplayer.cgi?sid=14358315&ext=.m3u8"
html_response = requests.get(html_url55)
soup = BeautifulSoup(html_response.text, 'html.parser')
for vid in soup.find_all('source'):
    CINELIFEHD = vid['src']

html_url23 = "http://streamstat.net/videoplayer.cgi?sid=148177550&ext=.m3u8"
html_response = requests.get(html_url23)
soup = BeautifulSoup(html_response.text, 'html.parser')
for vid in soup.find_all('source'):
    FIXTV = vid['src']

print(
    "#EXTM3U"
    + '\n' + "#EXTINF:0,tvg-logo=https://cinelife.com/wp-content/uploads/2020/04/cinelife_logo.png, CINE LIFE HD" + '\n' +
    CINELIFEHD
    + '\n' + "#EXTINF:0,tvg-logo=http://1241.hu/userfiles/image/tvcsatornak/pic_atkoto_55_fix_tv.png, Fix" + '\n' +
    FIXTV
)

from bs4 import BeautifulSoup
import requests

keys = [148177550, 14358315]
params = {
    'ext': '.m3u8'
}


def get_soup(content):
    return BeautifulSoup(content, 'lxml')


def main(url):
    with requests.Session() as req:
        for k in keys:
            params['sid'] = k
            r = req.get(url, params=params)
            soup = get_soup(r.text)
            try:
                goal = soup.select_one('source')['src']
            except TypeError:
                # select_one returns None when the page has no <source> tag,
                # so ['src'] raises TypeError for the "SID not found" page
                goal = "N/A"
            print("Key: {:10}, Result: {}".format(k, goal))


main('http://streamstat.net/videoplayer.cgi')
Output:
Key: 148177550, Result: N/A
Key: 14358315, Result: https://magselect-stirr.amagi.tv/playlist1080p.m3u8
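If the goal is to keep generating the M3U print even when one SID is broken, a minimal sketch along the same lines (collecting only the URLs that actually resolve and skipping the rest; the channel labels and logo URLs are shortened here for brevity) could be:

import requests
from bs4 import BeautifulSoup

# channel name -> SID, taken from the question above
channels = {'CINE LIFE HD': 14358315, 'FIXTV': 148177550}
lines = ["#EXTM3U"]

with requests.Session() as req:
    for name, sid in channels.items():
        r = req.get('http://streamstat.net/videoplayer.cgi',
                    params={'sid': sid, 'ext': '.m3u8'})
        source = BeautifulSoup(r.text, 'html.parser').select_one('source')
        if source is None:  # the "No listen URL! SID not found!" page has no <source>
            continue        # skip the broken channel instead of crashing the print
        lines.append("#EXTINF:0, " + name)
        lines.append(source['src'])

print('\n'.join(lines))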

Related

Script isn't retrieving all the info

I tried making a Python script that gets all the fighter names and their records from boxrec.com. The issue is that it doesn't retrieve them all (Floyd Mayweather is missing) and some of them appear several times (Success Tetteh, for example).
The output is too big to post it here: https://cryptpad.fr/pad/#/2/pad/view/mYd4jIMOxY7QNUqW2-5TvYIvvx84KXbiMdYvXINGV9M/
Edit: For some fighters the records are wrong (Vasyl Lomachenko, for example, appears to have 28 wins, but he has 14).
import numpy
from requests import Session
from bs4 import BeautifulSoup
import pandas as pd
import pyautogui
import time


def main():
    fighter_names = []
    fighter_wins = []
    fighter_losses = []
    fighter_draws = []
    username = "username"
    password = "password"
    site = "https://boxrec.com/en/login"
    payload = {
        '_username': username,
        '_password': password,
        'login[go]': None
    }
    with Session() as s:
        s.get(site)
        s.post(site, data=payload, headers={
            "Content-Type": "application/x-www-form-urlencoded"
        })
        pages = numpy.arange(1, 19152, 20)
        for page in pages:
            page = s.get(
                "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                + str(page))
            soup = BeautifulSoup(page.text, 'html.parser')
            names_a = soup.find_all('a', class_='personLink')
            if not names_a:
                print("solving captcha")
                page = s.get(
                    "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                    "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                    + str(page))
                soup = BeautifulSoup(page.text, 'html.parser')
                names_a = soup.find_all('a', class_='personLink')
                pyautogui.click(x=118, y=1061)
                time.sleep(1)
                pyautogui.click(x=1035, y=619)
                time.sleep(2)
                pyautogui.click(x=97, y=59)
                time.sleep(1)
                pyautogui.click(x=834, y=247)
                time.sleep(2)
                if not names_a:
                    print("please solve captcha manually")
                while not names_a:
                    page = s.get(
                        "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                        "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                        + str(page))
                    soup = BeautifulSoup(page.text, 'html.parser')
                    names_a = soup.find_all('a', class_='personLink')
            wins_span = soup.find_all('span', class_='textWon')
            loses_span = soup.find_all('span', class_='textLost')
            draws_span = soup.find_all('span', class_='textDraw')
            for container in names_a:
                name = container.text
                print(name)
                fighter_names.append(name)
            for container in wins_span:
                wins = container.text
                fighter_wins.append(wins)
            for container in loses_span:
                losses = container.text
                fighter_losses.append(losses)
            for container in draws_span:
                draws = container.text
                fighter_draws.append(draws)
    fighters = {
        "name": fighter_names,
        "wins": fighter_wins,
        "loses": fighter_losses,
        "draws": fighter_draws
    }
    df = pd.DataFrame.from_dict(fighters, orient="index")
    df = df.transpose()
    df.to_csv("fighters.csv")


if __name__ == '__main__':
    main()
I would refrain from using the same variable name to represent two separate things... you have the page variable being used in two different senses, which can be confusing.
As for the issues themselves, I'm assuming that at some point there's a mismatch between the lists, so the data stops lining up with the correct fighter name, or there's something off with the site's actual data/HTML. I'm not entirely sure, as I haven't debugged it. The reason being: have you considered using pandas to parse the table and then just split the 'w-l-d' column? I think it would be far easier to let pandas do the parsing so nothing is missed across the 900+ pages you need to go through.
See if this helps:
import numpy
from requests import Session
from bs4 import BeautifulSoup
import pandas as pd
import pyautogui
import time
import math


def main():
    final_df = pd.DataFrame()
    username = 'username'
    password = 'password'
    site = "https://boxrec.com/en/login"
    payload = {
        '_username': username,
        '_password': password,
        'login[go]': None
    }
    with Session() as s:
        s.get(site)
        s.post(site, data=payload, headers={
            "Content-Type": "application/x-www-form-urlencoded"
        })
        pages = numpy.arange(1, 19152, 20)
        for page in pages:
            response = s.get(
                "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                + str(page))
            soup = BeautifulSoup(response.text, 'html.parser')
            names_a = soup.find_all('a', class_='personLink')
            if not names_a:
                print("solving captcha")
                response = s.get(
                    "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                    "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                    + str(page))
                soup = BeautifulSoup(response.text, 'html.parser')
                names_a = soup.find_all('a', class_='personLink')
                pyautogui.click(x=118, y=1061)
                time.sleep(1)
                pyautogui.click(x=1035, y=619)
                time.sleep(2)
                pyautogui.click(x=97, y=59)
                time.sleep(1)
                pyautogui.click(x=834, y=247)
                time.sleep(2)
                if not names_a:
                    print("please solve captcha manually")
                while not names_a:
                    response = s.get(
                        "https://boxrec.com/en/locations/people?l%5Brole%5D=proboxer&l%5Bdivision%5D=&l%5Bcountry%5D=&l"
                        "%5Bregion%5D=&l%5Btown%5D=&l_go=&offset= "
                        + str(page))
                    soup = BeautifulSoup(response.text, 'html.parser')
                    names_a = soup.find_all('a', class_='personLink')
            df = pd.read_html(response.text)[-1]
            df = df[['name', 'w-l-d']]
            df = df[df['w-l-d'].astype(str).str.match(r"(^\d*.\d*.\d*$)")]  # <--- ADD THIS LINE
            df[['wins', 'loses', 'draws']] = df['w-l-d'].str.split(expand=True)
            df = df.drop('w-l-d', axis=1)
            print('Page: %d of %d' % (((page - 1) / 20) + 1, math.ceil(19152 / 20)))
            final_df = final_df.append(df, sort=False).reset_index(drop=True)
    final_df.to_csv("fighters.csv")


if __name__ == '__main__':
    main()
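As for the duplicate names mentioned in the question (Success Tetteh appearing several times), one possible addition, assuming an identical name-and-record row really is the same entry scraped twice, would be to deduplicate before writing the CSV:

# assumption: identical name + record rows are the same fighter scraped twice
final_df = final_df.drop_duplicates(subset=['name', 'wins', 'loses', 'draws'])
final_df.reset_index(drop=True).to_csv("fighters.csv", index=False)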

Parse through website with Beautiful Soup to find matching Data

I am trying Python + BeautifulSoup to loop through a website in order to find a matching string contained in a tag.
When the matching substring is found I want to stop the iteration and print the span, but I can't find a way to make this work.
This is what I could manage to work out so far:
import urllib.request
from bs4 import BeautifulSoup as b
num = 1
base_url = "https://v-tac.it/led-products-results-page/?q="
request = '500'
separator = '&start='
page_num = "1"
url = base_url + request + separator + page_num
html = urllib.request.urlopen(url).read()
soup = b(html, "html.parser")
for i in range(100):
    for post in soup.findAll("div", {"class": "spacer"}):
        h = post.findAll("span")[0].text
        if "request" in h:
            break
            print(h)
    num += 1
    page_num = str(num)
    url = base_url + request + separator + page_num
    html = urllib.request.urlopen(url).read()
    soup = b(html, "html.parser")
    print("We are at page " + page_num)
But it doesn't return anything; it only cycles through the pages.
Thanks in advance for any help
If it is in the text, then with bs4 4.7.1 you should be able to use :contains:
soup.select_one('.spacer span:contains("request")').text if soup.select_one('.spacer span:contains("request")') is not None else 'Not found'
I'm not sure why, when you have for i in range(100), you don't use i instead of num later; then you wouldn't need +=.
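Putting that selector into the paging loop, a rough sketch (assuming bs4 >= 4.7.1, and assuming the match is meant to be against the request variable, '500', rather than the literal word "request") might look like this:

import urllib.request
from bs4 import BeautifulSoup as b

base_url = "https://v-tac.it/led-products-results-page/?q="
request = '500'
separator = '&start='

for num in range(1, 101):
    url = base_url + request + separator + str(num)
    soup = b(urllib.request.urlopen(url).read(), "html.parser")
    # select_one returns None when no span inside a .spacer div contains the string
    match = soup.select_one('.spacer span:contains("{}")'.format(request))
    if match is not None:
        print(match.text)
        break  # stop paging once the matching span is found
    print("We are at page " + str(num))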

Login timeout while scraping

While I'm scraping boxrec (www.boxrec.com) using BeautifulSoup and the urllib library, my login times out and the process stops. I need to log out and log back in manually to resume the process.
Libraries I'm using:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import csv          # needed for csv.writer below (not shown in the import list above)
import http.client  # needed for the IncompleteRead handling below

# dataBaseHeader and getFighterData() are defined elsewhere in the original script

#This is the final function that is getting interrupted
def AllDataWriter(rankingURL, nfileNameT):
    uClientRanking = uReq(rankingURL)
    try:
        rankingURL_html = uClientRanking.read()
    except http.client.IncompleteRead as e:
        rankingURL_html = e.partial
    rankingURL_soup = soup(rankingURL_html, 'html.parser')
    rankingFighters = rankingURL_soup.findAll('a', {'class': 'personLink'})
    with open(nfileNameT, mode='w', encoding="utf-8") as csv_file:
        f = csv.writer(csv_file)
        f.writerow(dataBaseHeader)
        i = 0
        for item in rankingFighters:
            thisURL = 'http://boxrec.com' + rankingFighters[i]['href']
            fighterArray = getFighterData(thisURL)
            for d in range(0, int((len(fighterArray)) / 38)):
                u = d * 38
                f.writerow(fighterArray[u:(u + 38)])
            i = i + 1
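One possible way to keep the scrape running, sketched below under the assumption that an expired boxrec session simply serves the login form again (the form field names are taken from the login payload shown in the question above), is to fetch everything through a requests.Session and re-post the credentials whenever the login form reappears:

import requests
from bs4 import BeautifulSoup

LOGIN_URL = "https://boxrec.com/en/login"
# placeholder credentials; field names follow the boxrec payload shown earlier
PAYLOAD = {'_username': 'username', '_password': 'password', 'login[go]': None}

session = requests.Session()
session.post(LOGIN_URL, data=PAYLOAD)

def fetch(url):
    """Get a page, re-logging in if the session appears to have expired."""
    r = session.get(url)
    # assumption: a logged-out response contains the login form's _username field
    if '_username' in r.text:
        session.post(LOGIN_URL, data=PAYLOAD)
        r = session.get(url)
    return BeautifulSoup(r.text, 'html.parser')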

Not able to parse webpage contents using beautiful soup

I have been using Beautiful Soup for parsing webpages for some data extraction. It has worked perfectly well for me so far, for other webpages. However, I'm now trying to count the number of <a> tags in this page:
from bs4 import BeautifulSoup
import requests
catsection = "cricket"
url_base = "http://www.dnaindia.com/"
i = 89
url = url_base + catsection + "?page=" + str(i)
print(url)
#This is the page I'm trying to parse and also the one in the hyperlink
#I get the correct url i'm looking for at this stage
r = requests.get(url)
data = r.text
soup = BeautifulSoup(data, 'html.parser')
j=0
for num in soup.find_all('a'):
    j = j + 1
print(j)
I'm getting the output as 0. This makes me think that the two lines after r = requests.get(url) are probably not working (there's obviously no chance that there are zero <a> tags on the page), and I'm not sure what alternative solution I can use here. Has anybody faced a similar problem or found a solution?
Thanks in advance.
You need to pass some information along with the request to the server.
The following code should work... you can play around with the other parameters as well:
from bs4 import BeautifulSoup
import requests
catsection = "cricket"
url_base = "http://www.dnaindia.com/"
i = 89
url = url_base + catsection + "?page=" + str(i)
print(url)
headers = {
    'User-agent': 'Mozilla/5.0'
}
#This is the page I'm trying to parse and also the one in the hyperlink
#I get the correct url i'm looking for at this stage
r = requests.get(url, headers=headers)
data = r.text
soup = BeautifulSoup(data, 'html.parser')
j = 0
for num in soup.find_all('a'):
    j = j + 1
print(j)
Put any URL in the parser and check the number of <a> tags available on that page:
from bs4 import BeautifulSoup
import requests
url_base = "http://www.dnaindia.com/cricket?page=1"
res = requests.get(url_base, headers={'User-agent': 'Existed'})
soup = BeautifulSoup(res.text, 'html.parser')
a_tag = soup.select('a')
print(len(a_tag))

slowing down a webscrape using python

I am trying to scrape a site, and the problem I am running into is that the page takes time to load. So by the time my scraping is done I may get only five items when there may be 25. Is there a way to slow Python down? I am using BeautifulSoup.
Here is the code I am using:
import urllib
import urllib.request
from bs4 import BeautifulSoup
theurl="http://agscompany.com/product-category/fittings/tube-nuts/316-tube/"
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage,"html.parser")
for pn in soup.find_all('div', {"class": "shop-item-text"}):
    pn2 = pn.text
    print(pn2)
Thank you
All the results can be accessed from these pages:
http://agscompany.com/product-category/fittings/tube-nuts/316-tube/page/
http://agscompany.com/product-category/fittings/tube-nuts/316-tube/page/2/
...
So you can access them with a loop over the page number:
import urllib
import urllib.request
from bs4 import BeautifulSoup
theurl="http://agscompany.com/product-category/fittings/tube-nuts/316-tube/"
for i in range(1, 5):
    thepage = urllib.request.urlopen(theurl + '/page/' + str(i) + '/')
    soup = BeautifulSoup(thepage, "html.parser")
    for pn in soup.find_all('div', {"class": "shop-item-text"}):
        pn2 = pn.text
        print(pn2)
A more generic version of @Kenavoz's answer.
This approach doesn't care about how many pages there are.
Also, I would go for requests rather than urllib.
import requests
from bs4 import BeautifulSoup
url_pattern = 'http://agscompany.com/product-category/fittings/tube-nuts/316-tube/page/{index}/'
status_code = 200
url_index = 1
while status_code == 200:
    url = url_pattern.format(index=url_index)
    response = requests.get(url)
    status_code = response.status_code
    url_index += 1
    soup = BeautifulSoup(response.content, 'html.parser')
    page_items = soup.find_all('div', {'class': 'shop-item-text'})
    for page_item in page_items:
        print(page_item.text)
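If the intent behind "slowing down" is also to avoid firing requests back to back, a small fixed delay between pages can be added to the loop above. A sketch (the one-second delay is an arbitrary choice):

import time

import requests
from bs4 import BeautifulSoup

url_pattern = 'http://agscompany.com/product-category/fittings/tube-nuts/316-tube/page/{index}/'

url_index = 1
while True:
    response = requests.get(url_pattern.format(index=url_index))
    if response.status_code != 200:  # ran past the last page
        break
    soup = BeautifulSoup(response.content, 'html.parser')
    for page_item in soup.find_all('div', {'class': 'shop-item-text'}):
        print(page_item.text)
    time.sleep(1)  # pause between pages to pace the scrape
    url_index += 1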
