Python web scrape from multiple columns

Python web scrape from multiple columns - python-3.x

I am trying to pull data from various columns in the odds table from this website:
https://www.sportsbookreview.com/betting-odds/nba-basketball/totals/?date=20190419
I have tried using the following code but I am only getting the open lines. I want to be able to get exact columns. For example, the pinnacle and bookmaker columns.
import urllib
import urllib.request
from bs4 import BeautifulSoup
theurl = "https://www.sportsbookreview.com/betting-odds/nba-
basketball/totals/?date=20190419"
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage,"html.parser")
for lines in soup.findAll('span',{"class":"_3Nv_7"}):
print(lines.get_text())

import urllib
import urllib.request
from bs4 import BeautifulSoup
theurl = "https://www.sportsbookreview.com/betting-odds/nba-basketball/totals/?date=20190419"
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage,"html.parser")
for lines in soup.findAll('span',{"class":"_3Nv_7 opener"}):
print(lines.get_text())

Related

Elements duplicated with Beautifulsoup

This is the url: https://yorkathletics.com/sports/mens-swimming-and-diving/roster"
If I run this command:
soup.find_all('span', class_="sidearm-roster-player-height")
then I try to get the length of the output, it is mentioned 20 while it is supposed to be 10.
I can't see why this happens.

Change your class selector as follows:
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://yorkathletics.com/sports/mens-swimming-and-diving/roster')
soup = bs(r.content, 'lxml')
print([i.text for i in soup.select('.height')])
Note: You can grab the whole table with pandas:
import pandas as pd
table = pd.read_html('https://yorkathletics.com/sports/mens-swimming-and-diving/roster')[2]
print(table)

python webscraping with BeautifulSoup Problem

Im trying to scrape the box score table from https://www.nascar.com/stats/2021/1/box-score
my code is not working if someone could take a look and point me in right direction.
`import requests
from bs4 import BeautifulSoup
import pandas as pd
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = ('https://www.nascar.com/stats/2021/1/box-score')
response = requests.get(url, headers = headers)
response.content
soup = BeautifulSoup(response.content, 'html.parser')
stats = soup.find_all('table', class_ = "stats-box-score-table-driver")
stats
for row in stats.find_all('tr'):
for cell in row.find_all('td'):
print(cell.text)

I am trying to extract text inside span_id, but getting blank output using python beautifulsoup

i am tring to extract text inside span-id tag but getting blank output screen.
i have tried using parent element div text also , but fail to extract, please anyone help me.
below is my code.
import requests
from bs4 import BeautifulSoup
r = requests.get('https://www.paperplatemakingmachines.com/')
soup = BeautifulSoup(r.text,'lxml')
mob = soup.find('span',{"id":"tollfree"})
print(mob.text)
i want the text inside that span which is given mobile number.

You'll have to use Selenium as that text is not present in the initial request, or at least no without searching through <script> tags.
from bs4 import BeautifulSoup as soup
from selenium import webdriver
import time
driver = webdriver.Chrome('C:\chromedriver_win32\chromedriver.exe')
url='https://www.paperplatemakingmachines.com/'
driver.get(url)
# It's better to use Selenium's WebDriverWait, but I'm still learning how to use that correctly
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.close()
mob = soup.find('span',{"id":"tollfree"})
print(mob.text)

The Data is actually rending dynamically through script. What you need to do is parse the data from script:
import requests
import re
from bs4 import BeautifulSoup
r = requests.get('https://www.paperplatemakingmachines.com/')
soup = BeautifulSoup(r.text,'lxml')
script= soup.find('script')
mob = re.search("(?<=pns_no = \")(.*)(?=\";)", script.text).group()
print(mob)

Another way of using regex to find the number
import requests
import re
from bs4 import BeautifulSoup as bs
r = requests.get('https://www.paperplatemakingmachines.com/',)
soup = bs(r.content, 'lxml')
r = re.compile(r'var pns_no = "(\d+)"')
data = soup.find('script', text=r).text
script = r.findall(data)[0]
print('+91-' + script)

I am trying to scrape info from a website(Program name and program ID and location) using BeautifulSoup

I am trying to scrape info from a website(Program name and program ID) and it is returning empty list.
I am not sure if i am mixing up the syntax but this is what i have
soup.find_all('h3', class_='ama__h3')
the website link is https://freida.ama-assn.org/Freida/#/programs?program=residencies&specialtiesToSearch=140
from urllib.request import urlopen
from bs4 import BeautifulSoup as BS
import pandas as pd
from urllib.parse import urlparse, urlsplit
import requests
res = requests.get('https://freida.ama-assn.org/Freida/#/programs?program=residencies&specialtiesToSearch=140')
soup = bs4.BeautifulSoup(res.text, 'html5lib')
print(soup.prettify())
soup.find_all('h3', class_='ama__h3')

Your error is because you are parsing with html5lib. For any well formed html, the parser choice is not really important. However for a non well formed html (like this one), html5lib seems to have issues. You should use html.parser or lxml (apparently html.parser is safer)
However this code is doing what you want to do :
soup = BeautifulSoup(res.text, 'html.parser')
programs = soup.find_all("a", class_='ama__promo--background')
for program in programs:
program_name = program.find("h3").text
program_id = program.find_all("small")[-1].text.split(': ')[1].strip()
print(program_name, program_id

error while using selenium and writing to file

import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
import pandas as pd
from pytube import YouTube
browser = webdriver.Chrome("C:/Users/Downloads/chromedriver_win32/chromedriver.exe")
browser.get("https://www.youtube.com/channel/UCaKt8dvEIPnEHWSbLYhzrxg/videos")
time.sleep(1)
elem = browser.find_element_by_tag_name("body")
no_of_pagedowns = 100
while no_of_pagedowns:
elem.send_keys(Keys.PAGE_DOWN)
time.sleep(0.2)
no_of_pagedowns-=1
html = browser.page_source
soup = BeautifulSoup(html, "lxml")
tags = soup.find_all('a')
fname = "C:/Stock_in_CFD/Output.txt"
text_file = open(fname, "w+", encoding="utf-8")
for tag in tags:
t = tag.get('href')
text_file.write(t)
When I am running the above code. I am getting error
TypeError: write() argument must be str, not None
When I am not using selenium I am able to do it.
I am using selenium since I want scroll down entire page before parsing before using BeautifulSoup

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

Python web scrape from multiple columns - python-3.x

Related

Elements duplicated with Beautifulsoup

python webscraping with BeautifulSoup Problem

I am trying to extract text inside span_id, but getting blank output using python beautifulsoup

I am trying to scrape info from a website(Program name and program ID and location) using BeautifulSoup

error while using selenium and writing to file

Categories

Resources