Search Number Range on Website in Python - python-3.x

Below is a script that searches for a word on a website. In this case, it's "S&P" on https://finance.yahoo.com/quote/%5EGSPC?p=^GSPC.
My question is: how can I search for a number range between 2300 and 2400, or for a number under/greater than 2400? Basically, I'm making a script that tells me when the price reaches a certain point.
Thanks for your help!
import webbrowser
import urllib.request

page = urllib.request.urlopen("https://finance.yahoo.com/quote/%5EGSPC?p=^GSPC")
content = page.read().decode('utf-8')
if "S&P" in content:
    webbrowser.open("https://finance.yahoo.com/quote/%5EGSPC?p=^GSPC")
Updated 2017-5-16
The person below helped me. Thanks, and thanks to all who replied. I tinkered around and came up with the following.
import urllib.request
import webbrowser
from bs4 import BeautifulSoup

page = urllib.request.urlopen("https://finance.yahoo.com/quote/%5EGSPC?p=^GSPC")
content = page.read().decode('utf-8')
soup = BeautifulSoup(content, 'html.parser')
val = soup.find("span", {"data-reactid": "36"}).decode_contents(formatter="html")
price = float(val.replace(',', ''))  # compare numerically, not as strings
if price >= 2400.00:
    webbrowser.open("https://finance.yahoo.com/quote/%5EGSPC?p=^GSPC")

Try the following with BeautifulSoup:
>>> import urllib.request
>>> from bs4 import BeautifulSoup
>>> page = urllib.request.urlopen("https://finance.yahoo.com/quote/%5EGSPC?p=^GSPC")
>>> content = page.read().decode('utf-8')
>>> soup = BeautifulSoup(content, 'html.parser')
>>> val = soup.find("span", {"data-reactid": "36"}).decode_contents(formatter="html")
>>> val
'2,402.32'
>>>
Then, convert to a float to check whether it matches your breakpoint:
>>> val = float(val.replace(',',''))
>>> val
2402.32
>>>
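With the value as a float, a chained comparison covers the range case from the question:
>>> 2300 <= val <= 2400
False
>>> val > 2400
True
>>>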

You can use Beautiful Soup (the bs4 package); install it with pip.
from bs4 import BeautifulSoup as soup

content = soup(content, 'lxml')
stuffs = content.findAll(class_="Trsdu(0.3s) Fw(b) Fz(36px) Mb(-4px) D(ib)")
# stuffs will contain your stock price; you can check what range it's in from there
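From there, a rough way to get at the number itself, assuming the first match is the quote and keeps the "2,402.32" formatting:
price = float(stuffs[0].text.replace(',', ''))  # strip the thousands separator before converting
if 2300 <= price <= 2400:
    print("price is in range")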

Related

Appending extracted links to a list, but the list gives the whole tag instead of the link when printing

This is my code:
from bs4 import BeautifulSoup
import requests, lxml
import re
from urllib.parse import urljoin
from googlesearch import search
import pandas as pd

query = 'A M C College of Engineering, Bangalore'
link = []
for i in search(query, tld='co.in', start=0, stop=1):
    print(i)
    soup = BeautifulSoup(requests.get(i).text, 'lxml')
    for link in soup.select("a[href$='.pdf']"):
        if re.search(r'nirf', str(link), flags=re.IGNORECASE):
            fUrl = urljoin(i, link['href'])
            print(fUrl)
            link.append(fUrl)
print(link)
df = pd.DataFrame(link, columns=['PDF LINKS'])
print(df)
Here is my output after running the code:
https://www.amcgroup.edu.in/AMCEC/index.php
https://www.amcgroup.edu.in/AMCEC/image/Download/NIRFENGG.pdf
https://www.amcgroup.edu.in/AMCEC/image/Download/NIRFMBA.pdf
https://www.amcgroup.edu.in/AMCEC/image/Download/NIRF_2019.pdf
https://www.amcgroup.edu.in/AMCEC/image/Download/NIRF_2020.pdf
# Printing list with links but getting tags
For Invitation Click here...
# Dataframe where I want to store list
PDF LINKS
0 For Invitation Click here...
I should get the list of links shown in the output, but when I print the list it gives me the whole tag instead of the link. Also, I want to push all the links from one query into a single row of the dataframe, like this:
PDF LINKS
0 link1 link2 link3 #for query1
1 link1 link2 #for another query
How can I achieve this? And what is the problem with my code? Why am I getting the tag instead of the list?
Thanks in advance.
Use a different variable name for the list than for the tag in the for-loop:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin

query = "A M C College of Engineering, Bangalore"

all_data = []
for i in ["https://www.amcgroup.edu.in/AMCEC/index.php"]:
    soup = BeautifulSoup(requests.get(i).text, "lxml")
    for link in soup.select("a[href$='.pdf']"):  # <-- `link` is different than `all_data` here!
        if re.search(r"nirf", link["href"], flags=re.IGNORECASE):
            fUrl = urljoin(i, link["href"])
            all_data.append(fUrl)

df = pd.DataFrame(all_data, columns=["PDF LINKS"])
print(df)
Prints:
PDF LINKS
0 https://www.amcgroup.edu.in/AMCEC/image/Download/NIRFENGG.pdf
1 https://www.amcgroup.edu.in/AMCEC/image/Download/NIRFMBA.pdf
2 https://www.amcgroup.edu.in/AMCEC/image/Download/NIRF_2019.pdf
3 https://www.amcgroup.edu.in/AMCEC/image/Download/NIRF_2020.pdf
EDIT: To have results in one row:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin

query = "A M C College of Engineering, Bangalore"

all_data = []
for i in ["https://www.amcgroup.edu.in/AMCEC/index.php"]:
    soup = BeautifulSoup(requests.get(i).text, "lxml")
    row = []
    for link in soup.select("a[href$='.pdf']"):  # <-- `link` is different than `all_data` here!
        if re.search(r"nirf", link["href"], flags=re.IGNORECASE):
            fUrl = urljoin(i, link["href"])
            row.append(fUrl)
    if row:
        all_data.append(row)

df = pd.DataFrame({"PDF LINKS": all_data})
print(df)
Prints:
PDF LINKS
0 [https://www.amcgroup.edu.in/AMCEC/image/Download/NIRFENGG.pdf, https://www.amcgroup.edu.in/AMCEC/image/Download/NIRFMBA.pdf, https://www.amcgroup.edu.in/AMCEC/image/Download/NIRF_2019.pdf, https://www.amcgroup.edu.in/AMCEC/image/Download/NIRF_2020.pdf]
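And if each row should hold one space-separated string instead of a Python list (matching the desired output above), join each row before building the frame:
df = pd.DataFrame({"PDF LINKS": [" ".join(row) for row in all_data]})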

Elements duplicated with Beautifulsoup

This is the URL: https://yorkathletics.com/sports/mens-swimming-and-diving/roster
If I run this command:
soup.find_all('span', class_="sidearm-roster-player-height")
and then check the length of the output, it is 20 when it should be 10.
I can't see why this happens.
Change your class selector as follows:
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://yorkathletics.com/sports/mens-swimming-and-diving/roster')
soup = bs(r.content, 'lxml')
print([i.text for i in soup.select('.height')])
Note: You can grab the whole table with pandas:
import pandas as pd
table = pd.read_html('https://yorkathletics.com/sports/mens-swimming-and-diving/roster')[2]
print(table)
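From there, individual columns can be read by header; a sketch (the height column label 'Ht.' is a guess; check table.columns for the real names):
print(table.columns)   # inspect the actual header names first
print(table['Ht.'])    # 'Ht.' is assumed; substitute the real height column label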

I am trying to extract text inside a span id, but getting blank output using Python BeautifulSoup

I am trying to extract the text inside a span id tag but am getting a blank output screen.
I have also tried using the parent div element's text, but failed to extract it. Please, can anyone help me?
Below is my code.
import requests
from bs4 import BeautifulSoup
r = requests.get('https://www.paperplatemakingmachines.com/')
soup = BeautifulSoup(r.text,'lxml')
mob = soup.find('span',{"id":"tollfree"})
print(mob.text)
I want the text inside that span, which is the mobile number.
You'll have to use Selenium, as that text is not present in the initial request, at least not without searching through <script> tags.
from bs4 import BeautifulSoup
from selenium import webdriver
import time

driver = webdriver.Chrome(r'C:\chromedriver_win32\chromedriver.exe')  # raw string so backslashes aren't treated as escapes
url = 'https://www.paperplatemakingmachines.com/'
driver.get(url)

# It's better to use Selenium's WebDriverWait, but I'm still learning how to use that correctly
time.sleep(5)

soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.close()

mob = soup.find('span', {"id": "tollfree"})
print(mob.text)
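For reference, the WebDriverWait variant mentioned in the comment above would look roughly like this (a sketch; it waits up to 10 seconds for the span to show up in the DOM instead of sleeping a fixed 5 seconds):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome(r'C:\chromedriver_win32\chromedriver.exe')
driver.get('https://www.paperplatemakingmachines.com/')
# block until the element is present rather than sleeping blindly
mob = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, 'tollfree'))
)
print(mob.text)
driver.close()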
The data is actually rendered dynamically by a script. What you need to do is parse the value out of the script:
import requests
import re
from bs4 import BeautifulSoup

r = requests.get('https://www.paperplatemakingmachines.com/')
soup = BeautifulSoup(r.text, 'lxml')
script = soup.find('script')
mob = re.search(r'(?<=pns_no = ")(.*)(?=";)', script.text).group()
print(mob)
Another way of using regex to find the number:
import requests
import re
from bs4 import BeautifulSoup as bs
r = requests.get('https://www.paperplatemakingmachines.com/')
soup = bs(r.content, 'lxml')
r = re.compile(r'var pns_no = "(\d+)"')
data = soup.find('script', text=r).text
script = r.findall(data)[0]
print('+91-' + script)

Python web scrape from multiple columns

I am trying to pull data from various columns of the odds table on this website:
https://www.sportsbookreview.com/betting-odds/nba-basketball/totals/?date=20190419
I have tried the following code, but I am only getting the opening lines. I want to be able to get specific columns, for example the Pinnacle and Bookmaker columns.
import urllib
import urllib.request
from bs4 import BeautifulSoup

theurl = "https://www.sportsbookreview.com/betting-odds/nba-basketball/totals/?date=20190419"
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")
for lines in soup.findAll('span', {"class": "_3Nv_7"}):
    print(lines.get_text())
import urllib
import urllib.request
from bs4 import BeautifulSoup

theurl = "https://www.sportsbookreview.com/betting-odds/nba-basketball/totals/?date=20190419"
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")
for lines in soup.findAll('span', {"class": "_3Nv_7 opener"}):
    print(lines.get_text())

Scraping multiple web pages has the same results as the first page using Python

My question is that I tried to get the product names from the CME Group website, but the code wasn't able to access the next page even though I changed the URL in the loop. Why is that? Any ideas and opinions on this? Thanks in advance.
from urllib.request import Request
from urllib.request import urlopen
from bs4 import BeautifulSoup

for i in range(1, 6):
    url = 'http://www.cmegroup.com/trading/products/#pageNumber=' + str(i) + '&sortAsc=false'
    CMEacess = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    print(url)
    print('page: ' + str(i))
    CMEpage = urlopen(CMEacess).read()
    CMEsoup = BeautifulSoup(CMEpage, 'html.parser')
    namelist = CMEsoup.findAll('th', attrs={'class': 'cmeTableLeft'})  # attrs must be a dict, not a set
    for name in namelist:
        print(name.get_text())
    print('\n')
You could try using the requests library rather than urllib. I just accessed page 5 successfully using code similar to yours, with this difference.
Note that the literal 'D3' appears on page five but not on page one.
>>> import requests
>>> i = 5
>>> url='http://www.cmegroup.com/trading/products/#pageNumber='+str(i)+'&sortAsc=false'
>>> page = requests.get(url).content
>>> import bs4
>>> soup = bs4.BeautifulSoup(page, 'lxml')
>>> soup.find_all(string='D3')
['D3', 'D3']
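As a quick sanity check along the same lines, you can compare two pages' raw responses directly; if they come back identical, the server is ignoring the pageNumber fragment:
>>> page1 = requests.get('http://www.cmegroup.com/trading/products/#pageNumber=1&sortAsc=false').content
>>> page5 = requests.get('http://www.cmegroup.com/trading/products/#pageNumber=5&sortAsc=false').content
>>> page1 == page5  # True would mean both requests returned the same document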
