This is the code that I wrote to fetch data from a webpage:
import urllib.request
from bs4 import BeautifulSoup

def make_soup(url):
    # request the page with a browser-like User-Agent and parse it
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    thepage = urllib.request.urlopen(req)
    soupdata = BeautifulSoup(thepage, 'html5lib')
    return soupdata
soup = make_soup("https://www.nseindia.com/live_market/dynaContent/live_analysis/top_gainers_losers.htm?cat=G")
t = soup.findAll('table')[0]
for record in t.findAll('tr'):
    print(record.td.text)
'''
for record in t.findAll('tr'):
    for data in record.findAll('td'):
        print(data.text)
'''
But this code fetches only the first tr. How do I get the values for the remaining tr elements?
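For what it's worth, record.td only returns the first td in each row, so nesting the loops (as in the commented-out block above) is what walks every cell. A minimal sketch, assuming the same soup object built above:
# Sketch: print every cell of every row in the first table.
t = soup.findAll('table')[0]
for record in t.findAll('tr'):
    cells = [data.text.strip() for data in record.findAll('td')]
    if cells:  # header rows use <th> rather than <td>, so they come back empty
        print(cells)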
I'm trying to scrape the box score table from https://www.nascar.com/stats/2021/1/box-score.
My code is not working; if someone could take a look and point me in the right direction, that would be appreciated.
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.nascar.com/stats/2021/1/box-score'
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

stats = soup.find_all('table', class_="stats-box-score-table-driver")

for row in stats.find_all('tr'):
    for cell in row.find_all('td'):
        print(cell.text)
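Two likely problems with the snippet above, as a guess: headers is never defined before it is used, and find_all returns a ResultSet, so calling find_all('tr') on stats directly raises an error. A minimal sketch under those assumptions (it also assumes the table is present in the static HTML rather than rendered by JavaScript):
import requests
from bs4 import BeautifulSoup

url = 'https://www.nascar.com/stats/2021/1/box-score'
headers = {'User-Agent': 'Mozilla/5.0'}  # define the headers before using them
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

# find_all returns a ResultSet, so loop over each matched table first
for table in soup.find_all('table', class_='stats-box-score-table-driver'):
    for row in table.find_all('tr'):
        for cell in row.find_all('td'):
            print(cell.text.strip())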
I am trying to read data from this link: https://www.nseindia.com/api/option-chain-indices?symbol=NIFTY
using the Python requests and urllib libraries. I tried both, but I am not able to see even the status code of the URL. Please suggest what is wrong. I am attaching my code as well; please take a look and tell me where I am going wrong.
import csv
import requests
from csv import reader
import xlrd
import pandas
import urllib.request
from bs4 import BeautifulSoup
# open a connection to a URL using urllib
webUrl = urllib.request.urlopen('https://www.nseindia.com/api/option-chain-indices?symbol=NIFTY')
#get the result code and print it
print ("result code: " + str(webUrl.getcode()))
# read the data from the URL and print it
data = webUrl.read()
print (data)
You need to add headers. I made this just to test it out:
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'My User Agent 1.0',
    'From': 'youremail@domain.com'  # This is another valid field
}

# request the URL with the custom headers
webUrl = requests.get('https://www.nseindia.com/api/option-chain-indices?symbol=NIFTY', headers=headers).text

# read the data from the response and print it
soup = BeautifulSoup(webUrl, 'html.parser')
print(soup.prettify())
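A small follow-up: that URL appears to be a JSON API endpoint rather than an HTML page, so the response can also be decoded directly instead of going through BeautifulSoup. A sketch, reusing the same headers (the key names inside the response are not shown here and would need to be inspected):
import requests

headers = {'User-Agent': 'My User Agent 1.0'}
resp = requests.get('https://www.nseindia.com/api/option-chain-indices?symbol=NIFTY',
                    headers=headers)
data = resp.json()    # parse the JSON body into a dict
print(data.keys())    # inspect the top-level keys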
I am trying to pull data from various columns in the odds table from this website:
https://www.sportsbookreview.com/betting-odds/nba-basketball/totals/?date=20190419
I have tried using the following code, but I am only getting the open lines. I want to be able to get specific columns, for example the Pinnacle and Bookmaker columns.
import urllib.request
from bs4 import BeautifulSoup

theurl = "https://www.sportsbookreview.com/betting-odds/nba-basketball/totals/?date=20190419"
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")

for lines in soup.findAll('span', {"class": "_3Nv_7"}):
    print(lines.get_text())
import urllib.request
from bs4 import BeautifulSoup

theurl = "https://www.sportsbookreview.com/betting-odds/nba-basketball/totals/?date=20190419"
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")

for lines in soup.findAll('span', {"class": "_3Nv_7 opener"}):
    print(lines.get_text())
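One thing worth noting, in case it is the issue: when you pass a multi-word string as the class filter, BeautifulSoup only matches elements whose class attribute is exactly that string, while a CSS selector matches elements carrying both classes in any order. A sketch along those lines (the class names _3Nv_7 and opener are taken from the question and may have changed on the site since):
import urllib.request
from bs4 import BeautifulSoup

theurl = "https://www.sportsbookreview.com/betting-odds/nba-basketball/totals/?date=20190419"
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")

# select() matches elements that carry BOTH classes, regardless of order
for span in soup.select("span._3Nv_7.opener"):
    print(span.get_text())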
I am trying to scrape info from a website (program name and program ID) and it is returning an empty list.
I am not sure if I am mixing up the syntax, but this is what I have:
soup.find_all('h3', class_='ama__h3')
The website link is https://freida.ama-assn.org/Freida/#/programs?program=residencies&specialtiesToSearch=140
from urllib.request import urlopen
from bs4 import BeautifulSoup as BS
import pandas as pd
from urllib.parse import urlparse, urlsplit
import requests

res = requests.get('https://freida.ama-assn.org/Freida/#/programs?program=residencies&specialtiesToSearch=140')
soup = BS(res.text, 'html5lib')
print(soup.prettify())
soup.find_all('h3', class_='ama__h3')
Your error is because you are parsing with html5lib. For well-formed HTML, the parser choice is not really important. However, for HTML that is not well formed (like this page), html5lib seems to have issues. You should use html.parser or lxml (apparently html.parser is the safer choice).
This code does what you want:
soup = BeautifulSoup(res.text, 'html.parser')
programs = soup.find_all("a", class_='ama__promo--background')
for program in programs:
    program_name = program.find("h3").text
    program_id = program.find_all("small")[-1].text.split(': ')[1].strip()
    print(program_name, program_id)
I have scraped a REST API, and here is my code:
import json
from pprint import pprint
import sqlite3
import datetime
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import requests
url = "https://cex.io/api/ohlcv/hd/20180125/BTC/USD"
headers = {'User-Agent':'Mozilla/5.0'}
page = requests.get(url)
soup = soup(page.text, "html.parser")
a = soup("data1d")
I want the data of "data1d" from soup, but when I try to do this it shows:
File "C:\Users\mubee\Downloads\Annaconda\lib\site-packages\bs4\element.py", line 1011, in __getitem__
return self.attrs[key]
KeyError: 'data1d'
while there is data present in "data1d" in the variable soup. How can I get only the data present in "data1d" from the variable soup?
As the page is just JSON, this is simple; there is no need for soup:
import requests

url = "https://cex.io/api/ohlcv/hd/20180125/BTC/USD"
page = requests.get(url)
page_dict = page.json()
print(page_dict['data1d'])
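Depending on the API, the value stored under 'data1d' may itself be a JSON-encoded string rather than a list; if that turns out to be the case (an assumption worth checking against the actual response), it can be decoded one more time:
import json

candles = page_dict['data1d']
if isinstance(candles, str):   # some APIs nest JSON as a string inside the outer JSON
    candles = json.loads(candles)
print(candles[:2])             # peek at the first couple of entries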