How do I extract data from the table using python? - python-3.x

I am trying to get the data from the table shown on this website 'https://www.qualitydiamonds.co.uk/one-carat-loose-diamonds/#'
I have tried the code below, but I am only able to get the data from the 4 main diamonds appearing on the webpage and none of the data from the actual table.
import requests
from bs4 import BeautifulSoup

# Fetch the listing page and show the HTTP response object.
url = "https://www.qualitydiamonds.co.uk/one-carat-loose-diamonds/"
page = requests.get(url)
print(page)

# Parse the returned HTML and collect every <span class="price"> element.
document = BeautifulSoup(page.text, 'html.parser')
price_spans = document.findAll('span', class_='price')
print(price_spans)

You can try the code below:
import requests
import bs4 as bs

# Download the page and parse it with the lxml parser.
page = requests.get("https://www.qualitydiamonds.co.uk/one-carat-loose-diamonds/")
document = bs.BeautifulSoup(page.text, 'lxml')

# find() returns only the first element whose class is "price".
first_price = document.find(class_='price')
print(first_price.text)

Related

How to only retrieve the tag I specify using BeautifulSoup

I just want the written text out of this website: https://algorithms-tour.stitchfix.com/ so I can put it in Word doc and read it.
When I run the code, I get all the html and the tags, at the very end I get what I want, but I just want to separate the text.
import requests
from bs4 import BeautifulSoup

# Download the page and parse it with the built-in HTML parser.
page = requests.get("https://algorithms-tour.stitchfix.com")
document = BeautifulSoup(page.text, "html.parser")

# Printing the find_all result shows the <p> tags themselves, markup included.
paragraphs = document.find_all("p")
print(paragraphs)
Is there a way to get just content so I can clean it up some more?
You have a few options for this. If you only want text found within p tags, you can do this:
import requests
from bs4 import BeautifulSoup

url = "https://algorithms-tour.stitchfix.com"
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, "html.parser")

# find_all returns every matching <p> tag, so each one is handled in turn.
items = soup.find_all("p")

# Tag.string is None whenever a paragraph holds more than one child
# (for example text mixed with <a> or <em>), which would leave None entries
# in the output. get_text() joins all of the nested text instead.
result = [item.get_text() for item in items]
print(result)
Note that soup.find_all returns an iterable list, and not a single object.
An alternative and easier method is to just use soup.get_text():
import requests
from bs4 import BeautifulSoup

# Download the page and parse it with the built-in HTML parser.
page = requests.get("https://algorithms-tour.stitchfix.com")
document = BeautifulSoup(page.text, "html.parser")

# get_text() is called on the whole document rather than on individual tags.
print(document.get_text())

How to scrape from web all children of an attribute with one class?

I have tried to get the highlighted area (in the screenshot) of the website using BeautifulSoup4, but I cannot get what I want. Maybe you can recommend another way of doing it.
Screenshot of the website I need to get data from
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip
import urllib
import csv
import html5lib

urls = [
    'https://e-mehkeme.gov.az/Public/Cases?page=1',
    'https://e-mehkeme.gov.az/Public/Cases?page=2',
]

# scrape elements
for url in urls:
    page = requests.get(url)
    parsed = BeautifulSoup(page.content, "html.parser")
    # Collect the <input> elements carrying both the "casedetail" and "filled" classes.
    content = parsed.findAll("input", class_="casedetail filled")
    print(content)
My expected output is like this:
Ətraflı məlumat:
İşə baxan hakim və ya tərkib
Xəyalə Cəmilova - sədrlik edən hakim
İlham Kərimli - tərkib üzvü
İsmayıl Xəlilov - tərkib üzvü
Tərəflər
Cavabdeh: MAHMUDOV MAQSUD SOLTAN OĞLU
Cavabdeh: MAHMUDOV MAHMUD SOLTAN OĞLU
İddiaçı: QƏHRƏMANOVA AYNA NUĞAY QIZI
İşin mahiyyəti
Mənzil mübahisələri - Mənzildən çıxarılma
Using the base URL, first get all the case IDs, then pass each case ID to the target URL and get the value of the first td tag.
import requests
from bs4 import BeautifulSoup

urls = [
    'https://e-mehkeme.gov.az/Public/Cases?page=1',
    'https://e-mehkeme.gov.az/Public/Cases?page=2',
]
target_url = "https://e-mehkeme.gov.az/Public/CaseDetail?caseId={}"

for url in urls:
    listing = BeautifulSoup(requests.get(url).content, "html.parser")
    # The value attribute of each input.casedetail element is used as the caseId.
    for case_input in listing.select('input.casedetail'):
        detail_url = target_url.format(case_input['value'])
        detail_page = requests.get(detail_url)
        detail = BeautifulSoup(detail_page.content, 'html.parser')
        # Only the text of the first <td> on the detail page is printed.
        print(detail.select_one("td").text)
I would write it this way, extracting the id that needs to be put in the GET request for the detailed info:
import requests
from bs4 import BeautifulSoup as bs

urls = [
    'https://e-mehkeme.gov.az/Public/Cases?page=1',
    'https://e-mehkeme.gov.az/Public/Cases?page=2',
]


def get_soup(url):
    """Fetch *url* through the module-level session ``s`` and parse it with lxml."""
    page = s.get(url)
    return bs(page.content, 'lxml')


# One session is shared by every request made through get_soup.
with requests.Session() as s:
    for url in urls:
        listing = get_soup(url)
        # Build the detail-page URL for every element with class "caseId".
        detail_urls = [
            f'https://e-mehkeme.gov.az/Public/CaseDetail?caseId={i["value"]}'
            for i in listing.select('.caseId')
        ]
        for next_url in detail_urls:
            cell = get_soup(next_url).select_one('[colspan="4"]')
            # stripped_strings yields the cell's text fragments without
            # surrounding whitespace.
            data = list(cell.stripped_strings)
            print(data)

BeautifulSoup python: Get the text with no tags and get the adjacent links

I am trying to extract the movie titles and links for it from this site
from bs4 import BeautifulSoup
from requests import get

link = "https://tamilrockerrs.ch"

# Download the raw bytes of the page and parse them with html5lib.
raw_page = get(link).content
#raw_page = open('json.html','rb').read()
document = BeautifulSoup(raw_page, 'html5lib')

# Keep only the second <p> element of the document.
second_paragraph = document.findAll('p')[1]
But the problem is that there is no tag for the titles. I can't extract the titles, and even if I could, how can I bind the links and titles together?
Thanks in Advance
You can find the title and link this way.
from bs4 import BeautifulSoup
import requests

url = "http://tamilrockerrs.ch"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

# Each <div class="title"> is expected to wrap one <a>; its text is the
# title and its href is the link.
for film in soup.find_all('div', {"class": "title"}):
    anchor = film.find('a')
    print("Title:", anchor.text)
    print("Link:", anchor.get("href"))

Not able to use BeautifulSoup to get span content of Nasdaq100 future

from bs4 import BeautifulSoup
import re
import requests

url = 'www.barchart.com/futures/quotes/NQU18'
r = requests.get("https://" + url)
data = r.text

# Name the parser explicitly so the result does not depend on which parser
# libraries happen to be installed.
soup = BeautifulSoup(data, "html.parser")

# Look up the span by its class together with its data-ng-class attribute.
# The attribute value must be written with plain ASCII quotes; typographic
# quotes would make this string literal a syntax error.
price = soup.find('span', {'class': 'last-change',
                           'data-ng-class': "highlightValue('priceChange')"}).text
print(price)
Result:
[[ item.priceChange ]]
It is not the span content. The result should be price. Where am I going wrong?
The following is the span tag of the page:
2nd screenshot: How can I get the time?
Use price = soup.find('span', {'class': 'up'}).text instead to get the +X.XX value:
from bs4 import BeautifulSoup
import requests

url = 'www.barchart.com/futures/quotes/NQU18'

# The scheme is prepended here because the stored url has none.
page = requests.get("https://" + url)
soup = BeautifulSoup(page.text, "lxml")

# Take the text of the first <span class="up"> in the document.
up_span = soup.find('span', {'class': 'up'})
price = up_span.text
print(price)
Output currently is:
+74.75
The tradeTime you seek seems to not be present in the page_source, since it's dynamically generated through JavaScript. You can, however, find it elsewhere if you're a little clever, and use the json library to parse the JSON data from a certain script element:
import json

# `soup` is the parsed document produced by the preceding snippet.
inline_script = soup.find('script', {"id": 'barchart-www-inline-data'})

# The script element's text is parsed as JSON and indexed by symbol.
json_data = json.loads(inline_script.text)
print(json_data["NQU18"]["quote"]["tradeTime"])
This outputs:
2018-06-14T18:14:05
If these don't solve your problem then you will have to resort to something like Selenium that can run JavaScript to get what you're looking for:
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
url = "https://www.barchart.com/futures/quotes/NQU18"
try:
    driver.get(url)
    # XPath selects attributes with '@'; '#id' is not valid XPath, so the
    # predicate must read [@id="..."]. find_element(By.XPATH, ...) is used
    # because the find_element_by_* helpers were removed in Selenium 4.
    result = driver.find_element(
        By.XPATH,
        '//*[@id="main-content-column"]/div/div[1]/div[2]/span[2]/span[1]',
    )
    print(result.text)
finally:
    # Always close the browser, even when the element lookup fails.
    driver.quit()
Currently the output is:
-13.00

Parsing through HTML to extract data from table rows with beautiful soup

I'm using BeautifulSoup to extract stock information from the NASDAQ website. I want to retrieve information specifically from the table rows on the HTML page but I am always getting an error (line 12).
# Import the HTML parser wrapper (BeautifulSoup) and the HTTP GET helper.
from bs4 import BeautifulSoup
from requests import get
url = 'https://www.nasdaq.com/symbol/amzn' #AMZN is just an example
response = get(url)
#Create parse tree (BeautifulSoup Object)
soup = BeautifulSoup(response.text, 'html.parser')
# find_all returns a collection of every matching element, not a single element.
data = soup.find_all(class_= 'column span-1-of-2')
# NOTE(review): the reported error is presumably an AttributeError, because the
# collection returned by find_all has no find() method — confirm against the traceback.
table = data.find(class_= 'table-row') #This is where the error occurs
print(table)
You can do something like this to get the data from table rows.
import re

import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.nasdaq.com/")
print(r)
soup = BeautifulSoup(r.content, 'html.parser')

# The data is read from the text of the <script> nested inside the index table.
index_table = soup.find('table', {'id': 'indexTable', 'class': 'floatL marginB5px'})
data = index_table.script.text

# Captures whatever follows "storeIndexInfo" up to the terminating ";\r\n".
call_pattern = re.compile(r'nasdaqHomeIndexChart.storeIndexInfo(.*);\r\n')
# Greedy: spans from the first to the last double quote within one row.
quoted_pattern = re.compile(r'\".*\"')

matches = call_pattern.findall(data)
table_rows = [quoted_pattern.findall(row) for row in matches]
print(table_rows)
table_rows is a list of lists containing the table data.

Resources