Can't find specific table using BeautifulSoup - python-3.x

I have been using BeautifulSoup to scrape the pricing information from
"https://www.huaweicloud.com/pricing.html#/ecs"
I want to extract the table data from that page, but I get nothing.
I am using Windows 10, the latest BeautifulSoup and Requests, and Python 3.7.
import requests
from bs4 import BeautifulSoup
url = 'https://www.huaweicloud.com/pricing.html#/ecs'
headers = {'User-Agent':'Mozilla/5.0'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
soup.find_all('table')
After running soup.find_all('table'), it returns an empty list: []

I know this is not a direct answer to your question, but it might help: the pricing table is rendered client-side by JavaScript, so it never appears in the raw HTML that requests downloads, which is why find_all('table') returns an empty list. This is the code I came up with using Selenium and BeautifulSoup. You just have to specify the location of chromedriver, and the script is good to go.
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.huaweicloud.com/pricing.html#/ecs'
driver = webdriver.Chrome("location of chrome driver")
driver.get(url)
# open the ECS pricing tab and give the JavaScript time to render the tables
driver.find_element_by_id("calculator_tab0").click()
time.sleep(3)
html_source = driver.page_source
soup = BeautifulSoup(html_source, features="lxml")

table_all = soup.findAll("table")
output_rows = []
for table in table_all[:2]:
    for table_row in table.findAll('tr'):
        thead = table_row.findAll('th')
        columns = table_row.findAll('td')
        _thead = []
        for th in thead:
            _thead.append(th.text)
        output_rows.append(_thead)
        _row = []
        for column in columns:
            _row.append(column.text)
        output_rows.append(_row)

# drop the empty rows produced by header-only or data-only <tr> elements
output_rows = [x for x in output_rows if x != []]
df = pd.DataFrame(output_rows)
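As a side note, once the rendered HTML is in hand, pandas can parse the tables directly and skip the manual row loop. A minimal sketch, assuming the same html_source captured above (pd.read_html needs lxml or html5lib installed):

import pandas as pd

# pandas parses every <table> in the rendered HTML into a list of DataFrames
dfs = pd.read_html(html_source)
print(dfs[0].head())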

Related

How to scrape dynamic content with beautifulsoup?

Here's my script:
import warnings
warnings.filterwarnings("ignore")
import re
import json
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
URLs = ['https://www.frayssinet-joaillier.fr/fr/p/montre-the-longines-legend-diver-l37744302-bdc2']
TypeVendor = []
NameVendor = []
Marques = []
Brands = []
Refs = []
Prices = []
#Carts = []
#Links = []
Links = []
#df = pd.read_csv('testlink4.csv')
n = 1
for url in URLs:
    results = requests.get(url)
    soup = BeautifulSoup(results.text, "html.parser")
    TypeVendor.append('Distributeur')
    NameVendor.append('Frayssinet')
    Marques.append('Longines')
    Brands.append(soup.find('span', class_='main-detail__name').text)
    Refs.append(soup.find('span', class_='main-detail__ref').text)
    Prices.append(soup.find('span', class_='prix').text)
    Links.append(url)
I understand why it doesn't work: .text isn't suited to dynamic content. But I cannot figure out how to scrape this kind of content. I know that if you find where the JSON data is stored, you can work with it and scrape the data.
But I checked the Network tab in the Google developer tools and didn't find anything.
Set headers on your request, and store your information in a more structured way.
Example
import requests
from bs4 import BeautifulSoup
import pandas as pd
headers = {'User-Agent': 'Mozilla/5.0'}
URLs = ['https://www.frayssinet-joaillier.fr/fr/p/montre-the-longines-legend-diver-l37744302-bdc2']
data = []
for url in URLs:
    results = requests.get(url, headers=headers)
    soup = BeautifulSoup(results.text, "html.parser")
    data.append({
        'name': soup.find('span', class_='main-detail__name').get_text(strip=True),
        'brand': soup.find('span', class_='main-detail__marque').get_text(strip=True),
        'ref': soup.find('span', class_='main-detail__ref').get_text(strip=True),
        'price': soup.find('span', {'itemprop': 'price'}).get('content'),
        'url': url
    })

pd.DataFrame(data)
Output

name | brand | ref | price | url
Montre The Longines Legend Diver L3.774.4.30.2 | Longines | Référence : L3.774.4.30.2 | 2240 | https://www.frayssinet-joaillier.fr/fr/p/montre-the-longines-legend-diver-l37744302-bdc2
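Note how the price is taken from the content attribute of the span carrying itemprop='price': that is schema.org microdata, which is usually more stable than scraping rendered text. As a hedged sketch (it assumes the page exposes additional itemprop attributes, which is not guaranteed), all microdata properties can be collected at once:

# assumption: the page may carry more schema.org itemprop attributes than
# just 'price'; collect whatever is there into one dict
meta = {tag['itemprop']: tag.get('content') or tag.get_text(strip=True)
        for tag in soup.find_all(attrs={'itemprop': True})}
print(meta.get('price'))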

How to extract names and links from a given website - python

For the website mentioned below, I am trying to find each name and its corresponding link from that site, but I'm not able to extract the data at all.
Using BeautifulSoup
from bs4 import BeautifulSoup
import requests
source = requests.get('https://mommypoppins.com/events/115/los-angeles/all/tag/all/age/all/all/deals/0/near/0/0')
soup = BeautifulSoup(source.text, 'html.parser')
mains = soup.find_all("div", {"class": "list-container-wrapper"})
name = []
lnks = []
for main in mains:
    name.append(main.find("a").text)
    lnks.append(main.find("a").get('href'))
Using Selenium webdriver
from selenium import webdriver
driver = webdriver.Chrome(executable_path=r"chromedriver_win32\chromedriver.exe")
driver.get("https://mommypoppins.com/events/115/los-angeles/all/tag/all/age/all/all/deals/0/near/0/0")
lnks = []
name = []
for a in driver.find_elements_by_class_name('ng-star-inserted'):
    link = a.get_attribute('href')
    lnks.append(link)

nm = driver.find_element_by_css_selector("#list-item-0 > div > h2 > a").text
name.append(nm)
I have tried both of the above methods. Expected output, for example:
name = ['Friday Night Flicks Drive-In at the Roadium', 'Open: Butterfly Pavilion and Nature Gardens']
lnks = ['https://mommypoppins.com/los-angeles-kids/event/in-person/friday-night-flicks-drive-in-at-the-roadium','https://mommypoppins.com/los-angeles-kids/event/in-person/open-butterfly-pavilion-and-nature-gardens']
Here's a solution with webdriver:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
driver = webdriver.Chrome()
driver.get('https://mommypoppins.com/events/115/los-angeles/all/tag/all/age/all/all/deals/0/near/0/0')
time.sleep(3)
elements = driver.find_elements(By.XPATH, "//a[@angularticsaction='expanded-detail']")
attributes = [{el.text: el.get_attribute('href')} for el in elements]
print(attributes)
print(len(attributes))
driver.quit()
Here's a solution with webdriver and bs4:
import time
from selenium import webdriver
from bs4 import BeautifulSoup
driver = webdriver.Chrome()
driver.get('https://mommypoppins.com/events/115/los-angeles/all/tag/all/age/all/all/deals/0/near/0/0')
time.sleep(3)
soup = BeautifulSoup(driver.page_source, 'html.parser')
mains = soup.find_all("a", {"angularticsaction": "expanded-detail"})
attributes = [{el.text: el.get('href')} for el in mains]
print(attributes)
print(len(attributes))
driver.quit()
Here's a solution with requests:
import requests
url = "https://mommypoppins.com"
response = requests.get(f"{url}/contentasjson/custom_data/events_ng-block_1x/0/115/all/all/all/all/all").json()
attributes = [{r.get('node_title'): f"{url}{r['node'][r['nid']]['node_url']}"} for r in response['results']]
print(attributes)
print(len(attributes))
cheers!
The website is loaded dynamically, so requests alone won't capture it. However, the data is available in JSON format via a GET request to:
https://mommypoppins.com/contentasjson/custom_data/events_ng-block_1x/0/115/all/all/all/all/all
There's no need for BeautifulSoup or Selenium; using requests alone will work, and it will make your code much faster.
import requests
URL = "https://mommypoppins.com/contentasjson/custom_data/events_ng-block_1x/0/115/all/all/all/all/all"
BASE_URL = "https://mommypoppins.com"
response = requests.get(URL).json()
names = []
links = []
for json_data in response["results"]:
    data = json_data["node"][json_data["nid"]]
    names.append(data["title"])
    links.append(BASE_URL + data["node_url"])
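If you want the names and links side by side, the two lists can feed straight into a DataFrame; a small follow-up sketch, assuming pandas is available:

import pandas as pd

# pair each event name with its full link
df = pd.DataFrame({"name": names, "link": links})
print(df.head())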

How can I scrape football results from flashscore using python

Web scraping with Python: I am new to scraping. I want to scrape the Premier League season 2018-19 results (fixtures, results, dates), but I am struggling to navigate the website; all I get is an empty list / [None]. If you have a solution you can share, that would be a great help.
Here's what I tried:
import pandas as pd
import requests as uReq
from bs4 import BeautifulSoup

url = uReq.get('https://www.flashscore.com/football/england/premier-league-2018-2019/results/')
soup = BeautifulSoup(url.text, 'html.parser')
divs = soup.find_all('div', attrs={'id': 'live-table'})
Home = []
for div in divs:
    anchor = div.find(class_='event__participant event__participant--home')
    Home.append(anchor)
print(Home)
You will have to install requests_html for my solution. Here is how I would go about it:
from requests_html import AsyncHTMLSession
from collections import defaultdict
import pandas as pd

url = 'https://www.flashscore.com/football/england/premier-league-2018-2019/results/'
asession = AsyncHTMLSession()

async def get_scores():
    r = await asession.get(url)
    await r.html.arender()  # render the JavaScript-generated content
    return r

results = asession.run(get_scores)
results = results[0]

times = results.html.find("div.event__time")
home_teams = results.html.find("div.event__participant.event__participant--home")
scores = results.html.find("div.event__scores.fontBold")
away_teams = results.html.find("div.event__participant.event__participant--away")
event_part = results.html.find("div.event__part")

dict_res = defaultdict(list)
for ind in range(len(times)):
    dict_res['times'].append(times[ind].text)
    dict_res['home_teams'].append(home_teams[ind].text)
    dict_res['scores'].append(scores[ind].text)
    dict_res['away_teams'].append(away_teams[ind].text)
    dict_res['event_part'].append(event_part[ind].text)

df_res = pd.DataFrame(dict_res)
This generates a DataFrame with columns for the kick-off time, home team, score, away team, and match status.
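One caveat: the loop indexes five parallel lists and will raise an IndexError if any selector matches fewer elements than times. A defensive variant, not from the original answer, zips the lists so iteration stops at the shortest:

# zip() stops at the shortest list, so a missing element can't crash the loop
rows = zip(times, home_teams, scores, away_teams, event_part)
df_res = pd.DataFrame(
    [[el.text for el in row] for row in rows],
    columns=['time', 'home', 'score', 'away', 'status'],
)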

I need to scrape the job description text for every job title on the mentioned page

I need to scrape the job descriptions on the page () for every job title, e.g. section (Accounting) → job title (Staff Accountant) → the job description text under that title, into separate columns of a CSV file, using Python's BeautifulSoup module.
I'm new to BeautifulSoup; I tried a few ways of doing it but it's not working. Can you please help with the code?
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
start = time.time()
url = ""
data = []
while True:
    resp = requests.get(url)
    soup = BeautifulSoup(resp.content, 'lxml')
    jobdesc = soup.find("li", {'class': 'col-xs-12 col-sm-4'})
    section = soup.find("h4")
    jd = {"jobdescription": jobdesc.text, "topic": section.text}
    data.append(jd)

df = pd.DataFrame(data)
df.to_csv("JD.csv")
Here is one way, leveraging :has (bs4 4.7.1+) to isolate the sections for looping. zip_longest is used so we can join the section title onto each job.
import requests, csv
from bs4 import BeautifulSoup as bs
from itertools import zip_longest
r = requests.get('https://resources.workable.com/job-descriptions/#', headers = {'User-Agent':'Mozilla/5.0'})
soup = bs(r.content, 'lxml')
with open("data.csv", "w", encoding="utf-8-sig", newline='') as csv_file:
    w = csv.writer(csv_file, delimiter=",", quoting=csv.QUOTE_MINIMAL)
    w.writerow(['Section', 'Job Title'])
    for section in soup.select('section:has(.job)'):
        title = section.select_one('a').text.strip()
        jobs = [job.text for job in section.select('li a')]
        rows = list(zip_longest([title], jobs, fillvalue=title))
        for row in rows:
            w.writerow(row)
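To see why zip_longest works here: zipping the one-element title list against the longer jobs list, with fillvalue set to the title, repeats the section title for every job. A tiny standalone illustration, with made-up job titles beyond the question's Staff Accountant example:

from itertools import zip_longest

title = ['Accounting']
jobs = ['Staff Accountant', 'Payroll Clerk', 'Auditor']  # sample data
# fillvalue pads the shorter list, repeating the section title per job
print(list(zip_longest(title, jobs, fillvalue=title[0])))
# [('Accounting', 'Staff Accountant'), ('Accounting', 'Payroll Clerk'),
#  ('Accounting', 'Auditor')]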
I got a 403 Forbidden with the requests package, so I decided to use Selenium.
You can try this:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
from selenium import webdriver
url = "https://resources.workable.com/job-descriptions/#"
data = []
#resp = requests.get(url)
#soup = BeautifulSoup(resp.text, 'html.parser')
driver = webdriver.Firefox()
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
section = soup.find_all('section', {'class': 'box-white'})
for s in section:
    title = s.find('h4').text
    # search within the current section only, not the whole page
    lis = s.find_all("li", {'class': 'col-xs-12 col-sm-4'})
    for li in lis:
        jd = {"jobdescription": li.text, "topic": title}
        data.append(jd)

df = pd.DataFrame(data)
df.to_csv("JD.csv")
EDIT: To get the description for all jobs:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
from selenium import webdriver
url = "https://resources.workable.com/job-descriptions/#"
data = []
#resp = requests.get(url)
#soup = BeautifulSoup(resp.text, 'html.parser')
driver = webdriver.Firefox()
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
section = soup.find_all('section', {'class': 'box-white'})
for s in section:
    title = s.find('h4').text
    lis = s.find_all("li", {'class': 'col-xs-12 col-sm-4'})
    for li in lis:
        job = li.text
        # open each job's detail page to grab the full description
        driver.get(li.find('a').get('href'))
        soup2 = BeautifulSoup(driver.page_source, 'html.parser')
        jd = {"job": job, "topic": title,
              "description": soup2.find('div', {'class': 'entry-content article-content'}).text}
        data.append(jd)

df = pd.DataFrame(data)
df.to_csv("JD.csv")
Scraping data from Monster jobs and uploading it to MongoDB:
from time import sleep
from selenium import webdriver
import pymongo
from pymongo.results import InsertManyResult
import os

client = pymongo.MongoClient()
mydb = client['jobs']
collection = mydb['med_title']

driver = webdriver.Chrome("C:/Users/91798/Desktop/pythn_files/chromedriver.exe")
driver.get("https://www.monsterindia.com/")
driver.implicitly_wait(9)
driver.find_element_by_id("SE_home_autocomplete").send_keys("nursing , Therapist , docter , medical ,nurse , hospital")
# for normal search use this
driver.find_element_by_xpath("//body/div[@id='themeDefault']/section[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/form[1]/div[1]/div[2]/input[1]").click()
driver.implicitly_wait(20)

temp = 1
while True:
    if temp == 5:
        break
    all_jobs = driver.find_elements_by_class_name("card-apply-content")
    link_list = []
    for job in all_jobs:
        try:
            company = ""
            com_name = job.find_elements_by_class_name("job-tittle")
            driver.implicitly_wait(1)
            for ele in com_name:
                company = ele.find_element_by_class_name('company-name').text
            job_title = ""
            for ele in com_name:
                job_title = ele.find_element_by_class_name('medium').text
            location = job.find_element_by_class_name("loc").text
            driver.implicitly_wait(1)
            lnks = job.find_elements_by_tag_name("a")
            for lnk in lnks:
                link_list.append(lnk.get_attribute('href'))
                break
            driver.implicitly_wait(1)
            desc = job.find_element_by_class_name("job-descrip").text
            driver.implicitly_wait(1)
            skills = job.find_element_by_class_name("descrip-skills").text
        except:
            desc = 'desc Not Specified'
            skills = 'skills Not Specified'
            location = 'location Not Specified'
            company = 'company Not Specified'
            job_title = 'job_title not specified'
        # drop stray comma tokens from the skills list
        s = skills.split(' ')
        for i in s:
            if i == ',':
                s.remove(',')
        data = {"job_title": job_title, "comapany_name": company,
                "job_location": location, "job_desc": desc,
                "skills": s[2::], "card_link": link_list[0]}
        link_list.clear()
        y = collection.insert_one(data)
        print(y.inserted_id)
    driver.find_element_by_xpath("//button[contains(text(),'Next')]").click()
    sleep(25)
    temp = temp + 1
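A side note on the waits: implicitly_wait mixed with fixed sleep(25) calls makes the scrape slow and brittle. Selenium's explicit waits block only until a condition is met; a small sketch, reusing the card-apply-content class from the code above:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 20 s for at least one job card, then continue immediately
wait = WebDriverWait(driver, 20)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "card-apply-content")))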

Not able to use BeautifulSoup to get span content of Nasdaq100 future

from bs4 import BeautifulSoup
import re
import requests

url = 'www.barchart.com/futures/quotes/NQU18'
r = requests.get("https://" + url)
data = r.text
soup = BeautifulSoup(data)
price = soup.find('span', {'class': 'last-change',
                           'data-ng-class': "highlightValue('priceChange')"}).text
print(price)
Result:
[[ item.priceChange ]]
That is the Angular template placeholder, not the span's rendered content; the result should be the price. Where am I going wrong? And a second question: how can I get the trade time?
Use price = soup.find('span', {'class': 'up'}).text instead to get the +X.XX value:
from bs4 import BeautifulSoup
import requests
url = 'www.barchart.com/futures/quotes/NQU18'
r = requests.get("https://" + url)
data = r.text
soup = BeautifulSoup(data, "lxml")
price = soup.find('span', {'class': 'up'}).text
print(price)
Output currently is:
+74.75
The tradeTime you seek doesn't appear to be present in the page source, since it's generated dynamically through JavaScript. You can, however, find it elsewhere if you're a little clever, by using the json library to parse the JSON data from a certain script element:
import json

# reuses the soup object from the requests call above
trade_time = soup.find('script', {"id": 'barchart-www-inline-data'}).text
json_data = json.loads(trade_time)
print(json_data["NQU18"]["quote"]["tradeTime"])
This outputs:
2018-06-14T18:14:05
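For completeness, here is a self-contained sketch of the same idea; it assumes the page still serves the barchart-www-inline-data script element, which may have changed since:

import json
import requests
from bs4 import BeautifulSoup

r = requests.get("https://www.barchart.com/futures/quotes/NQU18",
                 headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(r.text, 'lxml')
# the inline-data script embeds the quote JSON, including tradeTime
script = soup.find('script', {'id': 'barchart-www-inline-data'})
data = json.loads(script.text)
print(data["NQU18"]["quote"]["tradeTime"])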
If these don't solve your problem, then you will have to resort to something like Selenium, which can run JavaScript, to get what you're looking for:
from selenium import webdriver

driver = webdriver.Chrome()
url = "https://www.barchart.com/futures/quotes/NQU18"
driver.get(url)
result = driver.find_element_by_xpath('//*[@id="main-content-column"]/div/div[1]/div[2]/span[2]/span[1]')
print(result.text)
Currently the output is:
-13.00
