How to extract hidden li text - python-3.x

I am trying to scrape from this website, jump to each href article, and scrape the comments located right after the main body text. However, I am getting blank results. I've also tried fetching all li elements with soup.find_all('li') to check whether any comments exist, and found that even the full list of li elements did not contain any comments about the article. Can anyone advise, please? I suspect the website is making it harder to get the text.
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

urls = [
    'https://hypebeast.com/brands/jordan-brand'
]

with requests.Session() as s:
    for url in urls:
        driver = webdriver.Chrome('/Users/Documents/python/Selenium/bin/chromedriver')
        driver.get(url)
        products = [element for element in WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.XPATH, "//div[@class='post-box ']")))]
        soup = bs(driver.page_source, 'lxml')
        element = soup.select('.post-box ')
        time.sleep(1)

        # collect the href of each post on the listing page
        ahref = [item.find('a')['href'] for item in element]
        results = list(zip(ahref))
        df = pd.DataFrame(results)

        # visit each article and look for the comment list
        for result in results:
            res = driver.get(result[0])
            soup = bs(driver.page_source, 'lxml')
            time.sleep(6)
            comments_href = soup.find_all('ul', {'id': 'post-list'})
            print(comments_href)

The post/comments are in an <iframe> tag. The tag also has a dynamic attribute that starts with dsq-app. So what you'll need to do is locate that iframe, switch to it, and then you can parse. I chose to use BeautifulSoup to pull out the script tag, read it in as JSON, and navigate through there. This should hopefully get you going with pulling what you're looking for:
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
import json

urls = [
    'https://hypebeast.com/brands/jordan-brand'
]

with requests.Session() as s:
    for url in urls:
        driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
        driver.get(url)
        products = [element for element in WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.XPATH, "//div[@class='post-box ']")))]
        soup = bs(driver.page_source, 'lxml')
        element = soup.select('.post-box ')
        time.sleep(1)
        ahref = [item.find('a')['href'] for item in element]
        results = list(zip(ahref))
        df = pd.DataFrame(results)

        for result in ahref:
            driver.get(result)
            time.sleep(6)

            # the Disqus comments live in an iframe whose name starts with "dsq-app"
            iframe = driver.find_element_by_xpath('//iframe[starts-with(@name, "dsq-app")]')
            driver.switch_to.frame(iframe)

            # the comment data sits inside a <script> tag as JSON
            soup = bs(driver.page_source, 'html.parser')
            scripts = soup.find_all('script')
            for script in scripts:
                if 'response' in script.text:
                    jsonStr = script.text
                    jsonData = json.loads(jsonStr)
                    for each in jsonData['response']['posts']:
                        author = each['author']['username']
                        message = each['raw_message']
                        print('%s: %s' % (author, message))
Output:
annvee: Lemme get them BDSM jordans fam
deathb4designer: Lmao
zenmasterchen: not sure why this model needed to exist in the first place
Spawnn: Issa flop.
disqus_lEPADa2ZPn: looks like an AF1
Lekkerdan: Hoodrat shoes.
rubnalntapia: Damn this are sweet
marcellusbarnes: Dope, and I hate Jordan lows
marcellusbarnes: The little jumpman on the back is dumb
chickenboihotsauce: copping those CPFM gonna be aids
lowercasegod: L's inbound
monalisadiamante: Sold out in 4 minutes. 😑
nickpurita: Those CPFM’s r overhyped AF.
...
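If you want to keep the comments rather than just print them, one option (a minimal sketch building on the loop above, not part of the original answer; the rows list and the file name are placeholders) is to append each author/message pair to a list and build a DataFrame once the loop finishes:

rows = []  # collect one record per comment instead of printing

# inside the posts loop above, replace the print with:
#     rows.append({'article': result, 'author': author, 'message': message})

comments_df = pd.DataFrame(rows)
comments_df.to_csv('comments.csv', index=False)  # placeholder file name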

Related

Unable to scrape texts from URLs

I have been struggling to scrape the contents/text of news articles from each URL. The extraction of the URLs works fine, but scraping the text from each URL has been challenging. Below is my code:
from selenium.webdriver import ActionChains, Keys
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
import sys, time
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Initialize driver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://www.iol.co.za/news/south-africa/eastern-cape'
wait = WebDriverWait(driver, 5)
driver.get(url)
time.sleep(3)

# take the articles
articles = wait.until(EC.presence_of_all_elements_located((By.XPATH,
    f"//article//*[(name() = 'h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7') and string-length(text()) > 0]/ancestor::article")))

article_link = []
full_text = []

# For every article we take what we want
for article in articles:
    link = article.find_element(By.XPATH, f".//a")
    news_link = link.get_attribute('href')
    article_link.append(news_link)

    for j in article_link:
        news_response = requests.get(j)
        news_data = news_response.content
        news_soup = BeautifulSoup(news_data, 'html.parser')
        art_cont = news_soup.find('div', 'Article__StyledArticleContent-sc-uw4nkg-0')
        full_text.append(art_cont.text)

print(article_link)
print(full_text)
I tried to use beautifulsoup, but it doesn't seem to work. I will be grateful for any help.
First off, you should probably unindent the second for loop; it shouldn't run inside the first loop (otherwise you request the same links over and over and collect the same information countless extra times). See the sketch just below.
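As a minimal sketch of that restructuring (same variable names and imports as in the question; note the requests calls still run into the blocked-content issue described next), the link-collecting loop finishes before the text-fetching loop starts:

article_link = []
full_text = []

# first pass: collect every article link
for article in articles:
    link = article.find_element(By.XPATH, ".//a")
    article_link.append(link.get_attribute('href'))

# second pass: fetch each article exactly once
for j in article_link:
    news_response = requests.get(j)
    news_soup = BeautifulSoup(news_response.content, 'html.parser')
    art_cont = news_soup.find('div', 'Article__StyledArticleContent-sc-uw4nkg-0')
    full_text.append(art_cont.text)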
Second, the requests you send return a webpage with its content blocked (I could not find a way around this by adding headers to the request). What you can do instead is use the driver to load each of the links and grab the text from there; here is how you could do that:
for link in article_link:
    driver.get(link)
    news_data = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'Article__StyledArticleContent-sc-uw4nkg-0')))
    full_text.append(news_data[0].get_attribute('textContent'))
The full script would look like this:
from selenium.webdriver import ActionChains, Keys
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
import sys, time
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Initialize driver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://www.iol.co.za/news/south-africa/eastern-cape'
wait = WebDriverWait(driver, 5)
driver.get(url)
time.sleep(3)

# take the articles
articles = wait.until(EC.presence_of_all_elements_located((By.XPATH,
    f"//article//*[(name() = 'h1' or name()='h2' or name()='h3' or name()='h4' or name()='h5' or name()='h6' or name()='h7') and string-length(text()) > 0]/ancestor::article")))

article_link = []
full_text = []

# For every article we take what we want
for article in articles:
    link = article.find_element(By.XPATH, f".//a")
    news_link = link.get_attribute('href')
    article_link.append(news_link)

for link in article_link[:5]:
    driver.get(link)
    news_data = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'Article__StyledArticleContent-sc-uw4nkg-0')))
    full_text.append(news_data[0].get_attribute('textContent'))

print(article_link)
print(full_text)
The best course of action is to use Selenium throughout, as the site's content is protected by Cloudflare. Although @Andrew Ryan has already addressed the issue, I thought I'd offer a shorter version of it, since this answer was already halfway written at the time of his posting.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver

link = 'https://www.iol.co.za/news/south-africa/eastern-cape'

def get_links_and_texts(driver, url):
    driver.get(url)
    for article_link in [i.get_attribute('href') for i in WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.XPATH, "//article/a[starts-with(@class,'Link__StyledLink')]")))]:
        driver.get(article_link)
        art_content = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".article-content"))).text
        yield {"Link": article_link, "article_content": art_content}

if __name__ == '__main__':
    with webdriver.Chrome() as driver:
        for item in get_links_and_texts(driver, link):
            print(item)
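If you would rather collect the results into a DataFrame than print dictionaries, one option (a small sketch, not part of the original answer, assuming the pandas import from the question) is to feed the generator's output straight into pandas:

import pandas as pd

with webdriver.Chrome() as driver:
    # each yielded dict becomes one row: columns "Link" and "article_content"
    df = pd.DataFrame(list(get_links_and_texts(driver, link)))
print(df.head())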

BeautifulSoup to Data Frame With Pandas

New to programming here, so forgive the silly questions. I've been trying to work out how to use Python for web scraping, and a lot of the YouTube videos and other questions kinda get me there, but I'm having a hard time relating the answers to my actual code.
My code so far is:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as soup
import pandas as pd

url = "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/06&Racecourse=ST&RaceNo=1"
driver = webdriver.Chrome()
driver.get(url)
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")

soup_level1 = soup(driver.page_source, 'lxml')
race_soup = soup_level1.find("tbody", class_="f_fs13")
print(race_soup.text.strip())
results_soup = soup_level1.find("tbody", class_="f_fs12")
print(results_soup.text.strip())

datalist = []  # empty list
x = 0  # counter
print('good')
driver.close()
This will generate the parsed data, but now I am stuck on how to move it from text into a DataFrame with pandas. I'm sure it is simple, but none of the instructional material I've seen has clicked for me.
Also, the code so far is just copied-and-pasted chunks from different websites that I got to work through trial and error. I'm not sure if any of it is redundant, so if there is a neater way to go about it, I would appreciate that feedback as well!
Thanks in advance,
Spencer
Give this a try:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as soup
import pandas as pd

url = "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/06&Racecourse=ST&RaceNo=1"
driver = webdriver.Chrome()
driver.get(url)
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")

# wait for the results tables to render before grabbing the page source
WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, "f_fs13")))
htmlStr = driver.page_source

soup_level1 = soup(htmlStr, 'html.parser')
race_soup = soup_level1.find('tbody', {'class': 'f_fs13'}).find_parent('table')
results_soup = soup_level1.find('tbody', {'class': 'f_fs12'}).find_parent('table')

df1 = pd.read_html(str(race_soup))[0]
print(df1)

df2 = pd.read_html(str(results_soup))[0]
print(df2)

datalist = []  # empty list
x = 0  # counter
print('good')
driver.close()
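If the end goal is a CSV file rather than a printed frame, the two DataFrames can be written out directly (a small sketch; the file names below are just placeholders):

# persist both tables; the file names are arbitrary
df1.to_csv('race_info.csv', index=False)
df2.to_csv('race_results.csv', index=False)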

Why is my Selenium code returning only half of the data I requested

Recently, I wrote a Selenium web scraper that is meant to extract all the information from a table containing data on all the presidential elections that have been held in the United States. The table is on this Wikipedia page.
The problem is that the code returns all the info I need when I write the result into a .txt file. But anytime I try to print that same result in my text editor, it returns only half of the data I need. I do not understand what the problem is. Can someone help me out?
Here is my code.
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas

# using selenium and chromedriver to extract the javascript wikipage
scrape_options = Options()
scrape_options.add_argument('--headless')
driver = webdriver.Chrome(r'web scraping master/chromedriver', options=scrape_options)
page_info = driver.get('https://en.wikipedia.org/wiki/United_States_presidential_election')

# waiting for the javascript to load
try:
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "wikitable")))
finally:
    page = driver.page_source

soup = BeautifulSoup(page, 'html.parser')
table = soup.find('table', {'class': 'wikitable sortable jquery-tablesorter'})
print(table)

with open("loge.txt", "w") as f:  # Only part I added to the code
    f.write(str(table))
I'm not really sure what the problem was, but this works as expected. I've changed loge.txt to loge.html, and the code dumps the entire table.
Mind trying this?
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

scrape_options = Options()
scrape_options.add_argument('--headless')
driver = webdriver.Chrome(options=scrape_options)
page_info = driver.get('https://en.wikipedia.org/wiki/United_States_presidential_election')

try:
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "wikitable")))
finally:
    page = driver.page_source

soup = BeautifulSoup(page, 'html.parser')
table = soup.find('table', {'class': 'wikitable sortable jquery-tablesorter'})

with open("loge.html", "w") as f:
    f.write(str(table))
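Since the question already imports pandas, a natural follow-up (a sketch, not part of the original answer) is to parse the extracted table straight into a DataFrame instead of only dumping the HTML:

import pandas as pd

# read_html parses the <table> markup captured above into a DataFrame
df = pd.read_html(str(table))[0]
print(df.head())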

How do I scrape food menu from zomato page?

I am trying to scrape food menu data from Zomato using Selenium. While inspecting the elements I can find the class 'category_heading', but using it in the code gives no result and returns an empty list. I am attaching a snippet of the code. Thanks.
I have tried browser.find_element_by_xpath as well as find_element_by_class_name and by tag, but nothing seems to work.
order_now = browser.find_element_by_xpath("//*[@id='orig-search-list']/div[1]/div[2]/a").click()
browser.maximize_window()
browser.implicitly_wait(20)
food_item = browser.find_elements_by_class_name("category_heading")
print('food',food_item)
I need the food menu data so that I can store it in a csv.
The page can be slow to load. Try using a wait condition:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as bs

d = webdriver.Chrome()
d.get('https://www.zomato.com/bangalore/burgers-kingdom-indiranagar-bangalore/order')
rows = WebDriverWait(d, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".row")))
soup = bs(d.page_source, 'lxml')

for category in soup.select('.category-container'):
    title = category.select_one('h3').text
    print(title)
    items = [i.text for i in category.select('.category_heading')]
    if items:
        print(items)
    else:
        print('No sub-headers')
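Since the stated goal is a CSV, the same loop can collect rows and write them out at the end (a small sketch built on the soup object above; the column names and file name are placeholders):

import pandas as pd

menu_rows = []
for category in soup.select('.category-container'):
    title = category.select_one('h3').text
    for i in category.select('.category_heading'):
        menu_rows.append({'category': title, 'item': i.text})

# write the collected menu to disk
pd.DataFrame(menu_rows).to_csv('menu.csv', index=False)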

Beautiful Soup / urllib does not read some HTML tags

I recently started using Beautiful Soup. For practice I'm trying to scrape this website.
There are some div tags that the scraper does not seem to be able to access; even the URL reader does not read them. The HTML does not indicate that JavaScript is used for the part that is not being read, so in theory I assume Selenium is not needed. Specifically, the div tags under the ID "ajaxTarget" are the concern. The code below returns some elements, but the majority of the div tags under that ID are not read.
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

def main():
    url_link = 'https://www.countryflags.com/en/'
    req = Request(url_link, headers={'User-Agent': 'Mozilla/5.0'})
    page = urlopen(req).read()
    soup = BeautifulSoup(page, features='lxml')
    div_master_container = soup.findAll('div', attrs={'id': 'ajaxTarget'})
    print(len(div_master_container))
    for item in div_master_container:
        print(item)

if __name__ == '__main__':
    main()
I would appreciate it if someone could point out whether there is an HTML element I am missing, or any other factor contributing to this issue.
JavaScript needs to run on the page. Use a method like Selenium:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url ='https://www.countryflags.com/en/'
driver = webdriver.Chrome()
driver.get(url)
items = [[item.get_attribute('href'),item.get_attribute('title')] for item in WebDriverWait(driver,30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#ajaxTarget [title]")))]
print(items)
print(len(items))
#driver.quit()
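If you want the link/title pairs in tabular form, the list of lists collected above maps straight onto a DataFrame (a sketch, not part of the original answer):

import pandas as pd

# each [href, title] pair becomes one row
df = pd.DataFrame(items, columns=['href', 'title'])
print(df.head())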
