Getting comments in an article with PhantomJS - python-3.x

I am trying to extract the comments on this website (see the screenshot of the inspected element).
I tried using urllib for this purpose, but to no avail. Then I realized that JavaScript has to run for the comments to appear, so I used Selenium with PhantomJS to extract them, as in the following Python 3 code:
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.PhantomJS('phantomjs')
url='http://www.faz.net/aktuell/politik/inland/sterbehilfe-im-bundestag-unmoralisches-angebot-13887916.html'
driver.get(url)
htm_doc = driver.page_source
soup = BeautifulSoup(htm_doc, 'html.parser')
print (soup.find('div', attrs={'id','lesermeinungen'}))
Since the comments load while the page loads, I simply access the page source and check whether there is anything under the element with the id 'lesermeinungen', as this is the section that appears when I inspect the comments.
However, the result is None.
UPDATE
I tried the following code:
from bs4 import BeautifulSoup
import selenium.webdriver.support.ui as ui
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
driver = webdriver.PhantomJS('phantomjs')
url='http://www.faz.net/aktuell/politik/inland/sterbehilfe-im-bundestag-unmoralisches-angebot-13887916.html'
driver.get(url)
wait = ui.WebDriverWait(driver,3)
try:
    wait.until(driver.find_element_by_id('lesermeinungen'))
    htm_doc = driver.page_source
    soup = BeautifulSoup(htm_doc, 'html.parser')
    print (soup.find('div', attrs={'id','lesermeinungen'}))
except TimeoutException:
    print ("Loading took too much time!")
No result even after 2 hours

You have a typo when searching for the element with BeautifulSoup. Instead of
print (soup.find('div', attrs={'id','lesermeinungen'}))
it should use a colon, not a comma:
print (soup.find('div', attrs={'id' : 'lesermeinungen'}))
With this correction your first example worked for me.
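If you still want an explicit wait (as attempted in the UPDATE), note that WebDriverWait.until expects a condition callable such as an expected_conditions helper, not the result of find_element_by_id. Below is a minimal sketch combining the corrected selector with such a wait, assuming the same PhantomJS setup as in the question:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.PhantomJS('phantomjs')
url = 'http://www.faz.net/aktuell/politik/inland/sterbehilfe-im-bundestag-unmoralisches-angebot-13887916.html'
driver.get(url)

# until() takes a callable condition, not an already-located element
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, 'lesermeinungen')))

soup = BeautifulSoup(driver.page_source, 'html.parser')
print(soup.find('div', attrs={'id': 'lesermeinungen'}))  # colon, not comma
driver.quit()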

Related

Python selenium just screenshots the first element multiple times throughout the loop

I'm trying to take a screenshot of each comment in a Reddit post using Selenium with Python. All comments have the same id/class, and that's what I have used to select them.
Here's my code:
import requests
from bs4 import BeautifulSoup
import pyttsx3, pyautogui
from PIL import Image
from io import BytesIO
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
driver = webdriver.Chrome(executable_path='C:\Selenium_Drivers\chromedriver.exe')
url = 'https://www.reddit.com/user/UoPeople09/comments/wlt4qj/what_made_you_apply_at_uopeople/'
driver.get(url)
driver.implicitly_wait(5)
total_height = int(driver.execute_script("return document.body.scrollHeight"))
u = 1
for i in range(1, total_height*2, 50):
    driver.execute_script(f"window.scrollTo(0, {i})")
    comment = driver.find_element(By.CSS_SELECTOR, 'div#t1_ikllxsq._3sf33-9rVAO_v4y0pIW_CH')
    comment.screenshot(f'E:\WEB SCRAPING PROJECTS\PROJECTS\Reddit Scraping\shot{u}.png')
    u += 1
My code scrolls down the page and saves screenshots to my desired path, but the problem is that all the screenshots are of the first element (comment) in the Reddit post.
I want my code to save a screenshot of each comment separately. Need help.
Here you have an example that includes scrolling to the end of the page:
# Needed libs
from selenium.webdriver import ActionChains, Keys
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
# Initialize driver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://www.reddit.com/user/UoPeople09/comments/wlt4qj/what_made_you_apply_at_uopeople/'
wait = WebDriverWait(driver, 5)
driver.get(url)
# Wait for the reject-cookies button and click it
reject_cookies_button = wait.until(EC.presence_of_element_located((By.XPATH, "(//section[@class='_2BNSty-Ld4uppTeWGfEe8r']//button)[2]")))
reject_cookies_button.click()
# Scroll until the end of the page
while True:
    height_before_scroll = driver.execute_script('return document.body.scrollHeight')
    driver.execute_script('window.scrollTo(100, document.body.scrollHeight);')
    time.sleep(2)
    if driver.execute_script('return document.body.scrollHeight') == height_before_scroll:
        break
# Collect all the comment elements
comments = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//div[contains(@class, 'Comment')]")))
# Take a screenshot of every comment and save it
u = 1
for comment in comments:
    driver.execute_script("arguments[0].scrollIntoView();", comment)
    comment.screenshot(f'./shot{u}.png')
    u += 1
I hope the comments in the code help you understand what is happening.
My code was written for Linux, but you only need to initialize the driver with the chromedriver for your own platform.
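For example, if your chromedriver is not on the PATH (as in the Windows setup from the question), the driver line could look like this on Selenium 4; this is just a sketch, and the path is the one taken from the question:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# path taken from the question; replace with wherever your chromedriver lives
service = Service(r'C:\Selenium_Drivers\chromedriver.exe')
driver = webdriver.Chrome(service=service)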
To get a screenshot of each comment, you need to identify the comment elements, scroll to each one, and then take the screenshot.
This approach works for me.
url='https://www.reddit.com/user/UoPeople09/comments/wlt4qj/what_made_you_apply_at_uopeople/'
driver.get(url)
# Dismiss the cookie consent button
WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.XPATH,"//button[contains(.,'Reject non-essential')]"))).click()
#Get all the comments
comments = driver.find_elements(By.CSS_SELECTOR, "[data-testid='comment_author_link']")
print(len(comments))
for i in range(len(comments)):
    # Scroll to each comment
    comments[i].location_once_scrolled_into_view
    time.sleep(2)  # slow down the script so the screenshot can be taken
    driver.save_screenshot(f'E:\WEB SCRAPING PROJECTS\PROJECTS\Reddit Scraping\shot{i+1}.png')
Note: you already have all the libraries you need; you only have to add an import for the time library.
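For completeness, here is a minimal preamble this snippet assumes: essentially the imports already present in the question plus time, and the same driver setup. The chromedriver path is the one from the question, so adjust it to your machine:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# same driver setup as in the question; adjust the path to your chromedriver
driver = webdriver.Chrome(executable_path='C:\\Selenium_Drivers\\chromedriver.exe')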

Why is my Selenium code returning only half of the data I requested

Recently, I wrote a Selenium web scraper that is meant to extract all the information from a table containing data on all presidential elections that have been held in the United States. The table is on this Wikipedia page.
The problem is that the code returns all the info I need when I write the result into a .txt file. But anytime I try to print that same result in my text editor, it returns only half of the data I need. I do not understand what the problem is. Can someone help me out?
Here is my code.
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas
# using selenium and chromedriver to extract the javascript wikipage
scrape_options = Options()
scrape_options.add_argument('--headless')
driver = webdriver.Chrome(r'web scraping master/chromedriver', options=scrape_options)
page_info = driver.get('https://en.wikipedia.org/wiki/United_States_presidential_election')
# waiting for the javascript to load
try:
    WebDriverWait(driver,10).until(EC.presence_of_element_located((By.CLASS_NAME,"wikitable")))
finally:
    page = driver.page_source
    soup = BeautifulSoup(page, 'html.parser')
    table = soup.find('table', {'class': 'wikitable sortable jquery-tablesorter'})
    print(table)

with open("loge.txt","w") as f:  # Only part I added to the code
    f.write(str(table))
I'm not really sure what the problem was, but this works as expected. I changed loge.txt to loge.html and the code dumps the entire table.
Mind trying this?
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
scrape_options = Options()
scrape_options.add_argument('--headless')
driver = webdriver.Chrome(options=scrape_options)
page_info = driver.get('https://en.wikipedia.org/wiki/United_States_presidential_election')
try:
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "wikitable")))
finally:
    page = driver.page_source
    soup = BeautifulSoup(page, 'html.parser')
    table = soup.find('table', {'class': 'wikitable sortable jquery-tablesorter'})

with open("loge.html", "w") as f:
    f.write(str(table))
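Since the question already imports pandas, another option is to turn the extracted table into a DataFrame instead of dumping raw HTML. This is only a sketch (not verified against that exact page, and the output file name is illustrative); pandas.read_html needs lxml or html5lib installed:
import pandas as pd

# parse the table that BeautifulSoup already found into a DataFrame;
# read_html returns a list of DataFrames, one per <table> in the string
df = pd.read_html(str(table))[0]
print(df.head())
df.to_csv('elections.csv', index=False)  # hypothetical output file name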

How do I scrape food menu from zomato page?

I am trying to scrape food menu data from Zomato using Selenium. While inspecting the elements, I can find the class 'category_heading', but using it in the code gives no result and returns an empty list. I am attaching a snippet of the code. Thanks.
I have tried using browser.find_element_by_xpath as well as find_element_by_class_name and by tag, but nothing seems to work.
order_now = browser.find_element_by_xpath("//*[@id='orig-search-list']/div[1]/div[2]/a").click()
browser.maximize_window()
browser.implicitly_wait(20)
food_item = browser.find_elements_by_class_name("category_heading")
print('food',food_item)
I need the food menu data so that I can store it in a csv.
The page can be slow to load. Try using a wait condition:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as bs
d = webdriver.Chrome()
d.get('https://www.zomato.com/bangalore/burgers-kingdom-indiranagar-bangalore/order')
rows = WebDriverWait(d, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".row")))
soup = bs(d.page_source, 'lxml')
for category in soup.select('.category-container'):
    title = category.select_one('h3').text
    print(title)
    items = [i.text for i in category.select('.category_heading')]
    if items:
        print(items)
    else:
        print('No sub-headers')
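Since the stated goal was a CSV, here is a small sketch of how the parsed categories and items could be written out with the standard library. The file name and column headers are just illustrative:
import csv

with open('menu.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['category', 'item'])  # illustrative header row
    for category in soup.select('.category-container'):
        title = category.select_one('h3').text
        for item in category.select('.category_heading'):
            writer.writerow([title, item.text])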

How to extract hidden li text

I am trying to scrape from this website, jump to each href article, and scrape the comments located right after the main body text. However, I am getting blank results. I've also tried fetching all li elements with soup.find_all('li') to check whether any comments exist, and found that even extracting all li did not return any comments about the article. Can anyone advise, please? I suspect the website is making it harder to get the text.
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
urls = [
'https://hypebeast.com/brands/jordan-brand'
]
with requests.Session() as s:
    for url in urls:
        driver = webdriver.Chrome('/Users/Documents/python/Selenium/bin/chromedriver')
        driver.get(url)
        products = [element for element in WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.XPATH, "//div[@class='post-box ']")))]
        soup = bs(driver.page_source, 'lxml')
        element = soup.select('.post-box ')
        time.sleep(1)
        ahref = [item.find('a')['href'] for item in element]
        results = list(zip(ahref))
        df = pd.DataFrame(results)
        for result in results:
            res = driver.get(result[0])
            soup = bs(driver.page_source, 'lxml')
            time.sleep(6)
            comments_href = soup.find_all('ul', {'id': 'post-list'})
            print(comments_href)
The posts/comments are in an <iframe> tag. The tag also has a dynamic name attribute that starts with dsq-app. So what you'll need to do is locate that iframe, switch to it, and then you can parse. I chose to use BeautifulSoup to pull out the script tag, read it in as JSON, and navigate through there. This should hopefully get you going with pulling what you're looking for:
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
import json
urls = [
'https://hypebeast.com/brands/jordan-brand'
]
with requests.Session() as s:
    for url in urls:
        driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
        driver.get(url)
        products = [element for element in WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.XPATH, "//div[@class='post-box ']")))]
        soup = bs(driver.page_source, 'lxml')
        element = soup.select('.post-box ')
        time.sleep(1)
        ahref = [item.find('a')['href'] for item in element]
        results = list(zip(ahref))
        df = pd.DataFrame(results)
        for result in ahref:
            driver.get(result)
            time.sleep(6)
            iframe = driver.find_element_by_xpath('//iframe[starts-with(@name, "dsq-app")]')
            driver.switch_to.frame(iframe)
            soup = bs(driver.page_source, 'html.parser')
            scripts = soup.find_all('script')
            for script in scripts:
                if 'response' in script.text:
                    jsonStr = script.text
                    jsonData = json.loads(jsonStr)
                    for each in jsonData['response']['posts']:
                        author = each['author']['username']
                        message = each['raw_message']
                        print('%s: %s' % (author, message))
Output:
annvee: Lemme get them BDSM jordans fam
deathb4designer: Lmao
zenmasterchen: not sure why this model needed to exist in the first place
Spawnn: Issa flop.
disqus_lEPADa2ZPn: looks like an AF1
Lekkerdan: Hoodrat shoes.
rubnalntapia: Damn this are sweet
marcellusbarnes: Dope, and I hate Jordan lows
marcellusbarnes: The little jumpman on the back is dumb
chickenboihotsauce: copping those CPFM gonna be aids
lowercasegod: L's inbound
monalisadiamante: Sold out in 4 minutes. 😑
nickpurita: Those CPFM’s r overhyped AF.
...
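If you want to keep the comments rather than just print them, a small sketch of how they could be collected into the pandas DataFrame the script already imports; the column and file names here are only illustrative. The inner loop would replace the print loop above, with the list and the DataFrame/CSV lines sitting outside the URL loop:
# collect (author, message) pairs instead of printing them
all_comments = []  # declare this before the "for result in ahref" loop
for each in jsonData['response']['posts']:
    all_comments.append({'author': each['author']['username'],
                         'message': each['raw_message']})

df_comments = pd.DataFrame(all_comments)
df_comments.to_csv('comments.csv', index=False)  # hypothetical output file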

Beautiful Soup / urllib does not read some HTML tags

I recently started using Beautiful Soup. For practice I'm trying to scrape this website.
There are some div tags that the scraper does not seem to be able to access; even the URL reader does not seem to read these div tags. The HTML does not indicate that it is using JavaScript for the part that is not being read, so theoretically I assume Selenium is not needed. Specifically, the div tags under an element with the ID "ajaxTarget" are the concern. The code below returns some elements, but the majority of the div tags under this specific element are not read.
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
def main():
    url_link = 'https://www.countryflags.com/en/'
    req = Request(url_link, headers={'User-Agent': 'Mozilla/5.0'})
    page = urlopen(req).read()
    soup = BeautifulSoup(page, features='lxml')
    div_master_container = soup.findAll('div', attrs={'id': 'ajaxTarget'})
    print(len(div_master_container))
    for item in div_master_container:
        print(item)

if __name__ == '__main__':
    main()
I would appreciate it if someone could point out whether there is an HTML element I am missing or any other factor contributing to this issue.
JavaScript needs to run on the page. Use a method like Selenium:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url ='https://www.countryflags.com/en/'
driver = webdriver.Chrome()
driver.get(url)
items = [[item.get_attribute('href'),item.get_attribute('title')] for item in WebDriverWait(driver,30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#ajaxTarget [title]")))]
print(items)
print(len(items))
#driver.quit()
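If you prefer to stay closer to the BeautifulSoup approach from the question, here is a small sketch of the same idea: let Selenium render the page first (the explicit wait above ensures the ajaxTarget content is present) and only then hand the source to Beautiful Soup. This assumes the wait above has already run in the same session:
from bs4 import BeautifulSoup

# parse the fully rendered source rather than the raw urllib response
soup = BeautifulSoup(driver.page_source, 'lxml')
ajax_target = soup.find('div', attrs={'id': 'ajaxTarget'})
if ajax_target is not None:
    # the child divs that were missing with urllib should now be present
    print(len(ajax_target.find_all('div', recursive=False)))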
