from bs4 import BeautifulSoup
from selenium import webdriver
import pdfkit
import time
import logging

driver = webdriver.Chrome()
driver.get('https://www.linkedin.com/')
time.sleep(5)
driver.get('https://www.linkedin.com/in/ankurkhandelwal1/')
time.sleep(20)
soup = BeautifulSoup(driver.page_source, 'lxml')
elem = driver.find_element_by_css_selector(".contact-see-more-less.link-without-visited-state")
elem.click()
for div in soup.select('.pv-contact-info__ci-container'):
    for link in soup.find_all('a', {'class': 'pv-contact-info__contact-link Sans-15px-black-55%'}):
        old = link.get('href')
        mobile = old.replace("tel:", " ")
        print(mobile)
elem.click() works, but after that line the program doesn't continue, and mobile prints blank. When I remove that line and click manually instead, it works.
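A likely cause (not confirmed by the original poster): soup is built from driver.page_source before elem.click(), so the contact details revealed by the click never end up in the parsed HTML. Below is a minimal sketch that clicks first, waits for the contact overlay, and only then re-parses the page source; the selectors are the ones from the question and may have changed, and it assumes a logged-in session and the Selenium 4 By API.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.linkedin.com/in/ankurkhandelwal1/')
wait = WebDriverWait(driver, 20)

# Open the "Contact info" overlay, then wait until its container is present.
wait.until(EC.element_to_be_clickable(
    (By.CSS_SELECTOR, ".contact-see-more-less.link-without-visited-state"))).click()
wait.until(EC.presence_of_element_located(
    (By.CSS_SELECTOR, ".pv-contact-info__ci-container")))

# Re-parse the page source *after* the click so the overlay HTML is included.
soup = BeautifulSoup(driver.page_source, 'lxml')
for link in soup.select('a[href^="tel:"]'):
    print(link['href'].replace('tel:', ''))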
Related
I'm having trouble web scraping with BeautifulSoup and Selenium. I want to pull data from pages 1-20, but the data that is successfully pulled only goes up to page 10. The last page could even be beyond 20, yet my code only pulls 10 pages. Does anyone understand this problem, so that I can pull a lot of data without a page limit?
options = webdriver.ChromeOptions()
options.add_argument('-headless')
options.add_argument('-no-sandbox')
options.add_argument('-disable-dev-shm-usage')
driver = webdriver.Chrome('chromedriver', options=options)

apartment_urls = []
try:
    for page in range(1, 20):
        print(f"Extraction Page# {page}")
        page = "https://www.99.co/id/sewa/apartemen/jakarta?kamar_tidur_min=1&kamar_tidur_maks=4&kamar_mandi_min=1&kamar_mandi_maks=4&tipe_sewa=bulanan&hlmn=" + str(page)
        driver.get(page)
        time.sleep(5)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        apart_info_list = soup.select('h2.search-card-redesign__address a[href]')
        for link in apart_info_list:
            get_url = '{0}{1}'.format('https://www.99.co', link['href'])
            print(get_url)
            apartment_urls.append(get_url)
except:
    print("Good Bye!")
This is the output of the code. For pages 10, 11, 12, and so on, I can't get the data.
With the following approach, pagination works fine without a page limit.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://www.99.co/id/sewa/apartemen/jakarta?kamar_tidur_min=1&kamar_tidur_maks=4&kamar_mandi_min=1&kamar_mandi_maks=4&tipe_sewa=bulanan')
time.sleep(5)
driver.maximize_window()

while True:
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    apart_info_list = soup.select('h2.search-card-redesign__address a')
    for link in apart_info_list:
        get_url = '{0}{1}'.format('https://www.99.co', link['href'])
        print(get_url)

    # find_elements returns an empty list on the last page instead of raising
    next_button = driver.find_elements(By.CSS_SELECTOR, 'li.next > a')
    if next_button:
        next_button[0].click()
        time.sleep(3)
    else:
        break
Note: the snippet above uses webdriverManager (pip install webdriver-manager), so you don't have to download the ChromeDriver binary yourself.
Alternative solution: as the next page URL isn't dynamic, building and loading it directly also works fine.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://www.99.co/id/sewa/apartemen/jakarta?kamar_tidur_min=1&kamar_tidur_maks=4&kamar_mandi_min=1&kamar_mandi_maks=4&tipe_sewa=bulanan')
time.sleep(5)
driver.maximize_window()

while True:
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    apart_info_list = soup.select('h2.search-card-redesign__address a')
    for link in apart_info_list:
        get_url = '{0}{1}'.format('https://www.99.co', link['href'])
        print(get_url)

    # Instead of clicking the next button, build the next page URL from its href
    # next_button = driver.find_element(By.CSS_SELECTOR, 'li.next > a')
    # if next_button:
    #     next_button.click()
    #     time.sleep(3)
    next_page = soup.select_one('li.next > a')
    if next_page:
        next_page = f"https://www.99.co{next_page['href']}"
        driver.get(next_page)
        time.sleep(3)
    else:
        break
I am trying to extract the text inside a span tag (identified by its id), but I'm getting a blank output. I have also tried using the parent div element's text, but I failed to extract it. Can anyone help? Below is my code.
import requests
from bs4 import BeautifulSoup
r = requests.get('https://www.paperplatemakingmachines.com/')
soup = BeautifulSoup(r.text,'lxml')
mob = soup.find('span',{"id":"tollfree"})
print(mob.text)
I want the text inside that span, which contains the mobile number.
You'll have to use Selenium, as that text is not present in the initial response, or at least not without searching through the <script> tags.
from bs4 import BeautifulSoup
from selenium import webdriver
import time

driver = webdriver.Chrome(r'C:\chromedriver_win32\chromedriver.exe')
url = 'https://www.paperplatemakingmachines.com/'
driver.get(url)

# It's better to use Selenium's WebDriverWait, but I'm still learning how to use that correctly
time.sleep(5)

soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.close()

mob = soup.find('span', {"id": "tollfree"})
print(mob.text)
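As the comment in the snippet notes, an explicit wait is more robust than a fixed sleep. Here is a minimal sketch of what that could look like, polling until the span with id tollfree (the id from the question) actually contains non-empty text:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome()
driver.get('https://www.paperplatemakingmachines.com/')

# Poll until the script has filled the span with a non-empty number (up to 15 seconds).
number = WebDriverWait(driver, 15).until(
    lambda d: d.find_element(By.ID, 'tollfree').text.strip()
)
print(number)
driver.quit()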
The data is actually rendered dynamically by a script. What you need to do is parse the data out of that script:
import requests
import re
from bs4 import BeautifulSoup

r = requests.get('https://www.paperplatemakingmachines.com/')
soup = BeautifulSoup(r.text, 'lxml')
script = soup.find('script')
mob = re.search("(?<=pns_no = \")(.*)(?=\";)", script.text).group()
print(mob)
Another way of using a regex to find the number:
import requests
import re
from bs4 import BeautifulSoup as bs

r = requests.get('https://www.paperplatemakingmachines.com/')
soup = bs(r.content, 'lxml')
pattern = re.compile(r'var pns_no = "(\d+)"')
data = soup.find('script', text=pattern).text
number = pattern.findall(data)[0]
print('+91-' + number)
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
import pandas as pd
from pytube import YouTube

browser = webdriver.Chrome("C:/Users/Downloads/chromedriver_win32/chromedriver.exe")
browser.get("https://www.youtube.com/channel/UCaKt8dvEIPnEHWSbLYhzrxg/videos")
time.sleep(1)
elem = browser.find_element_by_tag_name("body")

no_of_pagedowns = 100
while no_of_pagedowns:
    elem.send_keys(Keys.PAGE_DOWN)
    time.sleep(0.2)
    no_of_pagedowns -= 1

html = browser.page_source
soup = BeautifulSoup(html, "lxml")
tags = soup.find_all('a')

fname = "C:/Stock_in_CFD/Output.txt"
text_file = open(fname, "w+", encoding="utf-8")
for tag in tags:
    t = tag.get('href')
    text_file.write(t)
When I run the above code, I get this error:
TypeError: write() argument must be str, not None
When I don't use Selenium, it works. I am using Selenium because I want to scroll down the entire page before parsing it with BeautifulSoup.
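The error suggests that some of the <a> tags have no href attribute, so tag.get('href') returns None and write() fails. A minimal sketch of the write loop that skips those anchors; the '/watch' filter for keeping only video links is an assumption about YouTube's URL format:
for tag in tags:
    t = tag.get('href')
    if t is None:  # anchors without an href attribute
        continue
    if t.startswith('/watch'):  # assumed pattern for video links; drop this check to keep every href
        text_file.write(t + '\n')
text_file.close()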
from bs4 import BeautifulSoup
from selenium import webdriver
import pdfkit
import time
from pdfrw import PdfWriter

driver = webdriver.Chrome()
driver.get('https://www.linkedin.com/in/ankurkhandelwal1/')
time.sleep(40)
soup = BeautifulSoup(driver.page_source, 'lxml')
for div in soup.select('.pv-contact-info__ci-container'):
    for link in soup.find_all('a', href=True):
        href = link.get('href')
        print(href)
I want to print only the user's mobile number and email id, but this prints many unnecessary lines. How do I fetch exactly the email and mobile number?
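One option is to filter the hrefs by their scheme: the phone number appears as a tel: link (as in the first snippet above), and the email is presumably a mailto: link. A minimal sketch under that assumption:
soup = BeautifulSoup(driver.page_source, 'lxml')
email = None
mobile = None
for link in soup.find_all('a', href=True):
    href = link['href']
    if href.startswith('mailto:'):   # assumed scheme for the email link
        email = href[len('mailto:'):]
    elif href.startswith('tel:'):    # scheme used for the phone link in the question
        mobile = href[len('tel:'):]
print('Email:', email)
print('Mobile:', mobile)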
I am trying to extract the comments on this website.
[Screenshot: inspecting the comments element]
I tried using urllib for this purpose, but to no avail. Then I realized that enabling JavaScript is necessary, so I used Selenium and PhantomJS to extract the comments, as can be seen in the following Python 3 code:
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.PhantomJS('phantomjs')
url='http://www.faz.net/aktuell/politik/inland/sterbehilfe-im-bundestag-unmoralisches-angebot-13887916.html'
driver.get(url)
htm_doc = driver.page_source
soup = BeautifulSoup(htm_doc, 'html.parser')
print (soup.find('div', attrs={'id','lesermeinungen'}))
Since the comments load while the page loads, I simply access the source and check whether there are any comments under the id 'lesermeinungen', as this is the section that appears when I inspect the comments section.
However, the result is None.
UPDATE
I tried the following code:
from bs4 import BeautifulSoup
import selenium.webdriver.support.ui as ui
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

driver = webdriver.PhantomJS('phantomjs')
url = 'http://www.faz.net/aktuell/politik/inland/sterbehilfe-im-bundestag-unmoralisches-angebot-13887916.html'
driver.get(url)
wait = ui.WebDriverWait(driver, 3)
try:
    wait.until(driver.find_element_by_id('lesermeinungen'))
    htm_doc = driver.page_source
    soup = BeautifulSoup(htm_doc, 'html.parser')
    print (soup.find('div', attrs={'id','lesermeinungen'}))
except TimeoutException:
    print ("Loading took too much time!")
No result even after 2 hours
You have a typo when searching for the element with BeautifulSoup. Instead of
print (soup.find('div', attrs={'id','lesermeinungen'}))
it should be a colon, not a comma
print (soup.find('div', attrs={'id' : 'lesermeinungen'}))
With this correction your first example worked for me.
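If you still want the explicit wait from the update: wait.until() expects a condition callable that it evaluates repeatedly, not the element returned by find_element_by_id. A minimal sketch using expected_conditions, keeping the question's PhantomJS driver:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

driver = webdriver.PhantomJS('phantomjs')
driver.get('http://www.faz.net/aktuell/politik/inland/sterbehilfe-im-bundestag-unmoralisches-angebot-13887916.html')
try:
    # Pass a condition, not an element; it is re-evaluated until it succeeds or times out.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, 'lesermeinungen'))
    )
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    print(soup.find('div', attrs={'id': 'lesermeinungen'}))
except TimeoutException:
    print("Loading took too much time!")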