from bs4 import BeautifulSoup
from selenium import webdriver
import pdfkit
import time
from pdfrw import PdfWriter
driver = webdriver.Chrome()
driver.get('https://www.linkedin.com/in/ankurkhandelwal1/')
time.sleep(40)
soup = BeautifulSoup(driver.page_source, 'lxml')
for div in soup.select('.pv-contact-info__ci-container'):
    for link in soup.find_all('a', href=True):
        href = link.get('href')
        print(href)
I want to print only the mobile number and email ID of the user, but this prints many unnecessary lines. How do I fetch exactly the email and mobile number?
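One approach, as a sketch (it assumes LinkedIn renders the contact links with mailto: and tel: URL schemes, which may change): search within each contact container rather than the whole soup, and filter hrefs by scheme.
for div in soup.select('.pv-contact-info__ci-container'):
    for link in div.find_all('a', href=True):  # search inside the container, not the whole page
        href = link['href']
        if href.startswith('mailto:'):
            print('Email:', href[len('mailto:'):])
        elif href.startswith('tel:'):
            print('Mobile:', href[len('tel:'):])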
Related
I am trying to extract the text inside a span tag by its id, but I get blank output.
I have also tried using the parent div element's text, but failed to extract it. Can anyone help?
Below is my code.
import requests
from bs4 import BeautifulSoup
r = requests.get('https://www.paperplatemakingmachines.com/')
soup = BeautifulSoup(r.text,'lxml')
mob = soup.find('span',{"id":"tollfree"})
print(mob.text)
I want the text inside that span, which is the mobile number.
You'll have to use Selenium, as that text is not present in the initial request, or at least not without searching through <script> tags.
from bs4 import BeautifulSoup
from selenium import webdriver
import time
driver = webdriver.Chrome(r'C:\chromedriver_win32\chromedriver.exe')
url = 'https://www.paperplatemakingmachines.com/'
driver.get(url)
# It's better to use Selenium's WebDriverWait, but I'm still learning how to use that correctly
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.close()
mob = soup.find('span', {"id": "tollfree"})
print(mob.text)
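Regarding the WebDriverWait comment above, a sketch of what that could look like in place of the time.sleep(5) call (same driver as in the code above):
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
# Wait up to 10 seconds for the tollfree span to appear, instead of sleeping blindly
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, 'tollfree'))
)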
The data is actually rendered dynamically through a script. What you need to do is parse the data from the script:
import requests
import re
from bs4 import BeautifulSoup
r = requests.get('https://www.paperplatemakingmachines.com/')
soup = BeautifulSoup(r.text, 'lxml')
script = soup.find('script')
# Lookbehind/lookahead capture the value assigned to pns_no in the script
mob = re.search("(?<=pns_no = \")(.*)(?=\";)", script.text).group()
print(mob)
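Note that the lookbehind pattern is anchored to the exact pns_no = " spacing in the page source; if the site changes that formatting, re.search will return None and .group() will raise, so it may be worth checking the match before using it.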
Another way is to use a regex to find the number:
import requests
import re
from bs4 import BeautifulSoup as bs
r = requests.get('https://www.paperplatemakingmachines.com/')
soup = bs(r.content, 'lxml')
# Find the <script> tag whose text matches the pattern, then pull out the digits
pattern = re.compile(r'var pns_no = "(\d+)"')
data = soup.find('script', text=pattern).text
number = pattern.findall(data)[0]
print('+91-' + number)
I am trying to scrape info from a website (program name and program ID) and it is returning an empty list.
I am not sure if I am mixing up the syntax, but this is what I have:
soup.find_all('h3', class_='ama__h3')
The website link is https://freida.ama-assn.org/Freida/#/programs?program=residencies&specialtiesToSearch=140
from urllib.request import urlopen
from bs4 import BeautifulSoup as BS
import pandas as pd
from urllib.parse import urlparse, urlsplit
import requests
res = requests.get('https://freida.ama-assn.org/Freida/#/programs?program=residencies&specialtiesToSearch=140')
soup = BS(res.text, 'html5lib')
print(soup.prettify())
soup.find_all('h3', class_='ama__h3')
Your error is because you are parsing with html5lib. For well-formed HTML, the parser choice is not really important. However, for non-well-formed HTML (like this page), html5lib seems to have issues. You should use html.parser or lxml (apparently html.parser is safer).
This code does what you want:
soup = BeautifulSoup(res.text, 'html.parser')
programs = soup.find_all("a", class_='ama__promo--background')
for program in programs:
    program_name = program.find("h3").text
    program_id = program.find_all("small")[-1].text.split(': ')[1].strip()
    print(program_name, program_id)
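To check the parser claim for yourself, a quick comparison sketch (it assumes lxml and html5lib are installed, and reuses the res object from above):
from bs4 import BeautifulSoup
# Count how many target anchors each parser recovers from the same response
for parser in ('html.parser', 'lxml', 'html5lib'):
    s = BeautifulSoup(res.text, parser)
    print(parser, len(s.find_all('a', class_='ama__promo--background')))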
I want to scrape the Daily Observation table from this URL: https://www.wunderground.com/history/daily/in/chennai/VOMM/date/2017-1-1
I want to use the table id for scraping. I am using this code:
from bs4 import BeautifulSoup
import requests
import lxml
url = 'https://www.wunderground.com/history/daily/in/chennai/VOMM/date/2017-1-1'
content = requests.get(url).content
soup = BeautifulSoup(content, 'lxml')
table = soup.find('table', {'id' : 'history-observation-table'})
print(table)
But this returns None. How can I scrape the table?
It is a dynamic page; you can use the JSON data from a URL like
https://api.weather.com/v1/geocode/12.99361134/80.17694092/observations/historical.json?apiKey=*********&startDate=20170101&endDate=20170101&units=e
You can see the real API key in the browser Console -> Network tab.
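A minimal sketch of that approach (the YOUR_API_KEY placeholder must be replaced with the value seen in the Network tab, and the 'observations' key is an assumption about the payload layout):
import requests
url = ('https://api.weather.com/v1/geocode/12.99361134/80.17694092/'
       'observations/historical.json?apiKey=YOUR_API_KEY'
       '&startDate=20170101&endDate=20170101&units=e')
data = requests.get(url).json()
# 'observations' is assumed to be the top-level key; adjust if the payload differs
for obs in data.get('observations', []):
    print(obs)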
Or use Selenium:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
driver = webdriver.Chrome()
driver.get("https://www.wunderground.com/history/daily/in/chennai/VOMM/date/2017-1-1")
# Wait up to 15 seconds for the dynamically rendered table to appear
table = WebDriverWait(driver, 15).until(lambda d: d.find_element_by_id('history-observation-table'))
print(table.text)
from bs4 import BeautifulSoup
from selenium import webdriver
import pdfkit
import time
import logging
driver = webdriver.Chrome()
driver.get('https://www.linkedin.com/')
time.sleep(5)
driver.get('https://www.linkedin.com/in/ankurkhandelwal1/')
time.sleep(20)
soup = BeautifulSoup(driver.page_source, 'lxml')
elem = driver.find_element_by_css_selector(".contact-see-more-less.link-without-visited-state")
elem.click()
for div in soup.select('.pv-contact-info__ci-container'):
    for link in soup.find_all('a', {'class': 'pv-contact-info__contact-link Sans-15px-black-55%'}):
        old = link.get('href')
        mobile = old.replace("tel:", " ")
        print(mobile)
elem.click() works, but after this line the program doesn't proceed and mobile prints blank. When I remove this line and click manually, it works.
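A possible cause, sketched below (assuming the click expands a contact pane that is only then added to the DOM): page_source is captured before elem.click(), so the expanded contact info never reaches the soup. Re-parsing after the click, with a wait for the element to be clickable, might help:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
elem = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, ".contact-see-more-less.link-without-visited-state"))
)
elem.click()
time.sleep(2)  # give the expanded pane a moment to render
soup = BeautifulSoup(driver.page_source, 'lxml')  # re-parse AFTER the click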
This is the code I wrote to fetch data from a webpage:
import urllib.request
import urllib
from bs4 import BeautifulSoup
def make_soup(url):
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    thepage = urllib.request.urlopen(req)
    soupdata = BeautifulSoup(thepage, 'html5lib')
    return soupdata
soup = make_soup("https://www.nseindia.com/live_market/dynaContent/live_analysis/top_gainers_losers.htm?cat=G")
t = soup.findAll('table')[0]
for record in t.findAll('tr'):
    print(record.td.text)
'''
for record in t.findAll('tr'):
    for data in record.findAll('td'):
        print(data.text)
'''
But this code fetches only the first tr. How do I get the values for the remaining tr elements?
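The commented-out nested loop is the right idea; a sketch that also guards against header rows without td cells (reusing the t variable from above):
rows = []
for record in t.findAll('tr'):
    cells = [data.text.strip() for data in record.findAll('td')]
    if cells:  # header rows contain th, not td, and would otherwise come back empty
        rows.append(cells)
for row in rows:
    print(row)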