Some data is not getting scraped with Python Selenium web scraping (python-3.x)

I am scraping booking.com and collecting the hotel name, rating, hotel description, number of reviews, address, category (whether it is a hotel or a hostel) and location.
My questions:
1) I scraped 30 pages, and by the end I had missing values in the number of reviews, ratings and category. When I went back and checked the site, the values for these fields are all available, so I am not sure why they are not being captured. Please advise.
2) There are no missing values if I scrape an individual page. What could be the reason?
# Importing necessary libraries
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.firefox.options import Options
import pandas as pd
import time
import re
import requests
from itertools import zip_longest
from webdriver_manager.chrome import ChromeDriverManager

category = []
name = []
address = []
reviews = []
review_title = []
ratings = []
description = []
facilit = []

driver = webdriver.Chrome(ChromeDriverManager().install())
for pageno in range(0, 750, 25):
    print(pageno)
    #driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.get("https://www.booking.com/searchresults.en-gb.html?aid=397642&label=gog235jc-1FCAEoggI46AdIM1gDaK4BiAEBmAEJuAEXyAEM2AEB6AEB-AEMiAIBqAIDuALm3eDyBcACAQ&sid=422b3ff3c0e98b522259ad1cad2505ea&tmpl=searchresults&ac_click_type=b&ac_position=0&checkin_month=7&checkin_monthday=1&checkin_year=2020&checkout_month=7&checkout_monthday=15&checkout_year=2020&class_interval=1&dest_id=-1506909&dest_type=city&dtdisc=0&from_sf=1&group_adults=2&group_children=0&iata=AKL&inac=0&index_postcard=0&label_click=undef&no_rooms=1&postcard=0&raw_dest_type=city&room1=A%2CA&sb_price_type=total&search_selected=1&shw_aparth=1&slp_r_match=0&src=index&src_elem=sb&srpvid=0da61131394c0103&ss=Auckland%2C%20Auckland%20Region%2C%20New%20Zealand&ss_all=0&ss_raw=Auckland&ssb=empty&sshis=0&top_ufis=1&rows=25&offset=0" + str(pageno))
    time.sleep(1)
    summaryItems = driver.find_elements_by_xpath("//a[contains(@class, 'hotel_name_link url')]")
    job_links = [summaryItem.get_attribute("href") for summaryItem in summaryItems]
    for job_link in job_links:
        driver.get(job_link)
        time.sleep(1)
        try:
            job_title = driver.find_element_by_xpath("//*[@class='hp__hotel-type-badge']").text
            category.append(job_title)
        except:
            job_title = "None"
        try:
            hotel = driver.find_element_by_id('hp_hotel_name').text.strip('Hotel')
            name.append(hotel)
        except:
            hotel = "None"
        try:
            add = driver.find_element_by_id('showMap2').find_element_by_class_name('hp_address_subtitle').text
            address.append(add)
        except:
            add = "None"
        try:
            reviews = driver.find_element_by_class_name('bui-review-score--end').find_element_by_class_name('bui-review-score__text').text
            review_title.append(reviews)
        except:
            reviews = "None"
        try:
            rating = driver.find_element_by_class_name('bui-review-score--end').find_element_by_class_name('bui-review-score__badge').text
            ratings.append(rating)
        except:
            rating = "None"
        try:
            desc = driver.find_element_by_xpath("//div[@id='property_description_content']").text
            description.append(desc)
        except:
            rating = "None"
driver.close()

# Converting all the details into a dataframe and csv file
final = []
for item in zip_longest(name, address, review_title, ratings, description, category):
    final.append(item)
df4 = pd.DataFrame(
    final, columns=['Hotel_name', 'Address', 'Number_of_review', 'Ratings', 'Description', 'Category'])
#df.to_csv('booked.csv')
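A hedged sketch of one way to keep the lists aligned (not from the original post): every list should grow by exactly one entry per hotel page, so the except blocks would append a placeholder instead of only assigning it. The safe_text helper below is hypothetical and only shows the pattern for two of the fields:

# Hypothetical helper: always return a value, so each list gets exactly one
# entry per hotel page even when an element is missing.
def safe_text(find, default="None"):
    try:
        return find().text
    except Exception:
        return default

for job_link in job_links:
    driver.get(job_link)
    time.sleep(1)
    category.append(safe_text(
        lambda: driver.find_element_by_xpath("//*[@class='hp__hotel-type-badge']")))
    name.append(safe_text(
        lambda: driver.find_element_by_id('hp_hotel_name')))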

Related

Iterate over a list of items, extract the data for each from the web browser, and append to a data frame as the final output

I am trying to extract stock market data from the web browser. I am able to open the browser and extract the data for one stock.
Below is the Python code for one stock, which opens the browser with Selenium WebDriver and extracts the data from the web page using BeautifulSoup.
This is very basic code which needs to be simplified and extended to extract the data for a list of stocks like the one below:
stock_list = ['Infosys', 'Reliance industries', 'wipro']
I am not sure how to extract the data for multiple items in the list as mentioned above, or how to simplify the code.
Python code to extract the data for one stock:
import requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.webdriver.support import expected_conditions as EC

headers = {'User-Agent': 'Mozilla/5.0'}
browser = webdriver.Firefox()
browser.get("https://www.tickertape.in/stocks/")
browser.maximize_window()
inputElement = browser.find_element_by_id('search-stock-input')
inputElement.click()
inputElement.send_keys('Infosys')
inputElement.click()
inputElement = wait(browser, 5).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#search-stock-input")))
inputElement.click()
inputElement.send_keys(Keys.RETURN)

page = requests.get(browser.current_url, headers=headers)

from bs4 import BeautifulSoup
soup = BeautifulSoup(page.text, 'html.parser')

ScriptName = []
ScriptName_elem = soup.find_all(class_='jsx-2256451 security-name')
for item in ScriptName_elem:
    ScriptName.append(item.text)

intrinsic_value = []
intrinsic_value_elem = soup.find_all(class_='jsx-3277407410 jsx-1058798148 lh-138 text-13 commentary-desc')
for item in intrinsic_value_elem:
    intrinsic_value.append(item.text)

Returns_vs_FD_rates = []
Returns_vs_FD_rates_elem = soup.find_all(class_='jsx-3947392323 jsx-1058798148 lh-138 text-13 commentary-desc')
for item in Returns_vs_FD_rates_elem:
    Returns_vs_FD_rates.append(item.text)

Divident_Returns = []
Divident_Returns_elem = soup.find_all(class_='jsx-566496888 jsx-1058798148 lh-138 text-13 commentary-desc')
for item in Divident_Returns_elem:
    Divident_Returns.append(item.text)

Entry_Point = []
Entry_Point_elem = soup.find_all(class_='jsx-3697483086 jsx-1058798148 lh-138 text-13 commentary-desc')
for item in Entry_Point_elem:
    Entry_Point.append(item.text)

Red_Flag_Indicator = []
Red_Flag_Indicator_elem = soup.find_all(class_='jsx-1920835126 jsx-1058798148 relative no-select tooltip-holder')
for item in Red_Flag_Indicator_elem:
    Red_Flag_Indicator.append(item.text)

Red_Flag_Indicator_Reason = []
Red_Flag_Indicator_Reason_elem = soup.find_all(class_='jsx-1920835126 jsx-1058798148 lh-138 text-13 commentary-desc')
for item in Red_Flag_Indicator_Reason_elem:
    Red_Flag_Indicator_Reason.append(item.text)

df_array = []
for ScriptName_n, intrinsic_value_n, Returns_vs_FD_rates_n, Divident_Returns_n, Entry_Point_n, Red_Flag_Indicator_n, Red_Flag_Indicator_Reason_n in zip(ScriptName, intrinsic_value, Returns_vs_FD_rates, Divident_Returns, Entry_Point, Red_Flag_Indicator, Red_Flag_Indicator_Reason):
    df_array.append({'ScriptName': ScriptName_n, 'intrinsic_value': intrinsic_value_n, 'Returns_vs_FD_rates': Returns_vs_FD_rates_n, 'Divident_Returns': Divident_Returns_n, 'Entry_Point': Entry_Point_n,
                     'Red_Flag_Indicator': Red_Flag_Indicator_n, 'Red_Flag_Indicator_Reason': Red_Flag_Indicator_Reason_n})
df = pd.DataFrame(df_array)
df
Thanks in advance
You can call the same APIs the page does. The first API gets the id and security name for the stock, which are then used with the second API that returns those checklist items.
If you create a list of dictionaries, one dictionary per ticker, you can then convert it to a dataframe at the end. If I have missed an item, let me know. I also chose to store a lot of the other data, e.g. low, high etc., in another dictionary called other_data.
import requests
import pandas as pd

other_data = {}
results = []
stock_list = ['Infosys', 'Reliance industries', 'wipro']

with requests.Session() as s:
    for ticker in stock_list:
        try:
            r = s.get(
                f'https://api.tickertape.in/search?text={ticker.lower()}&types=stock,brands,index,etf,mutualfund').json()
            stock_id = r['data']['stocks'][0]['sid']
            name = r['data']['stocks'][0]['name']
            other_data[stock_id] = r
            r = s.get(
                f'https://api.tickertape.in/stocks/investmentChecklists/{stock_id}?type=basic').json()
            d = {i['title']: i['description'] for i in r['data']}
            d = {**{'Security': name}, **other_data[stock_id]['data']['stocks'][0]['quote'], **{
                'marketCap': other_data[stock_id]['data']['stocks'][0]['marketCap']}, **d}
            results.append(d)
        except Exception as e:
            print(ticker, e)

df = pd.DataFrame(results)
df

How can I scrape football results from Flashscore using Python

Web scraping with Python
I am new to scraping. I want to scrape the Premier League season 2018-19 results (fixtures, results, dates), but I am struggling to navigate the web site. All I get is an empty list / [None]. If you have a solution you can share, that would be a great help.
Here's what I tried:
import pandas as pd
import requests as uReq
from bs4 import BeautifulSoup

url = uReq.get('https://www.flashscore.com/football/england/premier-league-2018-2019/results/')
soup = BeautifulSoup(url.text, 'html.parser')
divs = soup.find_all('div', attrs={'id': 'live-table'})

Home = []
for div in divs:
    anchor = div.find(class_='event__participant event__participant--home')
    Home.append(anchor)
print(Home)
You will have to install requests_html for my solution.
Here is how I would go about it:
from requests_html import AsyncHTMLSession
from collections import defaultdict
import pandas as pd

url = 'https://www.flashscore.com/football/england/premier-league-2018-2019/results/'
asession = AsyncHTMLSession()

async def get_scores():
    r = await asession.get(url)
    await r.html.arender()
    return r

results = asession.run(get_scores)
results = results[0]

times = results.html.find("div.event__time")
home_teams = results.html.find("div.event__participant.event__participant--home")
scores = results.html.find("div.event__scores.fontBold")
away_teams = results.html.find("div.event__participant.event__participant--away")
event_part = results.html.find("div.event__part")

dict_res = defaultdict(list)
for ind in range(len(times)):
    dict_res['times'].append(times[ind].text)
    dict_res['home_teams'].append(home_teams[ind].text)
    dict_res['scores'].append(scores[ind].text)
    dict_res['away_teams'].append(away_teams[ind].text)
    dict_res['event_part'].append(event_part[ind].text)

df_res = pd.DataFrame(dict_res)
This generates a dataframe (df_res) containing the match times, home teams, scores, away teams and event parts.

Web scraping into a pandas DataFrame

Apologies if this has been asked before. I am trying to scrape web reviews into a dataframe. The problem I have is that it scrapes the same review 10 times instead of 10 different reviews.
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.marriott.com/hotels/hotel-reviews/amsnt-amsterdam-marriott-hotel'

for page in range(10):
    page = requests.get("https://www.marriott.com/hotels/hotel-reviews/amsnt-amsterdam-marriott-hotel")
    soup = BeautifulSoup(page.content, 'html.parser')
    general_data = soup.find_all(class_='bvseo-review')

    i = 1
    first = general_data[i]
    i += 1

    for item in general_data:
        span = first.find_all('span')
        description = first.find_all('span', attrs={'itemprop': 'description'})
        rating = first.find_all('span', attrs={'itemprop': 'ratingValue'})
        auteur = first.find_all('span', attrs={'itemprop': 'author'})

pagereviews = pd.DataFrame({
    "description": description,
    "ratingValue": rating,
    "author": auteur
})

pagereviews
The desired result is that the DataFrame contains 10 unique reviews.
I would replace the for loop with
span = []
description = []
rating = []
auteur = []

for item in general_data:
    span.append(item.find_all('span'))
    description.append(item.find_all('span', attrs={'itemprop': 'description'}))
    rating.append(item.find_all('span', attrs={'itemprop': 'ratingValue'}))
    auteur.append(item.find_all('span', attrs={'itemprop': 'author'}))
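A possible follow-up (not part of the original answer): since find_all returns lists of tags, the fields could be reduced to plain text per review before building the DataFrame. A minimal sketch, assuming each review block contains at most one of each itemprop span:

# Sketch: one row of plain text per review; the fallback to None is an illustrative choice.
rows = []
for item in general_data:
    desc = item.find('span', attrs={'itemprop': 'description'})
    rate = item.find('span', attrs={'itemprop': 'ratingValue'})
    auth = item.find('span', attrs={'itemprop': 'author'})
    rows.append({
        "description": desc.get_text(strip=True) if desc else None,
        "ratingValue": rate.get_text(strip=True) if rate else None,
        "author": auth.get_text(strip=True) if auth else None,
    })
pagereviews = pd.DataFrame(rows)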

I need to scrape the job description text for every job title on the mentioned page

I need to scrape the job descriptions on the page () for every job title, e.g. section (accounting), job title (staff accountant), and the job description text under that title, into different columns of a CSV file, using the Python Beautiful Soup module.
I'm new to Beautiful Soup. I tried some ways of doing it, but it's not working. Can you please help with the code?
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

start = time.time()
url = ""
data = []

while True:
    resp = requests.get(url)
    soup = BeautifulSoup(resp.content, 'lxml')
    jobdesc = soup.find("li", {'class': 'col-xs-12 col-sm-4'})
    section = soup.find("h4")
    jd = {"jobdescription": jobdesc.text, "topic": section.text}
    data.append(jd)

df = pd.DataFrame(data)
df.to_csv("JD.csv")
Here is one way, leveraging :has in bs4 4.7.1+ to isolate the sections for looping over. zip_longest is used so we can join the section title onto each job.
import requests, csv
from bs4 import BeautifulSoup as bs
from itertools import zip_longest

r = requests.get('https://resources.workable.com/job-descriptions/#', headers={'User-Agent': 'Mozilla/5.0'})
soup = bs(r.content, 'lxml')

with open("data.csv", "w", encoding="utf-8-sig", newline='') as csv_file:
    w = csv.writer(csv_file, delimiter=",", quoting=csv.QUOTE_MINIMAL)
    w.writerow(['Section', 'Job Title'])
    for section in soup.select('section:has(.job)'):
        title = section.select_one('a').text.strip()
        jobs = [job.text for job in section.select('li a')]
        rows = list(zip_longest([title], jobs, fillvalue=title))
        for row in rows:
            w.writerow(row)
I got a 403 Forbidden using the requests package, so I decided to use Selenium. You can try this:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
from selenium import webdriver

url = "https://resources.workable.com/job-descriptions/#"
data = []
#resp = requests.get(url)
#soup = BeautifulSoup(resp.text, 'html.parser')
driver = webdriver.Firefox()
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')

section = soup.find_all('section', {'class': 'box-white'})
for s in section:
    title = s.find('h4').text
    lis = soup.find_all("li", {'class': 'col-xs-12 col-sm-4'})
    for li in lis:
        jd = {"jobdescription": li.text, "topic": title}
        data.append(jd)

df = pd.DataFrame(data)
df.to_csv("JD.csv")
EDIT: To get the description for all jobs:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
from selenium import webdriver

url = "https://resources.workable.com/job-descriptions/#"
data = []
#resp = requests.get(url)
#soup = BeautifulSoup(resp.text, 'html.parser')
driver = webdriver.Firefox()
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')

section = soup.find_all('section', {'class': 'box-white'})
for s in section:
    title = s.find('h4').text
    lis = s.find_all("li", {'class': 'col-xs-12 col-sm-4'})
    for li in lis:
        job = li.text
        driver.get(li.find('a').get('href'))
        soup2 = BeautifulSoup(driver.page_source, 'html.parser')
        jd = {"job": job, "topic": title, "description": soup2.find('div', {'class': 'entry-content article-content'}).text}
        data.append(jd)

df = pd.DataFrame(data)
df.to_csv("JD.csv")
Scraping data from Monster jobs and uploading to MongoDB
from time import *
from selenium import webdriver
import pymongo
from pymongo.results import InsertManyResult
import os

client = pymongo.MongoClient()
mydb = client['jobs']
collection = mydb['med_title']

driver = webdriver.Chrome("C:/Users/91798/Desktop/pythn_files/chromedriver.exe")
driver.get("https://www.monsterindia.com/")
driver.implicitly_wait(9)
driver.find_element_by_id("SE_home_autocomplete").send_keys("nursing , Therapist , docter , medical ,nurse , hospital")
#for normal search use this
driver.find_element_by_xpath("//body/div[@id='themeDefault']/section[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/form[1]/div[1]/div[2]/input[1]").click()
driver.implicitly_wait(20)

temp = 1
while(True):
    if temp == 5:
        break
    all_jobs = driver.find_elements_by_class_name("card-apply-content")
    link_list = []
    for job in all_jobs:
        try:
            company = ""
            com_name = job.find_elements_by_class_name("job-tittle")
            driver.implicitly_wait(1)
            for ele in com_name:
                company = ele.find_element_by_class_name('company-name').text
            job_title = ""
            for ele in com_name:
                job_title = ele.find_element_by_class_name('medium').text
            location = job.find_element_by_class_name("loc").text
            driver.implicitly_wait(1)
            lnks = job.find_elements_by_tag_name("a")
            for lnk in lnks:
                link_list.append(lnk.get_attribute('href'))
                break
            driver.implicitly_wait(1)
            desc = job.find_element_by_class_name("job-descrip").text
            driver.implicitly_wait(1)
            skills = job.find_element_by_class_name("descrip-skills").text
        except:
            desc = 'desc Not Specified'
            skills = 'skills Not Specified'
            location = ' location Not Specified'
            company = 'company Not Specified'
            job_title = 'job_title not specified'
        s = skills.split(' ')
        for i in s:
            if i == ',':
                s.remove(',')
        data = {"job_title": job_title, "comapany_name": company, "job_location": location,
                "job_desc": desc, "skills": s[2::], "card_link": link_list[0]}
        link_list.clear()
        y = collection.insert_one(data)
        print(y.inserted_id)
    driver.find_element_by_xpath("//button[contains(text(),'Next')]").click()
    sleep(25)
    temp = temp + 1

How can I make the PhantomJS webdriver wait until a specific HTML element is loaded and then return the page source?

I have developed the code below for a web crawling object.
It takes two dates as inputs, then creates a list of dates between those two dates and attaches each one to a webpage URL that contains weather information for a location. It then converts the HTML tables of data into a DataFrame and stores the data as a CSV file (the base link is https://www.wunderground.com/history/daily/ir/mashhad/OIMM/date/2019-1-3, and as you can see in this example the date is 2019-1-3):
from datetime import timedelta, date
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
from furl import furl
import os
import time

class WebCrawler():
    def __init__(self, st_date, end_date):
        if not os.path.exists('Data'):
            os.makedirs('Data')
        self.path = os.path.join(os.getcwd(), 'Data')
        self.driver = webdriver.PhantomJS()
        self.base_url = 'https://www.wunderground.com/history/daily/ir/mashhad/OIMM/date/'
        self.st_date = st_date
        self.end_date = end_date

    def date_list(self):
        # Create list of dates between the two dates given as inputs.
        dates = []
        total_days = int((self.end_date - self.st_date).days + 1)
        for i in range(total_days):
            date = self.st_date + timedelta(days=i)
            dates.append(date.strftime('%Y-%m-%d'))
        return dates

    def create_link(self, attachment):
        # Attach dates to the base link
        f = furl(self.base_url)
        f.path /= attachment
        f.path.normalize()
        return f.url

    def open_link(self, link):
        # Opens the link, visits the page and returns the html source code of the page
        self.driver.get(link)
        html = self.driver.page_source
        return html

    def table_to_df(self, html):
        # Finds the table of weather data, converts it into a pandas dataframe and returns it
        soup = BeautifulSoup(html, 'lxml')
        table = soup.find("table", {"class": "tablesaw-sortable"})
        dfs = pd.read_html(str(table))
        df = dfs[0]
        return df

    def to_csv(self, name, df):
        # Save the dataframe as a csv file in the defined path
        filename = name + '.csv'
        df.to_csv(os.path.join(self.path, filename), index=False)
This is the way I want to use the WebCrawler object:
date1 = date(2018, 12, 29)
date2 = date(2019, 1, 1)

# Initialize WebCrawler object
crawler = WebCrawler(st_date=date1, end_date=date2)
dates = crawler.date_list()

for day in dates:
    print('**************************')
    print('PROCESSING : ', day)
    link = crawler.create_link(day)
    print('WAITING... ')
    time.sleep(3)
    print('VISIT WEBPAGE ... ')
    html = crawler.open_link(link)
    print('DATA RETRIEVED ... ')
    df = crawler.table_to_df(html)
    print(df.head(3))
    crawler.to_csv(day, df)
    print('DATA SAVED ...')
The problem is that the first iteration of the loop runs perfectly, but the second one stops with an error saying "No tables were found" (it occurs in the table = soup.find("table",{"class":"tablesaw-sortable"}) line). That happens because the page source is returned by WebCrawler.open_link before the webpage has fully loaded its contents, including the table (containing the weather information). There is also a chance that the website rejects the request because it is making the servers too busy.
Is there any way to build a loop that keeps trying to open the link until it can find the table, or at least waits until the table is loaded before returning the page source?
You can have Selenium wait for a specific element. In your case it will be the table with the class name "tablesaw-sortable". I highly recommend that you use CSS selectors to find this element, as it's fast and less error prone than getting all table elements.
Here is the CSS selector, premade for you: table.tablesaw-sortable. Set Selenium to wait until that element has loaded.
Source: https://stackoverflow.com/a/26567563/4159473
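A minimal sketch of that wait (assuming the asker's open_link method is adjusted to block on the table before grabbing the page source; the 10-second timeout is an illustrative choice):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def open_link(self, link):
    # Open the page, then wait until the weather table is present before returning the source.
    self.driver.get(link)
    WebDriverWait(self.driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "table.tablesaw-sortable")))
    return self.driver.page_source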
I rewrote the code using the https://stackoverflow.com/a/26567563/4159473 solution suggested by @mildmelon, and I also added some delay between sending each request to the server and asking for the page source:
from datetime import timedelta, date
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import pandas as pd
from furl import furl
import os
import time

class WebCrawler():
    def __init__(self, st_date, end_date):
        if not os.path.exists('Data'):
            os.makedirs('Data')
        self.path = os.path.join(os.getcwd(), 'Data')
        self.driver = webdriver.PhantomJS()
        self.delay_for_page = 7
        self.base_url = 'https://www.wunderground.com/history/daily/ir/mashhad/OIMM/date/'
        self.st_date = st_date
        self.end_date = end_date

    def date_list(self):
        # Create list of dates between the two dates given as inputs.
        dates = []
        total_days = int((self.end_date - self.st_date).days + 1)
        for i in range(total_days):
            date = self.st_date + timedelta(days=i)
            dates.append(date.strftime('%Y-%m-%d'))
        return dates

    def create_link(self, attachment):
        # Attach dates to the base link
        f = furl(self.base_url)
        f.path /= attachment
        f.path.normalize()
        return f.url

    def open_link(self, link):
        # Opens the link and waits until the weather table is present on the page
        self.driver.get(link)
        myElem = WebDriverWait(self.driver, self.delay_for_page)\
            .until(EC.presence_of_element_located((By.CLASS_NAME, 'tablesaw-sortable')))

    def table_to_df(self, html):
        # Finds the table of weather data, converts it into a pandas dataframe and returns it
        soup = BeautifulSoup(html, 'lxml')
        table = soup.find("table", {"class": "tablesaw-sortable"})
        dfs = pd.read_html(str(table))
        df = dfs[0]
        return df

    def to_csv(self, name, df):
        # Save the dataframe as a csv file in the defined path
        filename = name + '.csv'
        df.to_csv(os.path.join(self.path, filename), index=False)
date1 = date(2019, 2, 1)
date2 = date(2019, 3, 5)

# Initialize WebCrawler object
crawler = WebCrawler(st_date=date1, end_date=date2)
dates = crawler.date_list()

for day in dates:
    print('**************************')
    print('DATE : ', day)
    link = crawler.create_link(day)
    print('WAITING ....')
    print('')
    time.sleep(12)
    print('OPENING LINK ... ')
    try:
        crawler.open_link(link)
        html = crawler.driver.page_source
        print("DATA IS FETCHED")
        df = crawler.table_to_df(html)
        print(df.head(3))
        crawler.to_csv(day, df)
        print('DATA SAVED ...')
    except TimeoutException:
        print("NOT FETCHED ...!!!")
The weather information is now fetched without problems. I suspect the delay between requests resulted in better performance. The line myElem = WebDriverWait(self.driver, self.delay_for_page).until(EC.presence_of_element_located((By.CLASS_NAME, 'tablesaw-sortable'))) has also improved speed.
