error while using selenium and writing to file - python-3.x

import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
import pandas as pd
from pytube import YouTube
browser = webdriver.Chrome("C:/Users/Downloads/chromedriver_win32/chromedriver.exe")
browser.get("https://www.youtube.com/channel/UCaKt8dvEIPnEHWSbLYhzrxg/videos")
time.sleep(1)
elem = browser.find_element_by_tag_name("body")
no_of_pagedowns = 100
while no_of_pagedowns:
    elem.send_keys(Keys.PAGE_DOWN)
    time.sleep(0.2)
    no_of_pagedowns -= 1
html = browser.page_source
soup = BeautifulSoup(html, "lxml")
tags = soup.find_all('a')
fname = "C:/Stock_in_CFD/Output.txt"
text_file = open(fname, "w+", encoding="utf-8")
for tag in tags:
    t = tag.get('href')
    text_file.write(t)
When I run the above code, I get this error:
TypeError: write() argument must be str, not None
When I am not using Selenium, I am able to do it.
I am using Selenium because I want to scroll down the entire page before parsing it with BeautifulSoup.
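The TypeError comes from tag.get('href') returning None for anchors that have no href attribute; a guarded loop, as a minimal sketch, skips those tags and closes the file:
for tag in tags:
    t = tag.get('href')
    if t is not None:  # <a> tags without an href attribute return None
        text_file.write(t + '\n')
text_file.close()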

Related

Scraping without specific strings in Python3

I'm trying to scrape only emoji in Python 3. I used the startswith method with an if statement, but the result contains some Unicode escape sequences, even though those emojis' HTML tags look the same as the others'. I have no idea why some emojis are converted into Unicode escapes. Could you give me any advice, or is there any way to remove these escapes from the list?
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
import re
import os
list0 = []
site_url = "https://www.emojiall.com/zh-hant/categories/A"
get_url = requests.get(site_url)
soup = BeautifulSoup(get_url.text, "lxml")
for script in soup(["span"]):
    script.extract()
emojis = soup.select('.emoji_font')
words = soup.select('.emoji_name_truncate')
for emoji0 in emojis:
    emoji1 = emoji0.getText()
    if not repr(emoji1).startswith(r'\U'):
        list0.append(emoji1)
    else:
        continue
print(list0)
I updated the check and now it works: repr() wraps the string in quotes, so the startswith pattern has to include the leading quote.
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
import re
import os
list0 = []
site_url = "https://www.emojiall.com/zh-hant/categories/A"
get_url = requests.get(site_url)
soup = BeautifulSoup(get_url.text, "lxml")
for script in soup(["span"]):
    script.extract()
emojis = soup.select('.emoji_font')
words = soup.select('.emoji_name_truncate')
for emoji0 in emojis:
    emoji1 = emoji0.getText()
    if not repr(emoji1).startswith(r"'\U"):
        list0.append(emoji1)
    else:
        continue
print(list0)
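An equivalent check that avoids matching against repr() output, as a small sketch: repr() only emits \U escapes for characters that are not printable, which str.isprintable() detects directly.
for emoji0 in emojis:
    emoji1 = emoji0.getText()
    # repr() only escapes non-printable characters, so isprintable()
    # keeps exactly the strings that would not show up as \U escapes
    if emoji1.isprintable():
        list0.append(emoji1)
print(list0)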

Python Beautifulsoup / Requests text from span

I'm trying to extract text using BeautifulSoup or requests from this Facebook page: https://www.facebook.com/marketplace/item/1612977352197759/
The text is the item description, the text before the map.
This is what I've tried so far, but it's not working:
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from lxml import html
opt = Options()
opt.add_argument("--disable-infobars")
opt.add_argument("start-maximized")
# Pass the argument 1 to allow and 2 to block
opt.add_experimental_option("prefs", {
    "profile.default_content_setting_values.media_stream_mic": 2,
    "profile.default_content_setting_values.media_stream_camera": 2,
    "profile.default_content_setting_values.geolocation": 2,
    "profile.default_content_setting_values.notifications": 2
})
global driver
driver = webdriver.Chrome(chromedriver)
driver.get('https://www.google.com')
page = requests.get('https://www.facebook.com/marketplace/item/1612977352197759/?ref=messenger_banner')
tree = html.fromstring(page.content)
print(tree)
link = tree.xpath("//span[contains(string(),'hello')]")
print(link)
Try this:
import requests
from bs4 import BeautifulSoup
import re
page = requests.get('https://www.facebook.com/marketplace/item/1612977352197759/?ref=messenger_banner')
soup = BeautifulSoup(page.text,'lxml')
span = soup.find("span",string=re.compile(".*hello.*"))
print(span)
Do let me know if this works.
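If requests only receives Facebook's login or JavaScript shell rather than the rendered listing, parsing the Selenium-rendered source may work better; a minimal sketch, assuming chromedriver is on the PATH and the item page is reachable without logging in:
import re
from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome()  # assumes chromedriver is on the PATH
driver.get('https://www.facebook.com/marketplace/item/1612977352197759/')
# parse the rendered page source, not a separate requests response
soup = BeautifulSoup(driver.page_source, 'lxml')
span = soup.find('span', string=re.compile('hello'))
print(span.get_text(strip=True) if span else 'no matching span found')
driver.quit()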

How to scrape price from booking.com using beautifulsoup?

I am trying to scrape prices from booking.com, but without success. Any suggestions?
My code is as follows:
#Importing necessary library
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.firefox.options import Options
import pandas as pd
import time
import re
import requests
from itertools import zip_longest
from webdriver_manager.chrome import ChromeDriverManager
price = []
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get('https://www.booking.com/searchresults.en-gb.html?label=gen173nr-1FCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AEB6AEB-AELiAIBqAIDuALnhOzyBcACAQ&lang=en-gb&sid=422b3ff3c0e98b522259ad1cad2505ea&sb=1&src=searchresults&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Fsearchresults.en-gb.html%3Flabel%3Dgen173nr-1FCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AEB6AEB-AELiAIBqAIDuALnhOzyBcACAQ%3Bsid%3D422b3ff3c0e98b522259ad1cad2505ea%3Btmpl%3Dsearchresults%3Bclass_interval%3D1%3Bdest_id%3D-1506909%3Bdest_type%3Dcity%3Bdtdisc%3D0%3Bfrom_sf%3D1%3Bgroup_adults%3D2%3Bgroup_children%3D0%3Binac%3D0%3Bindex_postcard%3D0%3Blabel_click%3Dundef%3Bno_rooms%3D1%3Boffset%3D0%3Bpostcard%3D0%3Braw_dest_type%3Dcity%3Broom1%3DA%252CA%3Bsb_price_type%3Dtotal%3Bshw_aparth%3D1%3Bslp_r_match%3D0%3Bsrc%3Dindex%3Bsrc_elem%3Dsb%3Bsrpvid%3D912403b6d1220012%3Bss%3DAuckland%3Bss_all%3D0%3Bssb%3Dempty%3Bsshis%3D0%3Bssne%3DAuckland%3Bssne_untouched%3DAuckland%3Btop_ufis%3D1%26%3B&sr_autoscroll=1&ss=Auckland&is_ski_area=0&ssne=Auckland&ssne_untouched=Auckland&city=-1506909&checkin_year=2020&checkin_month=9&checkin_monthday=1&checkout_year=2020&checkout_month=9&checkout_monthday=2&group_adults=2&group_children=0&no_rooms=1&from_sf=1')
time.sleep(5)
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
for item in soup.findAll('label', {'class': "tpi_price_label tpi_price_label__orange"}):
    price.append(item.get_text(strip=True))
print(price)
The above code is not showing any output. It gives an empty list.
You need to properly wait for the page to load.
This is done using WebDriverWait, which will throw an exception if the page isn't loaded within the specified timeout.
Try running my sample code below:
# test_scrape.py
import atexit
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait

URL = ("https://www.booking.com/searchresults.en-gb.html?"
       "label=gen173nr-1FCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AEB6AEB-AELiAIBqAIDuALnhOzyBcACAQ"
       "&lang=en-gb&sid=422b3ff3c0e98b522259ad1cad2505ea&sb=1&src=searchresults&src_elem=sb"
       "&error_url=https%3A%2F%2Fwww.booking.com%2Fsearchresults.en-gb.html%3Flabel%3Dgen173nr-"
       "1FCAEoggI46AdIM1gEaK4BiAEBmAEJuAEXyAEM2AEB6AEB-AELiAIBqAIDuALnhOzyBcACAQ%3Bsid%3D422b3ff"
       "3c0e98b522259ad1cad2505ea%3Btmpl%3Dsearchresults%3Bclass_interval%3D1%3Bdest_id%3D-150690"
       "9%3Bdest_type%3Dcity%3Bdtdisc%3D0%3Bfrom_sf%3D1%3Bgroup_adults%3D2%3Bgroup_children%3D0%3"
       "Binac%3D0%3Bindex_postcard%3D0%3Blabel_click%3Dundef%3Bno_rooms%3D1%3Boffset%3D0%3Bpostcar"
       "d%3D0%3Braw_dest_type%3Dcity%3Broom1%3DA%252CA%3Bsb_price_type%3Dtotal%3Bshw_aparth%3D1%3Bs"
       "lp_r_match%3D0%3Bsrc%3Dindex%3Bsrc_elem%3Dsb%3Bsrpvid%3D912403b6d1220012%3Bss%3DAuckland%3B"
       "ss_all%3D0%3Bssb%3Dempty%3Bsshis%3D0%3Bssne%3DAuckland%3Bssne_untouched%3DAuckland%3Btop_ufi"
       "s%3D1%26%3B&sr_autoscroll=1&ss=Auckland&is_ski_area=0&ssne=Auckland&ssne_untouched=Auckland&ci"
       "ty=-1506909&checkin_year=2020&checkin_month=9&checkin_monthday=1&checkout_year=2020&checkout_m"
       "onth=9&checkout_monthday=2&group_adults=2&group_children=0&no_rooms=1&from_sf=1")

class page_loaded:
    def __call__(self, driver):
        document_ready = driver.execute_script("return document.readyState;") == "complete"
        jquery_ready = driver.execute_script("return jQuery.active == 0;")
        print(f"document ready: [({type(document_ready).__name__}){document_ready}]")
        print(f"jquery ready: [({type(jquery_ready).__name__}){jquery_ready}]")
        return document_ready and jquery_ready

def wait_for_page_to_load(driver, timeout_seconds=20):
    WebDriverWait(driver, timeout_seconds, 0.2).until(page_loaded(), f"Page could not load in {timeout_seconds} s.!")

def go_to_url(driver, url):
    driver.get(url)
    wait_for_page_to_load(driver)

def get_orange_prices(soup):
    return [price_label.get_text(strip=True)
            for price_label
            in soup.select("label.tpi_price_label.tpi_price_label__orange")]

def get_normal_prices(soup):
    return [price_label.get_text(strip=True)
            for price_label
            in soup.select("div[class*=bui-price-display__value]")]

def start_driver():
    driver = webdriver.Chrome()
    atexit.register(driver.quit)
    driver.maximize_window()
    return driver

def main():
    driver = start_driver()
    go_to_url(driver, URL)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    orange_prices = get_orange_prices(soup)
    print(orange_prices)
    normal_prices = get_normal_prices(soup)
    print(normal_prices)

if __name__ == '__main__':
    main()
If you're having issues with the chromedriver not being discovered, try specifying the exact path to it, like this:
def start_driver():
    driver = webdriver.Chrome(executable_path="/path/to/chromedriver")
    atexit.register(driver.quit)
    driver.maximize_window()
    return driver
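As an alternative to the custom page_loaded condition, Selenium also ships ready-made waits in expected_conditions; a small sketch that waits for a price element itself, assuming the CSS selector used above is stable:
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# block until at least one normal-price element is present in the DOM
WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "div[class*=bui-price-display__value]"))
)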

How get the text with BeautifulSoup in this html code: <span id="pass_0" class="text-success">c#</span>

I'm writing a program that cracks some hashes, using Selenium and BeautifulSoup with this website: https://hashkiller.co.uk/Cracker
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import requests
import time
target = requests.get("https://hashkiller.co.uk/Cracker")
soup = BeautifulSoup(target.content, 'html.parser')
driver = webdriver.Chrome(executable_path=r"D:\Download\chromedriver.exe")
#driver.set_window_position(-10000,0)
#240aa2cec4b29c56f3bee520a8dcee7e
driver.get("https://hashkiller.co.uk/Cracker")
hash = input("Hash: ")
hash_box = driver.find_element_by_id("txtHashList").send_keys(hash)
hash_submit = driver.find_element_by_id("btnCrack").click()
time.sleep(5)
hash_table = soup.find('span', {'class': 'text-success'})
a = hash_table.text
print(hash_table)
I expect the output to be c# (image: https://imgur.com/kEegEgY ; HTML code: <span id="pass_0" class="text-success">c#</span>)
but it returns: <span class="text-success">$pass</span>
Instead of $pass there should be c#.
You're actually not parsing the rendered HTML; you're parsing the HTML response from your requests call.
Secondly, you want to grab the second element, as the first element is the $pass. Also, rename hash to a different variable, as it shadows a built-in function in Python:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import requests
import time
#target = requests.get("https://hashkiller.co.uk/Cracker")
#soup = BeautifulSoup(target.content, 'html.parser')
driver = webdriver.Chrome("C:/chromedriver.exe")
#driver.set_window_position(-10000,0)
#240aa2cec4b29c56f3bee520a8dcee7e
driver.get("https://hashkiller.co.uk/Cracker")
hash_input = input("Hash: ")
hash_box = driver.find_element_by_id("txtHashList").send_keys(hash_input)
hash_submit = driver.find_element_by_id("btnCrack").click()
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'html.parser')
hash_table = soup.find_all('span', {'class': 'text-success'})
a = hash_table[1].text
print(hash_table)
print(a)
driver.close()
Output:
[<span class="text-success">$pass</span>, <span class="text-success" id="pass_0">c#</span>]
c#
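Since the rendered span carries id="pass_0" (per the HTML in the question), selecting it by id is an alternative to indexing into the find_all result; a small sketch:
result = soup.find('span', id='pass_0')  # the cracked value's span, per the question's HTML
if result is not None:
    print(result.text)  # expected output: c#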

Beautifulsoup image downloading error

I am trying to download the images from the image URLs that come back from a BeautifulSoup scrape. I was trying to get this code to work after reading some other examples, but I am
getting the error:
f.write(requests.get(img))
TypeError: a bytes-like object is required, not 'Response'
The line f.write(requests.get(img)**[What goes here?]**) is causing me trouble now. I use soup = BeautifulSoup(source, 'html.parser'),
whereas the reference uses soup = BeautifulSoup(r.content).
import os
dir_path = os.path.dirname(os.path.realpath(__file__))
import urllib.request
from bs4 import BeautifulSoup
import codecs
import sys
import requests
from os.path import basename
lines = open('list.txt').read().splitlines()
for designers in lines:
    for i in range(1):
        url = 'https://www.example.com/Listings?st=' + designers + '{}'.format(i)
        source = urllib.request.urlopen(url)
        soup = BeautifulSoup(source, 'html.parser')
        for products in soup.find_all('li', class_='widget'):
            image = products.find('img', class_='lazy-load')
            print(image.get('data-src'))
            img = (image.get('data-src'))
            with open(basename(img), "wb") as f:
                f.write(requests.get(img)**[What goes here?]**)
Thanks!
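For the placeholder on the last line: requests.get() returns a Response object, and its .content attribute holds the body as bytes, which is what a file opened in 'wb' mode expects. A minimal sketch:
import requests
from os.path import basename

img = 'https://www.example.com/some-image.jpg'  # hypothetical image URL
resp = requests.get(img)
with open(basename(img), 'wb') as f:
    f.write(resp.content)  # .content is the raw response body as bytes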
