Saving selenium results/output at run time in text file using Python - python-3.x

I am running a script in Python3 using Selenium. I am getting my output what I expected. Now, I want to save my output to a text, or csv or json file. When I am trying to run my script and save result to a file I am getting an Error with open('bangkok_vendor.txt','a') as wt :
TypeError: 'NoneType' object is not callable
Which means loop in the program runs only one time and does not store data in file called bangkok_vendor.txt. In normal python scraper programs it would n't have any problem storing data but this is first time I am using selenium. Can you please help me with solution thanks.
I am trying to run this script from my terminal command and output is what to save to any file format :
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import csv
import requests
contents =[]
filename = 'link_business_filter.csv'
def copy_json():
with open("bangkok_vendor.text",'w') as wt:
for x in script2:
wt.writer(x)
wt.close()
with open(filename,'rt') as f:
data = csv.reader(f)
for row in data:
links = row[0]
contents.append(links)
for link in contents:
url_html = requests.get(link)
print(link)
browser = webdriver.Chrome('chromedriver')
open = browser.get(link)
source = browser.page_source
data = bs(source,"html.parser")
body = data.find('body')
script = body
x_path = '//*[#id="react-root"]/section/main/div'
script2 = browser.find_element_by_xpath(x_path)
script3 = script2.text
#script2.send_keys(keys.COMMAND + 't')
browser.close()
print(script3)

You need to pass script2 as a parameter for copy_json function and call it when you extract the data from the page.
Change write mode to append, otherwise the file will be reset every time you call copy_json function.
Dont overwrite built-in functions like open, otherwise you won't be able to open a file to write data once you move onto the second iteration.
I refactored your code a bit:
LINK_CSV = 'link_business_filter.csv'
SAVE_PATH = 'bangkok_vendor.txt'
def read_links():
links = []
with open(LINK_CSV) as f:
reader = csv.reader(f)
for row in reader:
links.append(row[0])
return links
def write_data(data):
with open(SAVE_PATH, mode='a') as f:
f.write(data + "\n")
if __name__ == '__main__':
browser = webdriver.Chrome('chromedriver')
links = read_links()
for link in links:
browser.get(link)
# You may have to wait a bit here
# until the page is loaded completely
html = browser.page_source
# Not sure what you're trying to do with body
# soup = BeautifulSoup(html, "html.parser")
# body = soup.find('body')
x_path = '//*[#id="react-root"]/section/main/div'
main_div = browser.find_element_by_xpath(x_path)
text = main_div.text
write_data(text)
# close browser after every link is processed
browser.quit()

Related

Output from web scraping with bs4 returns empty lists

I am trying to scrape specific information from a website of 25 pages but when I run my code i get empty lists. My output is supposed to be dictionary with the specific information scraped. Please any help would be appreciated.
# Loading libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import mitosheet
# Assigning column names using class_ names
name_selector = "af885_1iPzH"
old_price_selector = "f6eb3_1MyTu"
new_price_selector = "d7c0f_sJAqi"
discount_selector = "._6c244_q2qap"
# Placeholder list
data = []
# Looping over each page
for i in range(1,26):
url = "https://www.konga.com/category/phones-tablets-5294?brand=Samsung&page=" +str(i)
website = requests.get(url)
soup = BeautifulSoup(website.content, 'html.parser')
name = soup.select(name_selector)
old_price = soup.select(old_price_selector)
new_price = soup.select(new_price_selector)
discount = soup.select(discount_selector)
# Combining the elements into a zipped list to be able to pull the data simultaneously
for names, old_prices, new_prices, discounts in zip(name, old_price, new_price, discount):
dic = {"Phone Names": names.getText(),"New Prices": new_prices.getText(),"Old Prices": old_prices.getText(),"Discounts": discounts.getText()}
data.append(dic)
data
I tested the below and it works for me getting 40 name values.
I wasn't able to get the values using beautiful soup but directly through selenium.
If you decide to use Chrome and PyCharm as I have then:
Open Chrome. Click on three dots near top right. Click on Settings then About Chrome to see the version of your Chrome. Download the corresponding driver here. Save the driver in the PyCharm PATH folder
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
# Assigning column names using class_ names
name_selector = "af885_1iPzH"
# Looping over each page
for i in range(1, 27):
url = "https://www.konga.com/category/phones-tablets-5294?brand=Samsung&page=" +str(i)
driver.get(url)
xPath = './/*[#class="' + name_selector + '"]'
name = driver.find_elements(By.XPATH, xPath)

web scraping help needed

I was wondering if someone could help me put together some code for
https://finance.yahoo.com/quote/TSCO.l?p=TSCO.L
I currently use this code to scrape the current price
currentPriceData = soup.find_all('div', {'class':'My(6px) Pos(r) smartphone_Mt(6px)'})[0].find('span').text
This works fine but I occasionally get an error not really sure why as the links are all correct. but I would like to try to get the price again
so something like
try:
currentPriceData = soup.find_all('div', {'class':'My(6px) Pos(r) smartphone_Mt(6px)'})[0].find('span').text
except Exception:
currentPriceData = soup.find('span', {'class':'Trsdu(0.3s) Fw(b) Fz(36px) Mb(-4px) D(ib)'})[0].text
The problem is that I can't get it to scrape the number using this method any help would be greatly appreciated.
The data is embedded within the page as Javascript variable. But you can use json module to parse it.
For example:
import re
import json
import requests
url = 'https://finance.yahoo.com/quote/TSCO.l?p=TSCO.L'
html_data = requests.get(url).text
#the next line extracts from the HTML source javascript variable
#that holds all data that is rendered on page.
#BeautifulSoup cannot run Javascript, so we are going to use
#`json` module to extract the data.
#NOTE: When you view source in Firefox/Chrome, you can search for
# `root.App.main` to see it.
data = json.loads(re.search(r'root\.App\.main = ({.*?});\n', html_data).group(1))
# uncomment this to print all data:
# print(json.dumps(data, indent=4))
# We now have the Javascript variable extracted to standard python
# dict, so now we just print contents of some keys:
price = data['context']['dispatcher']['stores']['QuoteSummaryStore']['price']['regularMarketPrice']['fmt']
currency_symbol = data['context']['dispatcher']['stores']['QuoteSummaryStore']['price']['currencySymbol']
print('{} {}'.format(price, currency_symbol))
Prints:
227.30 £

Saving data in Selenium

I am trying to save output data after I am running successful script in python using Selenium. But, I am not able to save result at end of my run/ script. My code is running fine, only problem is I am not able to save out to a file which can be .json, csv or text. I need serious help on this one.
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import csv
import requests
# saving data in bangkok_vendor.text
def copy_info():
with open('bangkok_vendor.text','a') as wt:
for x in script3:
wt.write(x)
wt.close()
return
contents =[]
filename = 'link_business_filter.csv'
with open(filename,'rt') as f:
data = csv.reader(f)
for row in data:
links = row[0]
contents.append(links)
for link in contents:
url_html = requests.get(link)
print(link)
browser = webdriver.Chrome('chromedriver')
open = browser.get(link)
source = browser.page_source
data = bs(source,"html.parser")
body = data.find('body')
script = body
x_path = '//*[#id="react-root"]/section/main/div'
script2 = browser.find_element_by_xpath(x_path)
script3 = script2.text
#script2.send_keys(keys.COMMAND + 't')
browser.close()
print(script3)
copy_info()
Did you try using csv.writer for csv files? Please check out the following link. hope it helps.
Save results to csv file with Python

Beautifulsoup parse thousand pages

I have a script parsing a list with thousands of URLs.
But my problem is, that it would take ages to be done with that list.
The URL request takes around 4 seconds before the page is loaded and can be parsed.
Is there any way to parse really a large amount of URLs fast?
My code looks like this:
from bs4 import BeautifulSoup
import requests
#read url-list
with open('urls.txt') as f:
content = f.readlines()
# remove whitespace characters
content = [line.strip('\n') for line in content]
#LOOP through urllist and get information
for i in range(5):
try:
for url in content:
#get information
link = requests.get(url)
data = link.text
soup = BeautifulSoup(data, "html5lib")
#just example scraping
name = soup.find_all('h1', {'class': 'name'})
EDIT:
how to handle Asynchronous Requests with hooks in this example? I tried the following as mentioned on this site Asynchronous Requests with Python requests:
from bs4 import BeautifulSoup
import grequests
def parser(response):
for url in urls:
#get information
link = requests.get(response)
data = link.text
soup = BeautifulSoup(data, "html5lib")
#just example scraping
name = soup.find_all('h1', {'class': 'name'})
#read urls.txt and store in list variable
with open('urls.txt') as f:
urls= f.readlines()
# you may also want to remove whitespace characters
urls = [line.strip('\n') for line in urls]
# A list to hold our things to do via async
async_list = []
for u in urls:
# The "hooks = {..." part is where you define what you want to do
#
# Note the lack of parentheses following do_something, this is
# because the response will be used as the first argument automatically
rs = grequests.get(u, hooks = {'response' : parser})
# Add the task to our list of things to do via async
async_list.append(rs)
# Do our list of things to do via async
grequests.map(async_list, size=5)
This doesn't work for me. I don't even get any error in the console, it is just running for long time until it stops.

Store scrape results and search in results with Python and Pandas?

as part of my Ph.D. research, I am scraping numerous webpages and search for keywords within the scrape results.
This is how I do it thus far:
# load data with as pandas data frame with column df.url
df = pd.read_excel('sample.xls', header=0)
# define keyword search function
def contains_keywords(link, keywords):
try:
output = requests.get(link).text
return int(any(x in output for x in keywords))
except:
return "Wrong/Missing URL"
# define the relevant keywords
mykeywords = ('for', 'bar')
# store search results in new column 'results'
df['results'] = df.url.apply(lambda l: contains_keywords(l, mykeywords))
This works just fine. I only have one problem: the list of relevant keywords mykeywordschanges frequently, whilst the webpages stay the same. Running the code takes a long time, since I request over and over.
I have two questions:
(1) Is there a way to store the results of request.get(link).text?
(2) And if so, how to I search within the saved file(s) producing the same result as with the current script?
As always, thank you for your time and help! /R
You can download the content of the urls and save them in separate files in a directory (eg: 'links')
def get_link(url):
file_name = os.path.join('/path/to/links', url.replace('/', '_').replace(':', '_'))
try:
r = requests.get(url)
except Exception as e:
print("Failded to get " + url)
else:
with open(file_name, 'w') as f:
f.write(r.text)
Then modify the contains_keywords function to read local files, so you won't have to use requests every time you run the script.
def contains_keywords(link, keywords):
file_name = os.path.join('/path/to/links', link.replace('/', '_').replace(':', '_'))
try:
with open(file_name) as f:
output = f.read()
return int(any(x in output for x in keywords))
except Exception as e:
print("Can't access file: {}\n{}".format(file_name, e))
return "Wrong/Missing URL"
Edit: i just added a try-except block in get_link and used absolute path for file_name

Resources