Saving data in Selenium - python-3.x

I am trying to save the output data after running my Python Selenium script successfully, but I am not able to save the result at the end of the run. The script itself runs fine; the only problem is that I cannot write the output to a file (JSON, CSV, or plain text). I would really appreciate help with this.
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import csv
import requests

# saving data in bangkok_vendor.text
def copy_info():
    with open('bangkok_vendor.text', 'a') as wt:
        for x in script3:
            wt.write(x)
        wt.close()
    return

contents = []
filename = 'link_business_filter.csv'

with open(filename, 'rt') as f:
    data = csv.reader(f)
    for row in data:
        links = row[0]
        contents.append(links)

for link in contents:
    url_html = requests.get(link)
    print(link)
    browser = webdriver.Chrome('chromedriver')
    open = browser.get(link)
    source = browser.page_source
    data = bs(source, "html.parser")
    body = data.find('body')
    script = body
    x_path = '//*[@id="react-root"]/section/main/div'
    script2 = browser.find_element_by_xpath(x_path)
    script3 = script2.text
    #script2.send_keys(keys.COMMAND + 't')
    browser.close()
    print(script3)
    copy_info()

Did you try using csv.writer for CSV files? Please check out the following link; hope it helps.
Save results to csv file with Python
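For example, a minimal sketch of appending the scraped text to a CSV with csv.writer (the bangkok_vendor.csv filename is my own choice; script3 is the variable from your script):

import csv

def copy_info(text):
    # append one row per scraped page; newline='' avoids blank lines on Windows
    with open('bangkok_vendor.csv', 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([text])

# inside your loop, after extracting the element text:
# copy_info(script3)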

Related

How to download an image from the internet using google colab jupyter

I need to download an image using a URL. I managed to obtain the URLs of the images I need to download, but now I'm lost on how to download them to my local computer. I'm using Google Colab/Jupyter. Thank you!
Here's my code so far:
from bs4 import BeautifulSoup
import requests
import json
import urllib.request
#use Globe API to get data
#input userid - plan: have program read userids from csv or excel file
userid = xxxxxxxx
#use Globe API to get data
source = requests.get('https://api.globe.gov/search/v1/measurement/protocol/measureddate/userid/?protocols=land_covers&startdate=2020-05-04&enddate=2020-07-16&userid=' + str(userid) +'&geojson=FALSE&sample=FALSE').text
#set up BeautifulSoup4
soup = BeautifulSoup(source, 'lxml')
#Isolate the Json data and put it into a string called "paragraph"
body = soup.find('body')
paragraph = body.p.text
#load the string into a python object
data = json.loads(paragraph)
#pick out the needed information and store them
for landcover in data['results']:
    siteId = landcover['siteId']
    measuredDate = landcover['measuredDate']
    latitude = landcover['latitude']
    longitude = landcover['longitude']
    protocol = landcover['protocol']
    DownURL = landcover['data']['landcoversDownwardPhotoUrl']
    #Here is where I want to download the url contained in 'DownURL'
Try
from google.colab import files as FILE
import os
img_data = requests.get(DownURL).content
with open('image_name.jpg', 'wb') as handler:
    handler.write(img_data)
FILE.download('image_name.jpg')
os.remove('image_name.jpg') # to save up space
If you do not want to hard-code an image name, you can generate a random one, or use a counter variable that increments on each loop iteration.
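For instance, a minimal sketch using a counter inside the loop so every photo gets a unique name (the landcover_{}.jpg pattern is my own addition; imports are the same as above):

count = 0
for landcover in data['results']:
    DownURL = landcover['data']['landcoversDownwardPhotoUrl']
    img_data = requests.get(DownURL).content
    filename = 'landcover_{}.jpg'.format(count)  # unique name per photo
    with open(filename, 'wb') as handler:
        handler.write(img_data)
    FILE.download(filename)
    os.remove(filename)  # free up space on the Colab VM
    count += 1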

Saving selenium results/output at run time in text file using Python

I am running a Python 3 script using Selenium and I am getting the output I expected. Now I want to save that output to a text, CSV, or JSON file. When I try to run my script and save the result to a file, I get an error at with open('bangkok_vendor.txt','a') as wt:
TypeError: 'NoneType' object is not callable
This means the loop in the program runs only once and does not store data in the file called bangkok_vendor.txt. In normal Python scraper programs I wouldn't have any problem storing data, but this is the first time I am using Selenium. Can you please help me with a solution? Thanks.
I am running this script from my terminal, and the output is what I want to save to a file (any format):
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import csv
import requests

contents = []
filename = 'link_business_filter.csv'

def copy_json():
    with open("bangkok_vendor.text", 'w') as wt:
        for x in script2:
            wt.writer(x)
        wt.close()

with open(filename, 'rt') as f:
    data = csv.reader(f)
    for row in data:
        links = row[0]
        contents.append(links)

for link in contents:
    url_html = requests.get(link)
    print(link)
    browser = webdriver.Chrome('chromedriver')
    open = browser.get(link)
    source = browser.page_source
    data = bs(source, "html.parser")
    body = data.find('body')
    script = body
    x_path = '//*[@id="react-root"]/section/main/div'
    script2 = browser.find_element_by_xpath(x_path)
    script3 = script2.text
    #script2.send_keys(keys.COMMAND + 't')
    browser.close()
    print(script3)
You need to pass script2 as a parameter to the copy_json function and call it when you extract the data from the page.
Change the write mode to append, otherwise the file will be reset every time you call copy_json.
Don't overwrite built-in functions like open, otherwise you won't be able to open a file to write data once you move on to the second iteration.
I refactored your code a bit:
import csv

from selenium import webdriver

LINK_CSV = 'link_business_filter.csv'
SAVE_PATH = 'bangkok_vendor.txt'

def read_links():
    links = []
    with open(LINK_CSV) as f:
        reader = csv.reader(f)
        for row in reader:
            links.append(row[0])
    return links

def write_data(data):
    # append so results from earlier links are kept
    with open(SAVE_PATH, mode='a') as f:
        f.write(data + "\n")

if __name__ == '__main__':
    browser = webdriver.Chrome('chromedriver')
    links = read_links()
    for link in links:
        browser.get(link)
        # You may have to wait a bit here
        # until the page is loaded completely
        html = browser.page_source
        # Not sure what you're trying to do with body
        # soup = BeautifulSoup(html, "html.parser")
        # body = soup.find('body')
        x_path = '//*[@id="react-root"]/section/main/div'
        main_div = browser.find_element_by_xpath(x_path)
        text = main_div.text
        write_data(text)
    # close the browser after every link is processed
    browser.quit()
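If the page is rendered asynchronously, the "wait a bit" comment above can be turned into an explicit wait instead of a fixed sleep. A minimal sketch, assuming the same XPath and an arbitrary 10-second timeout:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# block for up to 10 seconds until the target element appears in the DOM
wait = WebDriverWait(browser, 10)
main_div = wait.until(
    EC.presence_of_element_located((By.XPATH, '//*[@id="react-root"]/section/main/div'))
)
text = main_div.text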

How can I scrape data that does not appear in the page source?

scrape.py
# code to scrape the links from the html
from bs4 import BeautifulSoup
import urllib.request
data = open('scrapeFile','r')
html = data.read()
data.close()
soup = BeautifulSoup(html,features="html.parser")
# code to extract links
links = []
for div in soup.find_all('div', {'class':'main-bar z-depth-1'}):
    # print(div.a.get('href'))
    links.append('https://godamwale.com' + str(div.a.get('href')))
print(links)
file = open("links.txt", "w")
for link in links:
    file.write(link + '\n')
    print(link)
I have successfully got the list of links using this code. But when I try to scrape the data from those links, their HTML pages do not contain the source code that holds the data, which makes extracting it difficult. I have tried the Selenium driver, but it doesn't work well for me.
I want to scrape the data from the link below, which contains data in HTML sections covering customer details, licence and automation, commercial details, floor-wise details, and operational details. I want to extract these along with the name, location, contact number, and type.
https://godamwale.com/list/result/591359c0d6b269eecc1d8933
That is the link. If someone finds a solution, please share it with me.
Using the developer tools in your browser, you'll notice that whenever you visit that link there is a request to https://godamwale.com/public/warehouse/591359c0d6b269eecc1d8933 that returns a JSON response, probably containing the data you're looking for.
Python 2.x:
import urllib2, json
contents = json.loads(urllib2.urlopen("https://godamwale.com/public/warehouse/591359c0d6b269eecc1d8933").read())
print contents
Python 3.x:
import urllib.request, json
contents = json.loads(urllib.request.urlopen("https://godamwale.com/public/warehouse/591359c0d6b269eecc1d8933").read().decode('UTF-8'))
print(contents)
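If you prefer the requests library over urllib, an equivalent sketch (same endpoint as above) would be:

import requests

contents = requests.get("https://godamwale.com/public/warehouse/591359c0d6b269eecc1d8933").json()
print(contents)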
Here you go. The main problem with the site seems to be that it takes time to load, which is why it was returning an incomplete page source. You have to wait until the page loads completely; notice the time.sleep(8) line in the code below:
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
import time

CHROMEDRIVER_PATH = r"C:\Users\XYZ\Downloads\Chromedriver.exe"
wd = webdriver.Chrome(CHROMEDRIVER_PATH)
responce = wd.get("https://godamwale.com/list/result/591359c0d6b269eecc1d8933")
time.sleep(8)  # wait until the page loads completely
soup = BeautifulSoup(wd.page_source, 'lxml')
props_list = []
propvalues_list = []
div = soup.find_all('div', {'class':'row'})
for childtags in div[6].findChildren('div', {'class':'col s12 m4 info-col'}):
    props = childtags.find("span").contents
    props_list.append(props)
    propvalue = childtags.find("p", recursive=True).contents
    propvalues_list.append(propvalue)
print(props_list)
print(propvalues_list)
Note: the code will return the construction details in 2 separate lists.
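If a single mapping is easier to work with, the two lists can be combined afterwards. A small sketch, assuming the lists line up one-to-one (the details name is my own):

# .contents returns a list of nodes, so join them into plain strings before pairing
details = {
    ''.join(map(str, prop)).strip(): ''.join(map(str, value)).strip()
    for prop, value in zip(props_list, propvalues_list)
}
print(details)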

Scraping the stackoverflow user data

import requests
from bs4 import BeautifulSoup
import csv
response = requests.get('https://stackoverflow.com/users?page=3&tab=reputation&filter=week').text
soup = BeautifulSoup(response, 'lxml')
for items in soup.select('.user-details'):
    name = items.select("a")[0].text
    location = items.select(".user-location")[0].text
    reputation = items.select(".reputation-score")[0].text
    print(name, location, reputation)
    with open('stackdata.csv', 'a', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow([name, location, reputation])
When we change the URL in this code, the output remains the same.
I came across a similar problem. The solution that works for me is using Selenium. Though I used a headless browser, i.e. PhantomJS, I assume it should work for other browsers too.
from selenium import webdriver

driver = webdriver.PhantomJS('/home/practice/selenium/webdriver/phantomjs/bin/phantomjs')
users = []
page_num = 1
driver.get('https://stackoverflow.com/users?page={page_num}&tab=reputation&filter=week'.format(page_num=page_num))
content = driver.find_element_by_id('content')
for details in content.find_elements_by_class_name('user-details'):
    users.append(details.text)
print(users)
Change the page_num to get the desired result.
Hope this will help!
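For example, a small sketch that loops over a few pages and appends each user's details to a CSV, roughly mirroring the original script (the page range and the stackdata.csv filename are my own choices):

import csv

with open('stackdata.csv', 'a', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for page_num in range(1, 4):  # pages 1 to 3; adjust as needed
        driver.get('https://stackoverflow.com/users?page={}&tab=reputation&filter=week'.format(page_num))
        content = driver.find_element_by_id('content')
        for details in content.find_elements_by_class_name('user-details'):
            writer.writerow([details.text])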

Unable to download a csv file from the URL with python script

I am accessing a URL with a username and password to download a CSV and save the file with today's date and time; there is only one download link on the page.
Is there any way I can achieve this task with Python?
I am using the script below and I can see the print output, but how can I download the file behind the "download csv" button on the web page? Normally when I click the "download csv" button it asks me to save the file.
import re
import requests
from bs4 import BeautifulSoup

url = 'https://url.com'
login_data = dict(login='user@example.com', password='password-g')
session = requests.session()
link = 'https://url.com'
r = requests.get(link)
soup = BeautifulSoup(r.text, "html.parser")
for i in soup.find_all('a', {'class': "app-btn-down"}):
    print(re.search('http://.*\b_file', i.get('href')).group(0))  # the CSV file name is b_file
print("r.text")
As I'm new to Python, please forgive my bad explanation.
This is mostly pseudocode since I don't know the HTML data, but I think you'll get the idea.
First you have to submit your data to get the necessary cookies in your session (you can check the cookies with s.cookies). Keep in mind that there may be more fields that you have to submit other than login and password. Use this session for all your requests.
Then you can get the csv link with bs4 assuming it's not generated by js, otherwise you may have to use selenium.
import requests
from bs4 import BeautifulSoup
from time import gmtime, strftime
import os
s = requests.session()
url = 'https://url.com'
login_data = dict(login='user@example.com', password='password-g')
s.post(url, data=login_data)
link = 'https://url.com'
r = s.get(link)
soup = BeautifulSoup(r.text, "html.parser")
csv_link = soup.find('a', {'class':'app-btn-down', 'href':lambda h:'b_file' in h})['href']
csv_file = s.get(csv_link).text
Finally you can get the date and time with gmtime, and use strftime to format it.
date_time = strftime("%Y-%m-%d_%H-%M-%S", gmtime())
path = os.path.join('/some/dir', date_time)
with open(path, 'w') as f:
    f.write(csv_file)
