Store scrape results and search in results with Python and Pandas? - python-3.x

As part of my Ph.D. research, I am scraping numerous webpages and searching for keywords within the scrape results.
This is how I do it so far:
import pandas as pd
import requests

# load data as a pandas DataFrame with column df.url
df = pd.read_excel('sample.xls', header=0)

# define keyword search function
def contains_keywords(link, keywords):
    try:
        output = requests.get(link).text
        return int(any(x in output for x in keywords))
    except:
        return "Wrong/Missing URL"

# define the relevant keywords
mykeywords = ('for', 'bar')

# store search results in new column 'results'
df['results'] = df.url.apply(lambda l: contains_keywords(l, mykeywords))
This works just fine. I only have one problem: the list of relevant keywords mykeywords changes frequently, whilst the webpages stay the same. Running the code takes a long time, since I request the same pages over and over.
I have two questions:
(1) Is there a way to store the results of requests.get(link).text?
(2) And if so, how do I search within the saved file(s), producing the same result as with the current script?
As always, thank you for your time and help! /R

You can download the content of the urls and save them in separate files in a directory (eg: 'links')
import os
import requests

def get_link(url):
    file_name = os.path.join('/path/to/links', url.replace('/', '_').replace(':', '_'))
    try:
        r = requests.get(url)
    except Exception as e:
        print("Failed to get " + url)
    else:
        with open(file_name, 'w') as f:
            f.write(r.text)
Then modify the contains_keywords function to read local files, so you won't have to use requests every time you run the script.
def contains_keywords(link, keywords):
    file_name = os.path.join('/path/to/links', link.replace('/', '_').replace(':', '_'))
    try:
        with open(file_name) as f:
            output = f.read()
        return int(any(x in output for x in keywords))
    except Exception as e:
        print("Can't access file: {}\n{}".format(file_name, e))
        return "Wrong/Missing URL"
Edit: I just added a try/except block in get_link and used an absolute path for file_name.
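With that in place, the download step only has to run once; afterwards the keyword search works entirely on the cached files. A minimal sketch of the workflow, assuming the df, get_link and file-based contains_keywords defined above:

# one-off: download and cache every page locally
df.url.apply(get_link)

# repeatable: re-run whenever mykeywords changes, no HTTP requests involved
df['results'] = df.url.apply(lambda l: contains_keywords(l, mykeywords))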

Related

save a dictionary to a .db file

For the past couple of hours, I've been trying to find a solution to this issue. Any knowledge share is very helpful.
The objective is to save the dictionary created by the program. I am getting an error in line 3.
def save_dict(dictionary_to_be_saved):
    with shelve.open('OperationNamesDictionary.db', 'c') as s:  # create OperationNamesDictionary.db
        s = dictionary_to_be_saved
What am I planning to achieve? I want to pass a dictionary to this function and have it create a (****.db) file, which I can use later.
Thanks in advance
Code used to save a dictionary:
def save_dict(dictionary_to_be_saved):
    with shelve.open('OperationNamesDictionary.db', 'c') as s:  # "c" flag used to create dictionary
        s = dictionary_to_be_saved
Code used to retrieve a dictionary from created file:
def load_dict():
    try:
        with shelve.open('TOperationNamesDictionary.db', 'r') as s:
            operation_names = s
            return operation_names
    except dbm.error:
        print("Could not find a saved dictionary, using a pre-coded one.")
operation_names = load_dict()
print(operation_names)
Output: <shelve.DbfilenameShelf object at 0x0000021970AD89A0>
Expected output: the data inside operation_names (a dictionary)
I think this is what you are after, more or less. This is untested code so I hope it works! What I have added to your attempts in the question is to provide a key value for the items you are shelving (I used the uninteresting identifier "mydictionary" as my key). So we put items in by that name, and take them out again by that same name.
def save_dict(dictionary_to_be_saved):
    with shelve.open('OperationNamesDictionary.db', 'c') as s:
        s['mydictionary'] = dictionary_to_be_saved

def load_dict():
    try:
        with shelve.open('OperationNamesDictionary.db', 'r') as s:
            return s['mydictionary']
    except KeyError:
        print("Could not find a saved dictionary")
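A quick round-trip check (an untested sketch, assuming both functions open the same shelf file as above):

operation_names = {"Asd-013we": "No Influence", "rdfhyZ1": "TRM"}
save_dict(operation_names)           # stored under the key 'mydictionary'
restored = load_dict()               # read back from the same shelf
print(restored == operation_names)   # expected: True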
For my specific case, creating a (.npy) file worked for me.
Again, the objective of my code is to use a dictionary that is available (the type of file that stores this dictionary doesn't matter) and at the end of the program save the updated dictionary to the same file.
import numpy as np

try:
    operation_names = np.load("Operation_Dictionary.npy", allow_pickle = 'TRUE').item()
    print("Using the Dictionary which is found in Path")
except FileNotFoundError:
    print("Using Pre-coded Dictionary from the code")
    operation_names = {"Asd-013we":"No Influence", "rdfhyZ1":"TRM"}  # default dictionary

.....
.....
# the program will update the dictionary

np.save("Operation_Dictionary.npy", operation_names)  # the updated dictionary is saved to the same file

List is empty when appending when using recursion

I have two functions. The first one gets a list of paths to text files, and the second one iterates over that list and checks whether each file includes the word password. Because of the try/except statement in the second function, I had to use recursion to keep it running; if there's another way, please suggest it below. My problem is that the list returned by the second function is empty. Why, and how do I fix it?
import os

def search_txt():
    """Function to search the C:\\ for .txt files -> then add them (including full path to file) to a list."""
    list_of_txt = []
    for dir_path, sub_dir, files in os.walk("C:\\"):
        # Method 1 -> checks the end of the file name (could be used for specific extensions)
        for file in files:
            if file.endswith(".txt"):
                list_of_txt.append(os.path.join(dir_path, file))
    return list_of_txt
def search_pass_file(list_of_files: list):
    """Function to iterate over each text file, searching if the word "password" is included -> Returns the text
    file's path """
    list_of_pass = []
    if len(list_of_files) != 0:
        for i in range(len(list_of_files)):
            file = list_of_files.pop()
            try:
                with open(file, encoding="utf8") as f:
                    for line in f.readlines():
                        if "password" in line:
                            list_of_pass.append(file)
            except UnicodeDecodeError:
                return search_pass_file(list_of_files)
            except PermissionError:
                return search_pass_file(list_of_files)
    else:
        return list_of_pass
if __name__ == '__main__':
    myList = search_txt()
    print(search_pass_file(myList))
You're returning list_of_pass only if len(list_of_files) == 0 (it's in the else block). Your return statement should occur after the loop (which should be a while loop, by the way).
You can catch several errors in one line by putting them in parentheses, e.g. except (UnicodeDecodeError, PermissionError), or catch all exceptions (for instance, you're not handling FileNotFoundError).
I'd reduce your function to:
def search_pass_file(list_of_files: list):
    """Function to iterate over each text file, searching if the word "password" is included -> Returns the text
    file's path """
    list_of_pass = []
    while list_of_files:
        file = list_of_files.pop()
        try:
            with open(file, encoding="utf8") as f:
                for line in f.readlines():
                    if "password" in line:
                        list_of_pass.append(file)
                        break
        except Exception:
            list_of_pass += search_pass_file(list_of_files)
    return list_of_pass
Edit: also in your except block, you should append the returned value of the recursive function to list_of_pass otherwise you'll lose the files found after the error occurs.
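If you want to keep the narrower handling mentioned above instead of a blanket except Exception:, the tuple form fits straight into the same sketch, for example:

except (UnicodeDecodeError, PermissionError, FileNotFoundError):
    list_of_pass += search_pass_file(list_of_files)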

Using Map in Open Binary

I'm trying to use map() in my script, which reads files and converts them into binary form.
I can't get the below to work; any help?
import binascii

def binary_file_reader(file_data):
    with open(file_data, 'rb') as binary_file_data:
        binary_file_data = binary_file_data.read()
        print(binary_file_data)
        binary_data = binascii.hexlify(binary_file_data)
        binary_data = binary_data.decode("utf-8")
        return binary_data
Then the main which calls the above
import os

all_file_names = []

if __name__ == "__main__":
    device_directory = os.getcwd()
    for r, d, f in os.walk(device_directory):
        for file in f:
            file_data = os.path.join(r, file)
            all_file_names.append(file_data)
            try:
                binary_data = map(binary_file_reader, all_file_names)
                print(binary_data)
            except IOError:
                print("cannot read")
Because map applies binary_file_reader to every element inside file_data, it doesn't do what you think it does.
In your case, file_data is your actual file path as a str, e.g. /tmp/a.txt. If you use map on a str, the function is applied to every character, so what you do expands to:
binary_file_reader('/')
binary_file_reader('t')
binary_file_reader('m')
binary_file_reader('p')
binary_file_reader('/')
binary_file_reader('a')
binary_file_reader('.')
binary_file_reader('t')
binary_file_reader('x')
binary_file_reader('t')
binary_file_reader(file_data) should produce the desired result.
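If you do want map, pass it the list of paths instead, and remember that in Python 3 map returns a lazy iterator, so materialize it if you want the values up front. A small sketch, assuming all_file_names is a list of full file paths:

binary_data = list(map(binary_file_reader, all_file_names))  # one hex string per file
for item in binary_data:
    print(item)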

Web Scraping Python fails to load the url on button.click()

The CSV file contains the names of the countries used. However, after Argentina, it fails to recover the URL and returns an empty string.
country,country_url
Afghanistan,https://openaq.org/#/locations?parameters=pm25&countries=AF&_k=tomib2
Algeria,https://openaq.org/#/locations?parameters=pm25&countries=DZ&_k=dcc8ra
Andorra,https://openaq.org/#/locations?parameters=pm25&countries=AD&_k=crspt2
Antigua and Barbuda,https://openaq.org/#/locations?parameters=pm25&countries=AG&_k=l5x5he
Argentina,https://openaq.org/#/locations?parameters=pm25&countries=AR&_k=962zxt
Australia,
Austria,
Bahrain,
Bangladesh,
The country.csv looks like this:
Afghanistan,Algeria,Andorra,Antigua and Barbuda,Argentina,Australia,Austria,Bahrain,Bangladesh,Belgium,Bermuda,Bosnia and Herzegovina,Brazil,
The code used is:
driver = webdriver.Chrome(options=options, executable_path=driver_path)
url = 'https://openaq.org/#/locations?parameters=pm25&_k=ggmrvm'
driver.get(url)
time.sleep(2)

# This opens the .csv file that we created at the first stage
# The .csv file includes names of countries
with open('1Countries.csv', newline='') as f:
    reader = csv.reader(f)
    list_of_countries = list(reader)
list_of_countries = list_of_countries[0]
print(list_of_countries)  # printing a list of countries

# Let's create a DataFrame of country & country_url
df = pd.DataFrame(columns=['country', 'country_url'])

# With this loop we are generating urls for each country page
for country in list_of_countries[:92]:
    try:
        path = ('//span[contains(text(),' + '\"' + country + '\"' + ')]')
        # "path" is used to filter each country on the website by
        # iterating country names.
        next_button = driver.find_element_by_xpath(path)
        next_button.click()
        # Using "button.click" we get to the page of the next country
        time.sleep(2)
        country_url = driver.current_url
        # "country_url" is used to get the url of the current page
        next_button.click()
    except:
        country_url = None
    d = [{'country': country, 'country_url': country_url}]
    df = df.append(d)
I've tried increasing the sleep time, but I'm not sure what is leading to this.
The challenge you face is that the country list is scrollable, and it is a bit convenient that your code stops working exactly when the remaining countries are not displayed.
It's a relatively easy solution: you need to scroll the element into view. I made a quick test with your code to confirm it works. I removed the CSV part, hard-coded a country that's further down the list, and added the parts that make it scroll into view:
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import time

def ScrollIntoView(element):
    actions = ActionChains(driver)
    actions.move_to_element(element).perform()

url = 'https://openaq.org/#/locations?parameters=pm25&_k=ggmrvm'
driver = webdriver.Chrome()
driver.get(url)
driver.implicitly_wait(10)

country = 'Bermuda'
path = ('//span[contains(text(),' + '\"' + country + '\"' + ')]')
next_button = driver.find_element_by_xpath(path)
ScrollIntoView(next_button)  # added this
next_button.click()
time.sleep(2)
country_url = driver.current_url
print(country_url)  # added this
next_button.click()
This is the output from the print:
https://openaq.org/#/locations?parameters=pm25&countries=BM&_k=7sp499
Are you happy to merge that into your solution? (Just say if you need more support.)
If it helps, a reason you didn't notice this yourself is that the bare try was masking a NotInteractableException. Have a look at how to handle errors here.
try statements are great and useful, but it's also good to track when errors occur so you can fix them later. Borrowing some code from that link, you can try something like this in your except block:
except:
    print("Unexpected error:", sys.exc_info()[0])  # needs: import sys
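As an alternative to ActionChains, a common pattern (a sketch, reusing the next_button element from above) is to let the browser do the scrolling via JavaScript:

driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
next_button.click()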

Saving selenium results/output at run time in text file using Python

I am running a script in Python 3 using Selenium, and I am getting the output I expected. Now I want to save that output to a text, csv or json file. When I try to run my script and save the result to a file, I get an error at with open('bangkok_vendor.txt','a') as wt:
TypeError: 'NoneType' object is not callable
This means the loop in the program runs only once and does not store data in the file called bangkok_vendor.txt. Normal Python scraper programs wouldn't have any problem storing data, but this is the first time I am using Selenium. Can you please help me with a solution? Thanks.
I am trying to run this script from my terminal, and the output is what I want to save to a file (any format):
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import csv
import requests

contents = []
filename = 'link_business_filter.csv'

def copy_json():
    with open("bangkok_vendor.text", 'w') as wt:
        for x in script2:
            wt.writer(x)
        wt.close()

with open(filename, 'rt') as f:
    data = csv.reader(f)
    for row in data:
        links = row[0]
        contents.append(links)

for link in contents:
    url_html = requests.get(link)
    print(link)
    browser = webdriver.Chrome('chromedriver')
    open = browser.get(link)
    source = browser.page_source
    data = bs(source, "html.parser")
    body = data.find('body')
    script = body
    x_path = '//*[#id="react-root"]/section/main/div'
    script2 = browser.find_element_by_xpath(x_path)
    script3 = script2.text
    #script2.send_keys(keys.COMMAND + 't')
    browser.close()
    print(script3)
You need to pass script2 as a parameter to the copy_json function and call it when you extract the data from the page.
Change the write mode to append, otherwise the file will be reset every time you call copy_json.
Don't overwrite built-in functions like open, otherwise you won't be able to open a file to write data once you move on to the second iteration.
I refactored your code a bit:
import csv
from selenium import webdriver

LINK_CSV = 'link_business_filter.csv'
SAVE_PATH = 'bangkok_vendor.txt'

def read_links():
    links = []
    with open(LINK_CSV) as f:
        reader = csv.reader(f)
        for row in reader:
            links.append(row[0])
    return links

def write_data(data):
    with open(SAVE_PATH, mode='a') as f:
        f.write(data + "\n")

if __name__ == '__main__':
    browser = webdriver.Chrome('chromedriver')
    links = read_links()
    for link in links:
        browser.get(link)
        # You may have to wait a bit here
        # until the page is loaded completely
        html = browser.page_source
        # Not sure what you're trying to do with body
        # soup = BeautifulSoup(html, "html.parser")
        # body = soup.find('body')
        x_path = '//*[#id="react-root"]/section/main/div'
        main_div = browser.find_element_by_xpath(x_path)
        text = main_div.text
        write_data(text)
    # close browser after every link is processed
    browser.quit()
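Since you also mentioned csv/json as target formats, a minimal variant of write_data that stores one JSON record per link could look like this (a sketch; the file name bangkok_vendor.jsonl and the record layout are only illustrative). You would call it as write_data_json(link, text) inside the loop above:

import json

def write_data_json(link, text):
    record = {'url': link, 'content': text}
    with open('bangkok_vendor.jsonl', mode='a') as f:
        f.write(json.dumps(record) + '\n')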
