I'm trying to extract emails from web pages. Here is my email grabber function:
import re
import requests
import bs4 as bs

def emlgrb(x):
    email_set = set()
    for url in x:
        try:
            response = requests.get(url)
            soup = bs.BeautifulSoup(response.text, "lxml")
            emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", soup.text, re.I))
            email_set.update(emails)
        except (requests.exceptions.MissingSchema, requests.exceptions.ConnectionError):
            continue
    return email_set
This function should be fed by another function that creates a list of URLs. The feeder function:
def handle_local_links(url, link):
    if link.startswith("/"):
        return "".join([url, link])
    return link
def get_links(url):
    try:
        response = requests.get(url, timeout=5)
        soup = bs.BeautifulSoup(response.text, "lxml")
        body = soup.body
        links = [link.get("href") for link in body.find_all("a")]
        links = [handle_local_links(url, link) for link in links]
        links = [str(link.encode("ascii")) for link in links]
        return links
It continues with many except clauses which, if triggered, return an empty list (not important). However, the return value from get_links() looks like this:
["b'https://pythonprogramming.net/parsememcparseface//'"]
Of course, there are many more links in the list (I cannot post it due to reputation). The emlgrb() function is not able to process the list (InvalidSchema: No connection adapters were found). However, if I manually remove the b and the redundant quotes, so the list looks like this:
['https://pythonprogramming.net/parsememcparseface//']
emlgrb() works. Any suggestion on where the problem is, or how to create a "cleaning function" to get the second list from the first, is welcome.
Thanks
The solution is to drop .encode('ascii')
def get_links(url):
    try:
        response = requests.get(url, timeout=5)
        soup = bs.BeautifulSoup(response.text, "lxml")
        body = soup.body
        links = [link.get("href") for link in body.find_all("a")]
        links = [handle_local_links(url, link) for link in links]
        links = [str(link) for link in links]
        return links
You can also pass an encoding to str(), as in this signature from the docs: str(object=b'', encoding='utf-8', errors='strict').
That's because str() without an encoding calls .__repr__() or .__str__() on the object, so if it is bytes, the output is "b'string'". That's also what gets printed when you do print(bytes_obj). And calling .encode() on a str object creates a bytes object!
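For illustration, here is a minimal, self-contained snippet (plain Python, no scraping involved) showing the difference between wrapping bytes in str() and actually decoding them:

link = "https://pythonprogramming.net/parsememcparseface//"
as_bytes = link.encode("ascii")          # str -> bytes

print(str(as_bytes))                     # b'https://...'  (repr of the bytes object)
print(str(as_bytes, encoding="utf-8"))   # https://...     (decoded back to str)
print(as_bytes.decode("ascii"))          # https://...     (equivalent decoding)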
Related
Recently I was working with Python Beautiful Soup to extract some data and put it into a pandas DataFrame.
I used Beautiful Soup to extract some of the hotel data from the website booking.com.
I was able to extract some of the attributes correctly, without any empty values.
Here is my code snippet:
def get_Hotel_Facilities(soup):
    try:
        title = soup.find_all("div", attrs={"class":"db29ecfbe2 c21a2f2d97 fe87d598e8"})
        new_list = []
        # Inner NavigatableString Object
        for i in range(len(title)):
            new_list.append(title[i].text.strip())
    except AttributeError:
        new_list = ""
    return new_list
The above code is my function to retrieve the facilities of a hotel and return the facilities as a list.
page_no = 0
d = {"Hotel_Name":[], "Hotel_Rating":[], "Room_type":[], "Room_price":[], "Room_sqft":[], "Facilities":[], "Location":[]}

while (page_no <= 25):
    URL = f"https://www.booking.com/searchresults.html?aid=304142&label=gen173rf-1FCAEoggI46AdIM1gDaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGiAg1wcm9qZWN0cHJvLmlvqAIDuAKwwPadBsACAdICJDU0NThkNDAzLTM1OTMtNDRmOC1iZWQ0LTdhOTNjOTJmOWJlONgCBeACAQ&sid=2214b1422694e7b065e28995af4e22d9&sb=1&sb_lp=1&src=theme_landing_index&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Fhotel%2Findex.html%3Faid%3D304142%26label%3Dgen173rf1FCAEoggI46AdIM1gDaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGiAg1wcm9qZWN0cHJvLmlvqAIDuAKwwPadBsACAdICJDU0NThkNDAzLTM1OTMtNDRmOC1iZWQ0LTdhOTNjOTJmOWJlONgCBeACAQ%26sid%3D2214b1422694e7b065e28995af4e22d9%26&ss=goa&is_ski_area=0&checkin_year=2023&checkin_month=1&checkin_monthday=13&checkout_year=2023&checkout_month=1&checkout_monthday=14&group_adults=2&group_children=0&no_rooms=1&b_h4u_keep_filters=&from_sf=1&offset{page_no}"
    new_webpage = requests.get(URL, headers=HEADERS)
    soup = BeautifulSoup(new_webpage.content, "html.parser")
    links = soup.find_all("a", attrs={"class":"e13098a59f"})

    for link in links:
        new_webpage = requests.get(link.get('href'), headers=HEADERS)
        new_soup = BeautifulSoup(new_webpage.content, "html.parser")
        d["Hotel_Name"].append(get_Hotel_Name(new_soup))
        d["Hotel_Rating"].append(get_Hotel_Rating(new_soup))
        d["Room_type"].append(get_Room_type(new_soup))
        d["Room_price"].append(get_Price(new_soup))
        d["Room_sqft"].append(get_Room_Sqft(new_soup))
        d["Facilities"].append(get_Hotel_Facilities(new_soup))
        d["Location"].append(get_Hotel_Location(new_soup))

    page_no += 25
The above code is the main loop: the while loop traverses the result pages and retrieves the URLs of the hotel pages. After retrieving them, it visits every page to extract the corresponding attributes.
I was able to retrieve the rest of the attributes correctly, but not the facilities: only some of the room facilities are being returned and some are not.
Here is my output after making it into a pandas DataFrame:
(image of the Facilities column output)
Please help me understand why some facilities are returned and some are not.
P.S.: The facilities are present on the website.
I have tried using all the corresponding classes and attributes for retrieval, but I am not getting the facilities column properly.
Probably as a protective measure, the HTML fetched by the requests doesn't seem to be consistent in its layout or even its contents.
There might be more possible selectors, but try:
def get_Hotel_Facilities(soup):
    selectors = ['div[data-testid="property-highlights"]', '#facilities',
                 '.hp-description~div div.important_facility']
    new_list = []
    for sel in selectors:
        for sect in soup.select(sel):
            new_list += list(sect.stripped_strings)
    return list(set(new_list))  # set <--> unique
But even with this, the results are inconsistent. E.g.: I tested on this page with
for i in range(10):
    soup = BeautifulSoup(cloudscraper.create_scraper().get(url).content)
    fl = get_Hotel_Facilities(soup) if soup else []
    print(f'[{i}] {len(fl)} facilities: {", ".join(fl)}')
(But the inconsistencies might be due to using cloudscraper - maybe you'll get better results with your headers?)
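If the variation really comes from the server returning different page variants, one possible workaround is to fetch the page a few times and keep the richest result. This is only a sketch: it reuses get_Hotel_Facilities from above, and the headers argument stands in for your existing HEADERS dict.

import requests
from bs4 import BeautifulSoup

def get_facilities_with_retries(url, headers, attempts=3):
    # Fetch the page several times and keep the longest facilities list seen.
    best = []
    for _ in range(attempts):
        resp = requests.get(url, headers=headers)
        soup = BeautifulSoup(resp.content, "html.parser")
        facilities = get_Hotel_Facilities(soup)  # helper defined above
        if len(facilities) > len(best):
            best = facilities
    return best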
I am trying to download reports from a company's website, https://www.investorab.com/investors-media/reports-presentations/. In the end, I would like to download all the available reports.
I have next to no experience in web scraping, so I have some trouble defining the correct search pattern. Previously I have needed to extract all links containing PDFs, i.e. I could use soup.select('div[id="id-name"] a[data-type="PDF"]'). But for this website, no data type is listed for the links. How do I select all links under "Report and presentations"? Here is what I have tried, but it returns an empty list:
from bs4 import BeautifulSoup
import requests
url = "https://www.investorab.com/investors-media/reports-presentations/"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')
# Select all reports, publication_dates
reports = soup.select('div[class="html not-front not-logged-in no-sidebars page-events-archive i18n-en"] a[href]')
pub_dates = soup.select('div[class="html not-front not-logged-in no-sidebars page-events-archive i18n-en"] div[class="field-content"]')
I would also like to select all publication dates, but that also ends up with an empty list. Any help in the right direction is appreciated.
What you'll need to do is iterate through the pages, or what I did was just iterate through the year parameter. Once you get the list for the year, get the link of each report, then within each link, find the pdf link. You'll then use that pdf link to write to file:
from bs4 import BeautifulSoup
import requests
import os

# Gets all the links
linkList = []
url = 'https://vp053.alertir.com/v3/en/events-archive?'
for year in range(1917, 2021):
    query = 'type%5B%5D=report&type%5B%5D=annual_report&type%5B%5D=cmd&type%5B%5D=misc&year%5Bvalue%5D%5Byear%5D=' + str(year)
    response = requests.get(url + query)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a', href=True)
    linkList += [link['href'] for link in links if 'v3' in link['href']]
    print('Gathered links for year %s.' % year)

# Go to each link and get the pdfs within them
print('Downloading PDFs...')
for link in linkList:
    url = 'https://vp053.alertir.com' + link
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    for pdflink in soup.select("a[href$='.pdf']"):
        folder_location = 'C:/test/pdfDownloads/'
        if not os.path.exists(folder_location):
            os.mkdir(folder_location)
        try:
            filename = os.path.join(folder_location, pdflink['href'].split('/')[-1])
            with open(filename, 'wb') as f:
                f.write(requests.get('https://vp053.alertir.com' + pdflink['href']).content)
            print('Saved: %s' % pdflink['href'].split('/')[-1])
        except Exception as ex:
            print('%s not saved. %s' % (pdflink['href'], ex))
I am scraping dictionary data from the https://www.dictionary.com/ website. The purpose is to remove the unwanted elements from the dictionary pages and save them offline for further processing. Because the web pages are somewhat unstructured, the elements that the code below tries to remove may or may not be present; the absence of an element raises an exception (in snippet 2). And since in the actual code there are many elements to be removed, and each may be present or absent, wrapping every such statement in try-except would increase the lines of code drastically.
Thus I am working on a workaround by creating a separate function for the try-except (in snippet 3), an idea I got from here. But I am unable to get the code in snippet 3 working, as a command such as soup.find_all('style') is returning None, whereas it should return the list of all the style tags as in snippet 2. I cannot apply the referred solution directly, as sometimes I have to reach the intended element to remove indirectly by referring to its parent or sibling, such as in soup.find('h2',{'class':'css-1iltn77 e17deyx90'}).parent.
Snippet 1 is used to set the environment for code execution.
It would be great if you could provide some suggestions to get snippet 3 working.
Snippet 1 (Setting the environment for executing code):
import urllib.request
import requests
from bs4 import BeautifulSoup
import re
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',}
folder = "dictionary_com"
Snippet 2 (working):
def makedefinition(url):
    success = False
    while success == False:
        try:
            request = urllib.request.Request(url, headers=headers)
            final_url = urllib.request.urlopen(request, timeout=5).geturl()
            r = requests.get(final_url, headers=headers, timeout=5)
            success = True
        except:
            success = False

    soup = BeautifulSoup(r.text, 'lxml')
    soup = soup.find("section", {'class':'css-1f2po4u e1hj943x0'})

    # there are many more elements to remove. mentioned only 2 for shortness
    remove = soup.find_all("style") # style tags
    remove.extend(soup.find('h2',{'class':'css-1iltn77 e17deyx90'}).parent) # related content in the page

    for x in remove: x.decompose()

    return(soup)
# testing code on multiple urls
#url = "https://www.dictionary.com/browse/a"
#url = "https://www.dictionary.com/browse/a--christmas--carol"
#url = "https://www.dictionary.com/brdivowse/affection"
#url = "https://www.dictionary.com/browse/hot"
#url = "https://www.dictionary.com/browse/move--on"
url = "https://www.dictionary.com/browse/cuckold"
#url = "https://www.dictionary.com/browse/fear"
maggi = makedefinition(url)
with open(folder+"/demo.html", "w") as file:
    file.write(str(maggi))
Snippet 3 (not working):
soup = None

def safe_execute(command):
    global soup
    try:
        print(soup) # correct soup is printed
        print(exec(command)) # this should print the list of style tags but printing None, and for related content this should throw some exception
        return exec(command) # None is being returned for style
    except Exception:
        print(Exception.with_traceback())
        return []

def makedefinition(url):
    global soup
    success = False
    while success == False:
        try:
            request = urllib.request.Request(url, headers=headers)
            final_url = urllib.request.urlopen(request, timeout=5).geturl()
            r = requests.get(final_url, headers=headers, timeout=5)
            success = True
        except:
            success = False

    soup = BeautifulSoup(r.text, 'lxml')
    soup = soup.find("section", {'class':'css-1f2po4u e1hj943x0'})

    # there are many more elements to remove. mentioned only 2 for shortness
    remove = safe_execute("soup.find_all('style')") # style tags
    remove.extend(safe_execute("soup.find('h2',{'class':'css-1iltn77 e17deyx90'}).parent")) # related content in the page

    for x in remove: x.decompose()

    return(soup)
# testing code on multiple urls
#url = "https://www.dictionary.com/browse/a"
#url = "https://www.dictionary.com/browse/a--christmas--carol"
#url = "https://www.dictionary.com/brdivowse/affection"
#url = "https://www.dictionary.com/browse/hot"
#url = "https://www.dictionary.com/browse/move--on"
url = "https://www.dictionary.com/browse/cuckold"
#url = "https://www.dictionary.com/browse/fear"
maggi = makedefinition(url)
with open(folder+"/demo.html", "w") as file:
    file.write(str(maggi))
In your code in snippet 3 you use the exec built-in, which always returns None regardless of what it does with its argument. For details see this SO thread.
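A quick illustration of that behaviour (plain Python, unrelated to the scraper):

result = exec("1 + 1")
print(result)  # None -- exec always returns None, even though the code ran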
Remedy:
Use exec to modify a variable and return it instead of returning the output of exec itself.
def safe_execute(command):
    d = {}
    try:
        exec(command, d)
        return d['output']
    except Exception:
        print(Exception.with_traceback())
        return []
Then call it as something like this:
remove = safe_execute("output = soup.find_all('style')")
EDIT:
Upon execution of this code, None is returned again. Upon debugging, however, if we print(soup) inside the try section, a correct soup value is printed, but exec(command, d) gives NameError: name 'soup' is not defined.
This disparity has been overcome by using eval() instead of exec(). The function defined is:
def safe_execute(command):
    global soup
    try:
        output = eval(command)
        return(output)
    except Exception:
        return []
And the call looks like:
remove = safe_execute("soup.find_all('style')")
remove.extend(safe_execute("soup.find('h2',{'class':'css-1iltn77 e17deyx90'}).parent"))
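As an aside, a variant that avoids eval() entirely is to pass a zero-argument callable (e.g. a lambda) instead of a string. This is only a sketch of the same idea; the selectors are the ones from the thread, and this safe_execute replaces the string-based one above:

def safe_execute(action):
    # Run a zero-argument callable; on any exception return an empty list.
    try:
        result = action()
        return result if result is not None else []
    except Exception:
        return []

remove = safe_execute(lambda: soup.find_all('style'))
remove.extend(safe_execute(lambda: soup.find('h2', {'class': 'css-1iltn77 e17deyx90'}).parent))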
Suppose we have this Swarm URL, "https://www.swarmapp.com/c/dZxqzKerUMc". How can we get the URL behind the Apple Williamsburg hyperlink in the link above?
I tried to filter it out by HTML tags, but there are many tags and lots of foursquare.com links.
Below is a part of the source code of the link given above:
<h1><strong>Kristin Brooks</strong> at <a
href="https://foursquare.com/v/apple-williamsburg/57915fa838fab553338ff7cb"
target="_blank">Apple Williamsburg</a></h1>
The foursquare URL in the code is not always the same, so what is the best way to get that specific URL for every given Swarm URL?
I tried this:
import bs4
import requests

def get_4square_url(link):
    response = requests.get(link)
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    link = [a.attrs.get('href') for a in
            soup.select('a[href=https://foursquare.com/v/*]')]
    return link

print(get_4square_url('https://www.swarmapp.com/c/dZxqzKerUMc'))
I used https://foursquare.com/v/ as a pattern to get the desired URL:
import re
import urllib3

def get_4square_url(link):
    try:
        response = requests.get(link)
        soup = bs4.BeautifulSoup(response.text, "html.parser")
        for elem in soup.find_all('a',
                                  href=re.compile(r'https://foursquare\.com/v/')): # here is my pattern
            link = elem['href']
            return link
    except (requests.exceptions.HTTPError,
            requests.exceptions.ConnectionError, requests.exceptions.ConnectTimeout,
            urllib3.exceptions.MaxRetryError):
        pass
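For what it's worth, BeautifulSoup's CSS selectors also support a prefix match, so a regex-free variant of the same idea could look like this (just a sketch, assuming the page structure shown above):

def get_4square_url(link):
    response = requests.get(link)
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    # ^= matches href attributes that start with the given prefix
    match = soup.select_one('a[href^="https://foursquare.com/v/"]')
    return match['href'] if match else None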
I am scraping a forum page for posts and relevant links using BeautifulSoup.
The links I want on the page are of the form r"xx/res/[0-9]{5}.html$".
So far, so good finding them in my BeautifulSoup object, with the following link format returned when I print: /xx/res/83071.html.
I now want to prepend the domain name 'http://website.com' to each result, and use the full url as the basis for further scraping.
My successful code looks like this:
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = 'http://website.com/xx/index.html'
res = urlopen(url)
soup = BeautifulSoup(res, 'html.parser')
links = soup.select('a', {'href': re.compile(r"xx/res/[0-9]{5}.html$")})
for l in links:
    print(l['href'])
As an example, the following is printed to the console:
/xx/res/83071.html
/xx/res/81813.html
/xx/res/92014.html
/xx/res/92393.html
Hoping to get some help with the correct syntax to concatenate the prepended string to the output.
Thanks.
This will work for you:
url = 'http://website.com/xx/index.html'
res = urlopen(url)
soup = BeautifulSoup(res, 'html.parser')
links = soup.select('a', {'href': re.compile(r"xx/res/[0-9]{5}.html$")})
for l in links:
    print('http://website.com' + l['href'])
There are several ways to do it. I personally like the string .format() method.
Store the base URL:
xx = 'http://website.com'
Your print line would then be:
print('{}{}'.format(xx, l['href']))
where each {} gets replaced by .format() with the variables you feed in as parameters.
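A more general-purpose option (not from the answers above, just a suggestion) is urllib.parse.urljoin from the standard library, which handles leading slashes and relative paths for you:

from urllib.parse import urljoin

base = 'http://website.com/xx/index.html'
for l in links:
    print(urljoin(base, l['href']))  # e.g. http://website.com/xx/res/83071.html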