get PDFs in iframe id="swGoogleDrive" - python-3.x

How do I get the PDFs found in the iframe of this URL?
(1) The following code throws an error:
import requests, re
from bs4 import BeautifulSoup
url = r'https://www.d88a.org/domain/102'
headers = {'User-Agent': 'C19SchoolsWebscrape'}
s = requests.Session()
r = s.get(url, headers=headers)
soup = BeautifulSoup(r.content, "lxml")
iframe_src = soup.select_one("swGoogleDrive").attrs["src"]
r = s.get(f"https:{iframe_src}")
print(r)
error: 'NoneType' object has no attribute 'attrs'
(2) This also throws an error:
response = requests.get(url, headers=headers)
t = re.search(b'(?<=artist":")(.*?)(?=")', response.content).group(0).decode("utf-8")
print(t)
error: 'NoneType' object has no attribute 'group'
Earlier threads I've referenced:
Python BeautifulSoup - Scrape Web Content Inside Iframes,
extract iFrame content using BeautifulSoup

To get all links to the PDFs, you can use this example. (Your select_one returned None because "swGoogleDrive" selects a tag named swGoogleDrive; an id selector needs the # prefix, i.e. "#swGoogleDrive".)
import requests
from bs4 import BeautifulSoup

url = 'https://www.d88a.org/domain/102'

# parse the outer page, then fetch the iframe's src and parse that document instead
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
soup = BeautifulSoup(requests.get(soup.iframe['src']).content, 'html.parser')

# every link in the embedded Google Drive listing
for a in soup.select('a'):
    print(a['href'])
Prints:
https://drive.google.com/file/d/1bCXyoE7FWWI9RIcDWosHrohYQY7Ryb13/view?usp=drive_web
https://drive.google.com/file/d/1SlR-71M-jCMF-AO4ChdSbywolIF9yL1h/view?usp=drive_web
https://drive.google.com/file/d/1zbrt5Mnt0fZxjeD7DRYvfP6cskYKig27/view?usp=drive_web
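If you want the PDF files themselves rather than just the links, you can rewrite each /file/d/<id>/view URL into Google Drive's direct-download endpoint. A minimal sketch, assuming the files are public and small enough that Drive serves them without a confirmation page (uc?export=download is the usual endpoint, but Drive's behaviour can vary):
import re
import requests
from bs4 import BeautifulSoup

url = 'https://www.d88a.org/domain/102'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
soup = BeautifulSoup(requests.get(soup.iframe['src']).content, 'html.parser')

for a in soup.select('a'):
    # pull the file id out of .../file/d/<id>/view
    m = re.search(r'/file/d/([^/]+)', a['href'])
    if not m:
        continue
    pdf = requests.get(f'https://drive.google.com/uc?export=download&id={m.group(1)}')
    with open(f'{m.group(1)}.pdf', 'wb') as f:
        f.write(pdf.content)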

Related

I am learning BeautifulSoup, but I am getting an error

Here is my code:
import requests
from bs4 import BeautifulSoup

result = requests.get("https://www.whitehouse.gov/briefings-statements/")
src = result.content
soup = BeautifulSoup(src, 'lxml')

urls = []
for h2_tag in soup.find_all("h2"):
    a_tag = h2_tag.find('a')
    urls.append(a_tag.attrs not in ['href'])
print(urls)
Here is the error:
AttributeError: 'NoneType' object has no attribute 'attrs'
What is wrong with my code?
Sometimes h2_tag.find('a') will return None. You can fix this problem by using a try/except:
import requests
from bs4 import BeautifulSoup

result = requests.get("https://www.whitehouse.gov/briefings-statements/")
src = result.content
soup = BeautifulSoup(src, 'lxml')

urls = []
for h2_tag in soup.find_all("h2"):
    try:
        a_tag = h2_tag.find('a')
        urls.append(a_tag.attrs["href"])
    except AttributeError:
        continue
print(urls)
My preference for cleaner code is to put the restriction into the selection of nodes, rather than test later. In your case, you can do this with a CSS selector that retrieves only h2 elements that have an a child. Similar layout to yours:
import requests
from bs4 import BeautifulSoup

result = requests.get("https://www.whitehouse.gov/briefings-statements/")
src = result.content
soup = BeautifulSoup(src, 'lxml')

urls = []
for h2_tag in soup.select('h2:has(a)'):
    a_tag = h2_tag.find('a')
    urls.append(a_tag['href'])
print(urls)
However, we can be much more concise than the above:
urls = [i['href'] for i in soup.select('h2 > a')]
print(urls)
The above selects a elements that are direct children of h2.
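If the distinction matters for your page, 'h2 > a' matches only direct children, while 'h2 a' matches an a nested anywhere under the h2. A small self-contained demo:
from bs4 import BeautifulSoup

html = '<h2><a href="/direct">one</a></h2><h2><span><a href="/nested">two</a></span></h2>'
soup = BeautifulSoup(html, 'html.parser')

print([a['href'] for a in soup.select('h2 > a')])  # ['/direct']
print([a['href'] for a in soup.select('h2 a')])    # ['/direct', '/nested']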

BeautifulSoup returns None python selenium

from selenium import webdriver
from bs4 import BeautifulSoup
import time

################import the chrome web driver and define the location###############
driver = webdriver.Chrome(executable_path='C:/Users/../Downloads/cd79/chromedriver.exe')
###################################################################################
###########open the web page and print the title##############
page = driver.get("https://kjustin765.wixsite.com/website")
print(driver.title)
driver.maximize_window()
time.sleep(5)

while True:
    soup = BeautifulSoup(page.content, 'html.parser')
    button1 = soup.find('span', class_='pWNha').text
    if 'Yes' in button1:
        driver.refresh()
    else:
        button1.click()
Why is the page being returned as None?
Here is the error:
soup = BeautifulSoup(page.content, 'html.parser')
AttributeError: 'NoneType' object has no attribute 'content'
driver.get() navigates the browser but returns None, so page never holds anything useful; the rendered HTML lives on the driver itself. To get the correct data, use driver.page_source instead of page.content:
soup = BeautifulSoup(driver.page_source, 'html.parser')
The .content attribute comes from the requests library, if you use that to request the page. For example:
import requests
page = requests.get(my_url).content
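Note that even with that change, button1 is a plain string (you took .text from the tag), so button1.click() will also fail: BeautifulSoup only parses HTML and cannot interact with the page, so the click has to go through selenium. A minimal sketch of the corrected loop, assuming selenium 4 and that the pWNha class is stable:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://kjustin765.wixsite.com/website")
time.sleep(5)

while True:
    # parse the rendered page, not the return value of driver.get()
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    button_text = soup.find('span', class_='pWNha').text
    if 'Yes' in button_text:
        driver.refresh()
    else:
        # clicking must go through selenium, not BeautifulSoup
        driver.find_element(By.CSS_SELECTOR, 'span.pWNha').click()
    time.sleep(1)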

How to only retrieve the tag I specify using BeautifulSoup

I just want the written text out of this website: https://algorithms-tour.stitchfix.com/ so I can put it in a Word doc and read it.
When I run the code, I get all the HTML and the tags; at the very end I get what I want, but I just want to separate out the text.
import requests
from bs4 import BeautifulSoup
url = "https://algorithms-tour.stitchfix.com"
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, "html.parser")
item = soup.find_all("p")
print(item)
Is there a way to get just the content so I can clean it up some more?
You have a few options for this. If you only want text found within p tags, you can do this:
import requests
from bs4 import BeautifulSoup

url = "https://algorithms-tour.stitchfix.com"
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, "html.parser")

items = soup.find_all("p")
result = []
for item in items:
    result.append(item.string)
print(result)
Note that soup.find_all returns a list-like ResultSet, not a single object.
An alternative, easier method is to just use soup.get_text:
import requests
from bs4 import BeautifulSoup
url = "https://algorithms-tour.stitchfix.com"
response = requests.get(url)
html = response.text
soup = BeautifulSoup(html, "html.parser")
print(soup.get_text())
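One caveat with the first approach: tag.string returns None whenever a p contains nested markup (a link, an em, and so on), so some entries in result may be None. get_text() flattens the nested tags instead. A quick demo:
from bs4 import BeautifulSoup

html = '<p>plain text</p><p>text with a <a href="#">link</a> inside</p>'
soup = BeautifulSoup(html, 'html.parser')

print([p.string for p in soup.find_all('p')])      # ['plain text', None]
print([p.get_text() for p in soup.find_all('p')])  # ['plain text', 'text with a link inside']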

How to scrape from web all children of an attribute with one class?

I have tried to get the highlighted area (in the screenshot) of the website using BeautifulSoup4, but I cannot get what I want. Maybe you have a recommendation for doing it another way.
Screenshot of the website I need to get data from
from bs4 import BeautifulSoup
import requests
import pprint
import re
import pyperclip
import urllib
import csv
import html5lib

urls = ['https://e-mehkeme.gov.az/Public/Cases?page=1',
        'https://e-mehkeme.gov.az/Public/Cases?page=2'
        ]

# scrape elements
for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    content = soup.findAll("input", class_="casedetail filled")
    print(content)
My expected output is like this:
Ətraflı məlumat:
İşə baxan hakim və ya tərkib
Xəyalə Cəmilova - sədrlik edən hakim
İlham Kərimli - tərkib üzvü
İsmayıl Xəlilov - tərkib üzvü
Tərəflər
Cavabdeh: MAHMUDOV MAQSUD SOLTAN OĞLU
Cavabdeh: MAHMUDOV MAHMUD SOLTAN OĞLU
İddiaçı: QƏHRƏMANOVA AYNA NUĞAY QIZI
İşin mahiyyəti
Mənzil mübahisələri - Mənzildən çıxarılma
Using the base URL, first get all the case IDs, then pass each ID to the detail URL and get the text of the first td tag.
import requests
from bs4 import BeautifulSoup

urls = ['https://e-mehkeme.gov.az/Public/Cases?page=1',
        'https://e-mehkeme.gov.az/Public/Cases?page=2'
        ]
target_url = "https://e-mehkeme.gov.az/Public/CaseDetail?caseId={}"

for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    for caseid in soup.select('input.casedetail'):
        # print(caseid['value'])
        soup1 = BeautifulSoup(requests.get(target_url.format(caseid['value'])).content, 'html.parser')
        print(soup1.select_one("td").text)
I would write it this way, extracting the id that needs to be put in the GET request for the detailed info:
import requests
from bs4 import BeautifulSoup as bs

urls = ['https://e-mehkeme.gov.az/Public/Cases?page=1',
        'https://e-mehkeme.gov.az/Public/Cases?page=2']

def get_soup(url):
    r = s.get(url)
    soup = bs(r.content, 'lxml')
    return soup

with requests.Session() as s:
    for url in urls:
        soup = get_soup(url)
        detail_urls = [f'https://e-mehkeme.gov.az/Public/CaseDetail?caseId={i["value"]}' for i in soup.select('.caseId')]
        for next_url in detail_urls:
            soup = get_soup(next_url)
            data = [string for string in soup.select_one('[colspan="4"]').stripped_strings]
            print(data)
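The Session matters here because every case means an extra request to the detail page; a shared Session reuses the underlying connection instead of reconnecting each time. If you want the rows in a file rather than printed, the csv module (already imported in the question) drops in naturally. A sketch under the same assumptions as the answer above (the .caseId and [colspan="4"] selectors, and cases.csv as an arbitrary output name):
import csv
import requests
from bs4 import BeautifulSoup as bs

urls = ['https://e-mehkeme.gov.az/Public/Cases?page=1',
        'https://e-mehkeme.gov.az/Public/Cases?page=2']

with requests.Session() as s, open('cases.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    for url in urls:
        soup = bs(s.get(url).content, 'lxml')
        for i in soup.select('.caseId'):
            detail = bs(s.get(f'https://e-mehkeme.gov.az/Public/CaseDetail?caseId={i["value"]}').content, 'lxml')
            # one row per case: every visible string from the detail table
            writer.writerow(detail.select_one('[colspan="4"]').stripped_strings)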

How do I extract data from the table using python?

I am trying to get the data from the table shown on this website 'https://www.qualitydiamonds.co.uk/one-carat-loose-diamonds/#'
I have tried the code below, but I am only able to get the data from the 4 main diamonds appearing on the webpage and none of the data from the actual table.
import requests
from bs4 import BeautifulSoup
url = "https://www.qualitydiamonds.co.uk/one-carat-loose-diamonds/"
response = requests.get(url)
print(response)
soup = BeautifulSoup(response.text, 'html.parser')
one_a_tag = soup.findAll('span', class_='price')
print(one_a_tag)
Hey, you can try the code below:
import requests
import bs4 as bs
url = "https://www.qualitydiamonds.co.uk/one-carat-loose-diamonds/"
response = requests.get(url)
soup = bs.BeautifulSoup(response.text, 'lxml')
price = soup.find(class_='price')
print(price.text)
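find(class_='price') returns only the first match on the page. If you want every price, including the ones in the table, find_all is the natural extension; a sketch, assuming the table cells reuse the same price class:
import requests
import bs4 as bs

url = "https://www.qualitydiamonds.co.uk/one-carat-loose-diamonds/"
response = requests.get(url)
soup = bs.BeautifulSoup(response.text, 'lxml')

# find() stops at the first match; find_all() returns all of them
for price in soup.find_all(class_='price'):
    print(price.get_text(strip=True))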
