Loop url from dataframe and download pdf files in Python - python-3.x

Based on the code from here, I'm able to crawl the URL for each transaction and save them into an Excel file, which can be downloaded here.
Now I would like to go further and click the url link:
For each url, I will need to open and save pdf format files:
How could I do that in Python? Any help would be greatly appreciated.
Code for references:
import shutil
from bs4 import BeautifulSoup
import requests
import os
from urllib.parse import urlparse
url = 'xxx'  # placeholder: it is passed through .format(page), so it should contain a `{}` slot
out_dir = './files'
# open() does not create missing directories, so make sure the target exists.
os.makedirs(out_dir, exist_ok=True)

for page in range(6):
    listing = requests.get(url.format(page))
    listing.raise_for_status()
    soup = BeautifulSoup(listing.content, "html.parser")
    for link in soup.select("h3[class='sv-card-title']>a"):
        # The link text becomes the file name: trim it and replace path
        # separators so the file always lands inside out_dir.
        name = link.text.strip().replace('/', '_').replace('\\', '_')
        # stream=True copies the body straight to disk; the `with` releases
        # the connection once the copy is done.
        with requests.get(link.get("href"), stream=True) as r:
            # Stop here rather than saving an error page under a .pdf name.
            r.raise_for_status()
            r.raw.decode_content = True
            with open(os.path.join(out_dir, name + '.pdf'), 'wb') as f:
                shutil.copyfileobj(r.raw, f)

An example of downloading a PDF file listed in your uploaded Excel file.
from bs4 import BeautifulSoup
import requests
# Only one page is handled here. To download many files, keep the page URLs
# in a list and loop over them.
url = 'http://xinsanban.eastmoney.com/Article/NoticeContent?id=AN201909041348533085'
r = requests.get(url)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")
link = soup.select_one(".lookmore")  # element whose href is fetched and saved below
title = soup.select_one(".newsContent").select_one("h1").text.strip()
# Characters such as ':' are not allowed in Windows file names, so each one
# is mapped to '-'.
filename = title.translate(str.maketrans({c: "-" for c in '\\/:*?"<>|'})) + '.pdf'
print(filename)
pdf = requests.get(link.get("href"))
# Fail loudly instead of writing an error page into the .pdf file.
pdf.raise_for_status()
with open(filename, "wb") as f:
    f.write(pdf.content)
And download successfully:

Here's a slightly different approach. You don't have to open those URLs from the Excel file, as you can build the .pdf file source URLs yourself.
For example:
import requests
urls = [
    "http://data.eastmoney.com/notices/detail/871792/AN201909041348533085,JWU2JWEwJTk2JWU5JTljJTllJWU3JTg5JWE5JWU0JWI4JTlh.html",
    "http://data.eastmoney.com/notices/detail/872955/AN201912101371726768,JWU0JWI4JWFkJWU5JTgzJWJkJWU3JTg5JWE5JWU0JWI4JTlh.html",
    "http://data.eastmoney.com/notices/detail/832816/AN202008171399155565,JWU3JWI0JWEyJWU1JTg1JThiJWU3JTg5JWE5JWU0JWI4JTlh.html",
    "http://data.eastmoney.com/notices/detail/831971/AN201505220009713696,JWU1JWJjJTgwJWU1JTg1JTgzJWU3JTg5JWE5JWU0JWI4JTlh.html",
]

for url in urls:
    # The last path segment has the form "<file_id>,<rest>.html"; only the
    # part before the comma is used to build the PDF address.
    file_id, _ = url.split('/')[-1].split(',')
    pdf_file_url = f"http://pdf.dfcfw.com/pdf/H2_{file_id}_1.pdf"
    print(f"Fetching {pdf_file_url}...")
    # Download first and check the status, so a failed request neither
    # leaves an empty file behind nor stores an error page as a PDF.
    response = requests.get(pdf_file_url)
    response.raise_for_status()
    with open(f"{file_id}.pdf", "wb") as f:
        f.write(response.content)

Related

How to download image from URL using beautiful soup in high quality?

I am trying to download images using Beautiful Soup while importing a list of URLs from a .CSV file. Now I am getting results like below,
<img class="pick" src="backup/remote_2109image/008f3ef7-1da9-11ec-abad-88ae1db4aa6901.jpg" width="350height=616\"/>
In the below code, I am trying to get an image from URL that has the class 'pick'
Now, How Will I download this in a folder?
import csv
import requests
import os
import urllib
from bs4 import BeautifulSoup as bs
# For every row of cat.csv, fetch the page it names and print each
# <img class="pick"> tag found on that page.
with open('cat.csv', 'r') as csv_file:
    for fields in csv.reader(csv_file):
        page_url = ' '.join(fields)
        response = requests.get(page_url)
        parsed = bs(response.content, 'html.parser')
        for tag in parsed.find_all('img', class_='pick'):
            print(tag)
You might try this:
import urllib.parse  # a bare `import urllib` does not guarantee the submodule is loaded

# For every page URL listed in cat.csv, download each <img class="pick">
# into the current directory, named after the last segment of its URL.
with open('cat.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        imagesname = ' '.join(row)
        r = requests.get(imagesname)
        soup = bs(r.content, 'html.parser')
        tables = soup.find_all('img', class_='pick')
        inParsed = urllib.parse.urlparse(imagesname)  # break down url
        rootUrl = f'{inParsed.scheme}://{inParsed.netloc}'  # to get root
        for image in tables:
            src = image.get('src')
            if not src:
                # Nothing to download for a tag without a src attribute.
                continue
            imageUrl = urllib.parse.urljoin(rootUrl, src)  # add root to src
            saveImgAs = [u for u in imageUrl.split('/') if u][-1]  # get name from link
            # The `with` block closes the file; no explicit close() is needed.
            with open(saveImgAs, "wb") as f:
                f.write(requests.get(imageUrl).content)  # download
            print(saveImgAs, image)
I'm not entirely sure about the formation of imageUrl nor of how consistent your image src values might be - if I had a few of your row values, I would have been able to run a few tests first, but hopefully this works
I made some changes to download image from URL which is in CSV file
import csv
import requests
import os
import urllib
from bs4 import BeautifulSoup as bs
import urllib.request  # a bare `import urllib` does not load the `request` submodule

base_url = "domain-name"  # placeholder: replace with the scheme and host of the site
path = "images"  # folder the downloaded files are written to
# urlretrieve does not create missing directories.
os.makedirs(path, exist_ok=True)

with open('cat.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        imagesname = ' '.join(row)
        r = requests.get(imagesname)
        soup = bs(r.content, 'html.parser')
        tables = soup.find_all('img', class_='pick')
        for image in tables:
            # Turn any backslashes in the src into forward slashes before
            # building the URL and the file name.
            img_url = image.get('src').replace('\\', '/')
            real_url = base_url + img_url
            img_name = img_url.split('/')[-1]
            urllib.request.urlretrieve(real_url, os.path.join(path, img_name))

Getting incorrect link on parsing web page in BeautifulSoup

I'm trying to get the download link from the button in this page. But when I open the download link that I get from my code I get this message
I noticed that if I manually click the button and open the link in a new page the csrfKey part of the link is always same whereas when I run the code I get a different key every time. Here's my code
from bs4 import BeautifulSoup
import requests
import re
def GetPage(link):
    """Print the href of each ``ipsButton`` element whose markup mentions "download".

    The page at *link* is fetched with ``requests`` and parsed with the
    ``lxml`` parser; the match against "download" is case-insensitive.
    """
    html = requests.get(link).text
    page = BeautifulSoup(html, 'lxml')
    for button in page.find_all(class_='ipsButton'):
        # Search the element's full markup, not only its visible text.
        if re.search('download', str(button), re.IGNORECASE):
            print(f'{button["href"]}\n')


if __name__ == '__main__':
    link = 'https://eci.gov.in/files/file/10985-5-number-and-types-of-constituencies/'
    GetPage(link)
Before you get to the actual download links of the files, you need to agree to Terms and Conditions. So, you need to fake this with requests and then parse the next page you get.
Here's how:
import requests
from bs4 import BeautifulSoup
if __name__ == '__main__':
    link = 'https://eci.gov.in/files/file/10985-5-number-and-types-of-constituencies/'
    # All requests go through one Session so they share the same cookies.
    with requests.Session() as session:
        r = session.get("https://eci.gov.in/")

        # Step 1: take the confirmation URL from the full-width button on the file page.
        file_page = BeautifulSoup(session.get(link).text, 'lxml')
        confirm_button = file_page.select_one(".ipsApp .ipsButton_fullWidth")
        confirmation_url = confirm_button["href"]

        # Step 2: request that URL with confirm=1 added to its query string.
        agreed_url = confirmation_url.replace("?do=download", "?do=download&confirm=1")
        agreed_html = session.get(agreed_url).text

        # Step 3: collect the hrefs of the small buttons, skipping the first one.
        agreed_page = BeautifulSoup(agreed_html, "lxml")
        small_buttons = agreed_page.select(".ipsApp .ipsButton_small")
        download_links = []
        for a in small_buttons[1:]:
            download_links.append(a["href"])

        # Step 4: save each file under the name taken from Content-Disposition.
        for download_link in download_links:
            response = session.get(download_link)
            disposition = response.headers["Content-Disposition"]
            file_name = disposition.replace('"', "").split(" - ")[-1]
            print(f"Downloading: {file_name}")
            with open(file_name, "wb") as f:
                f.write(response.content)
This should output:
Downloading: Number And Types Of Constituencies.pdf
Downloading: Number And Types Of Constituencies.xls
And save two files: a .pdf and a .xls. The latter one looks like this:

Downloading/webscraping images from python

I am trying to download all the images from the website but have been unable to do so. How can I download all the images from a specific section of a website and save them to my directory?
The below code exports all the image and saves the image link to a csv file, but I also want the image to save it in my directory also.
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20card'
# The request is sent with a browser-like User-Agent header.
req = Request(my_url, headers={'User-Agent': 'Mozilla/5.0'})
# `with` closes the HTTP response once the body has been read.
with urlopen(req) as resp:
    webpage = resp.read()
page_soup = soup(webpage, "html.parser")

filename = "abc.csv"
headers = "imagelink\n"

snackcrisps = page_soup.findAll("div", {"class": "divCategories divShops-newegg"})
crispitem = snackcrisps[0]  # only the first matching section is scanned
img = crispitem.findAll("div", {"class": "product_image_div productSmall_image_div_lit"})

# `with` guarantees the CSV is flushed and closed even if the loop raises.
with open(filename, "w") as f:
    f.write(headers)
    for img1 in img:
        img2 = img1.find('img')
        imageLink = img2.get('src') if img2 else None
        if not imageLink:
            # Skip product blocks that have no <img> tag or no src attribute.
            continue
        print("imageLink: " + imageLink)
        f.write(imageLink + "\n")
How can I save the images in my local directory? Help needed!!
Many Thanks
I used the response to this post to formulate my answer.
First you need to build the full URL for the image you want. This could be as simple as prepending "https:" to the beginning of the image link, or not changing the value at all. You'll have to investigate (review this post) how to adjust the URLs you find based on whether they are relative or absolute.
You'll want to use the requests module to make the request for the image.
import requests
import shutil
for index, img1 in enumerate(img):
    img2 = img1.findAll('img')
    imageLink = img2[0].get('src')
    # Protocol-relative links ("//host/path") need a scheme; links that
    # already carry one are left untouched.
    if imageLink.startswith("//"):
        imageLink = "https:" + imageLink
    # Name each file after the last path segment of its URL (query string
    # dropped) so successive images do not overwrite one another.
    file_name = imageLink.split("?", 1)[0].rsplit("/", 1)[-1] or f"image_{index}.jpg"
    r = requests.get(imageLink, stream=True)
    # Only write the file when the server answered 200 OK.
    if r.status_code == 200:
        with open(file_name, 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)

Python: Scraping links into a CSV

I am relatively new to Python. I am trying to scrape url's from a site and write them to a csv file. I have been able to print the urls, however, I have been unable to write them or store them anywhere. Any help?
import requests
import csv
from bs4 import BeautifulSoup
url = 'http://comm.eval.org/communities/resources/libraryview?LibraryKey=1eff4fd7-afa0-42e1-b275-f65881b7489b'
r = requests.get(url)
html_url = r.text
soup = BeautifulSoup(html_url, "html.parser")
# On Python 3 the csv module writes str, so the file must be opened in text
# mode; newline='' lets the writer control the line endings itself.
with open('output.csv', 'w', newline='') as f:
    bsoup_writer = csv.writer(f)
    for link in soup.find_all('a'):
        # One row per anchor: href, class and id (empty cell when missing).
        bsoup_writer.writerow([link.get('href'), link.get('class'), link.get('id')])
This should do what you're looking for:
# On Python 3, csv.writer needs a text-mode file ('w', not 'wb');
# newline='' prevents blank lines between rows on Windows.
with open('output.csv', 'w', newline='') as f:
    bsoup_writer = csv.writer(f)
    for link in soup.find_all('a'):
        # One row per anchor: href, class and id (empty cell when missing).
        bsoup_writer.writerow([link.get('href'), link.get('class'), link.get('id')])
Be sure to include the following csv import at the top of your script:
import csv

Can't append Base URL to create absolute links with Beatifulsoup Python 3

I get a list of links in the output file but need all of the links to show as absolute links. Some are absolute and others are relative. How do I append the base url to the relatives to ensure that I get only absolute links in the csv output?
I get back all the links but not all are absolute links e.g /subpage instead of http://page.com/subpage
import csv
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests

response = requests.get("http://cnn.com")
# Resolve links against the URL that was actually served, which differs
# from the requested one when the request was redirected.
base_url = response.url
soup = BeautifulSoup(response.content, "lxml")

# only return links to subpages e.g. a tag that contains href
data = []
seen = set()
for url in soup.find_all('a', href=True):
    # urljoin resolves relative hrefs against base_url and returns hrefs
    # that are already absolute unchanged.
    absolute = urljoin(base_url, url['href'])
    print(absolute)
    # Keep the first occurrence only, preserving page order.
    if absolute not in seen:
        seen.add(absolute)
        data.append(absolute)
print(data)

with open("file.csv", 'w', newline='') as csvfile:
    write = csv.writer(csvfile)
    # writerows expects a sequence of rows; passing bare strings would
    # split every URL into one cell per character.
    write.writerows([link] for link in data)
Use urljoin to resolve each link against the base URL:
from urllib.parse import urljoin  # Python 3 location; in Python 2 it lived in `urlparse`
...
base_url = "http://cnn.com"
relative_url = "/subpage"  # example value; use each scraped href here
# Relative links are resolved against base_url; links that are already
# absolute come back unchanged, so this is safe to apply to every href.
absolute_url = urljoin(base_url, relative_url)

Resources