Specify outpath when downloading files from URL - python-3.x

I have some files I am downloading from a url.
I can currently access my files like this:
# Download every file linked from an index page into the current working directory.
import os
import shutil  # needed for copyfileobj below; the import was missing

import requests
from bs4 import BeautifulSoup

prefix = 'https://n5eil01u.ecs.nsidc.org/MOST/MOD10A1.006/'
download_url = "https:/path_to_website"

# One session is used for the index page and for every file request.
s = requests.session()
soup = BeautifulSoup(s.get(download_url).text, "lxml")
for a in soup.find_all('a', href=True):
    final_link = os.path.join(prefix, a['href'])
    result = s.get(final_link, stream=True)
    # A bare file name is opened here, so the file is created relative to
    # the process's current working directory.
    with open(a['href'], 'wb') as out_file:
        shutil.copyfileobj(result.raw, out_file)
This downloads the files fine and puts them into a default directory, C:/User.
I would like to choose where to download my files though. You can choose where the outpath is with wget but my method with that downloads empty files as if they aren't being accessed.
I tried this with wget like this:
# Attempt to download each linked file with wget into out_path.
# Continues the previous snippet: download_url and the imports come from there.
out_path = "C:/my_path"
prefix = 'https://n5eil01u.ecs.nsidc.org/MOST/MOD10A1.006/'
s = requests.session()
soup = BeautifulSoup(s.get(download_url).text, "lxml")
# Iterate the parsed document `soup`; the original looped over an undefined `page`.
for a in soup.find_all('a', href=True):
    final_link = os.path.join(prefix, a['href'])
    # NOTE(review): wget.download is given only the URL, not the session `s`,
    # so presumably it does not share the session's authentication - confirm.
    download = wget.download(final_link, out=out_path)
I think wget isn't working because I am accessing the website with authentication (not shown), and when I join the final link I am no longer accessing it with authentication. Is there a way to specify the outpath with shutil?

What about using the first method, replacing the path of the file opened with os.path.join(out_path, a['href']) ?
# Download every file linked from an index page into a chosen directory.
import os
import shutil  # needed for copyfileobj below; the import was missing
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

out_path = "C:\\my_path"
prefix = 'https://n5eil01u.ecs.nsidc.org/MOST/MOD10A1.006/'
download_url = "https:/path_to_website"

# open() does not create missing directories, so make sure the target exists.
os.makedirs(out_path, exist_ok=True)

# The same session fetches the index page and every file.
s = requests.session()
soup = BeautifulSoup(s.get(download_url).text, "lxml")
for a in soup.find_all('a', href=True):
    # urljoin applies URL resolution rules; os.path.join applies the local
    # file system's rules, which are not the same thing.
    final_link = urljoin(prefix, a['href'])
    result = s.get(final_link, stream=True)
    new_file_path = os.path.join(out_path, a['href'])
    with open(new_file_path, 'wb') as out_file:  # this will create the new file at new_file_path
        shutil.copyfileobj(result.raw, out_file)

You can create target path like below,
# Fragment: uses `a` (the current link tag) and `result` (the streamed
# response), which it expects the surrounding download loop to provide.
target_path = r'c:\windows\temp'
# Join the target directory and the link's href to get the output file path.
with open(os.path.join(target_path, a['href']), 'wb') as out_file:
    shutil.copyfileobj(result.raw, out_file)

Related

How to download image from URL using beautiful soup in high quality?

I am trying to download images using Beautiful Soup while importing a list of URLs from a .csv file. Currently I am getting results like the one below:
<img class="pick" src="backup/remote_2109image/008f3ef7-1da9-11ec-abad-88ae1db4aa6901.jpg" width="350height=616\"/>
In the below code, I am trying to get an image from URL that has the class 'pick'
Now, How Will I download this in a folder?
# For each URL listed in cat.csv, fetch the page and print its <img class="pick"> tags.
import csv
import requests
import os
import urllib
from bs4 import BeautifulSoup as bs

with open('cat.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        # The cells of the row are joined with spaces and used as the page URL.
        imagesname = ' '.join(row)
        r = requests.get(imagesname)
        soup = bs(r.content, 'html.parser')
        tables = soup.find_all('img', class_='pick')
        for image in tables:
            print(image)
You might try this:
# For each page URL in cat.csv, download every <img class="pick"> into the
# current directory. csv, requests and bs come from the question's imports.
import urllib.parse  # "import urllib" alone does not guarantee this submodule is loaded

with open('cat.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        imagesname = ' '.join(row)
        r = requests.get(imagesname)
        soup = bs(r.content, 'html.parser')
        tables = soup.find_all('img', class_='pick')
        inParsed = urllib.parse.urlparse(imagesname)  # break down url
        rootUrl = f'{inParsed.scheme}://{inParsed.netloc}'  # to get root
        for image in tables:
            # Read src from the tag itself; the original read it from
            # imageUrl, which is not assigned yet at this point.
            src = image.get('src')
            if not src:
                continue  # no src attribute, nothing to download
            imageUrl = urllib.parse.urljoin(rootUrl, src)  # add root to src
            saveImgAs = [u for u in imageUrl.split('/') if u][-1]  # get name from link
            # The with-block closes the file, so no explicit close() is needed.
            with open(saveImgAs, "wb") as f:
                f.write(requests.get(imageUrl).content)  # download
            print(saveImgAs, image)
I'm not entirely sure about the formation of imageUrl nor of how consistent your image src values might be - if I had a few of your row values, I would have been able to run a few tests first, but hopefully this works
I made some changes to download image from URL which is in CSV file
# For each page URL in cat.csv, download every <img class="pick"> into `path`.
import csv
import requests
import os
import urllib
import urllib.request  # urlretrieve lives in this submodule; import it explicitly
from bs4 import BeautifulSoup as bs

# Output folder. The original used `path` without defining it; change as needed.
path = 'images'
os.makedirs(path, exist_ok=True)

with open('cat.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        imagesname = ' '.join(row)
        r = requests.get(imagesname)
        soup = bs(r.content, 'html.parser')
        tables = soup.find_all('img', class_='pick')
        for image in tables:
            # Normalise backslashes in src to forward slashes.
            img_url = image.get('src').replace('\\', '/')
            # NOTE(review): "domain-name" looks like a placeholder for the
            # site's scheme and host - replace it before running.
            real_url = "domain-name" + img_url
            img_name = str(img_url.split('/')[-1])
            urllib.request.urlretrieve(real_url, os.path.join(path, img_name))

Loop url from dataframe and download pdf files in Python

Based on the code from here, I'm able to crawl the URL for each transaction and save them into an Excel file, which can be downloaded here.
Now I would like to go further and click the url link:
For each url, I will need to open and save pdf format files:
How could I do that in Python? Any help would be greatly appreciated.
Code for references:
# Walk six listing pages and save the target of each card-title link as a PDF.
import shutil
from bs4 import BeautifulSoup
import requests
import os
from urllib.parse import urlparse

url = 'xxx'

# open() does not create directories; make sure ./files exists first.
os.makedirs('./files', exist_ok=True)

for page in range(6):
    r = requests.get(url.format(page))
    soup = BeautifulSoup(r.content, "html.parser")
    for link in soup.select("h3[class='sv-card-title']>a"):
        # Separate name so the listing-page response `r` is not overwritten.
        pdf_response = requests.get(link.get("href"), stream=True)
        # Have urllib3 undo any Content-Encoding while the raw stream is read.
        pdf_response.raw.decode_content = True
        with open('./files/' + link.text + '.pdf', 'wb') as f:
            shutil.copyfileobj(pdf_response.raw, f)
An example of downloading a PDF file from one of the URLs in your uploaded Excel file:
# Download the PDF linked from a single notice page, named after the page title.
from bs4 import BeautifulSoup
import requests

# Let's assume there is only one page. If you need to download many files, keep the URLs in a list.
url = 'http://xinsanban.eastmoney.com/Article/NoticeContent?id=AN201909041348533085'
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
link = soup.select_one(".lookmore")
# Strip once and reuse the result for both the printed and the saved name.
title = soup.select_one(".newsContent").select_one("h1").text.strip()
print(title + '.pdf')
data = requests.get(link.get("href")).content
# A file name shouldn't contain ':', so it is replaced with "-".
with open(title.replace(":", "-") + '.pdf', "wb+") as f:
    f.write(data)
And download successfully:
Here's a slightly different approach. You don't have to open those URLs from the Excel file, as you can build the .pdf source URLs yourself.
For example:
# Build each PDF's URL from the id embedded in the notice URL and download it.
import requests

urls = [
    "http://data.eastmoney.com/notices/detail/871792/AN201909041348533085,JWU2JWEwJTk2JWU5JTljJTllJWU3JTg5JWE5JWU0JWI4JTlh.html",
    "http://data.eastmoney.com/notices/detail/872955/AN201912101371726768,JWU0JWI4JWFkJWU5JTgzJWJkJWU3JTg5JWE5JWU0JWI4JTlh.html",
    "http://data.eastmoney.com/notices/detail/832816/AN202008171399155565,JWU3JWI0JWEyJWU1JTg1JThiJWU3JTg5JWE5JWU0JWI4JTlh.html",
    "http://data.eastmoney.com/notices/detail/831971/AN201505220009713696,JWU1JWJjJTgwJWU1JTg1JTgzJWU3JTg5JWE5JWU0JWI4JTlh.html",
]

for url in urls:
    # The last path segment is "<file_id>,<token>.html"; only the id is used.
    file_id, _ = url.split('/')[-1].split(',')
    pdf_file_url = f"http://pdf.dfcfw.com/pdf/H2_{file_id}_1.pdf"
    print(f"Fetching {pdf_file_url}...")
    # Fetch before opening the file so a failed request leaves no empty file.
    pdf_bytes = requests.get(pdf_file_url).content
    with open(f"{file_id}.pdf", "wb") as f:
        f.write(pdf_bytes)

How to scrape simple image from webpage

I am very very new to python.
when I run the below:
# Fetch a parts page, take its first <img> element and try to save that image as image.jpg.
from PIL import Image
import requests
import bs4
url = 'https://parts.bmwmonterey.com/a/BMW_2004_330i-Sedan/_52014_5798240/Cooling-System-Water-Hoses/17_0215.html'
response = requests.get(url)
soup = bs4.BeautifulSoup(response.text, 'html.parser')
# find() returns the first matching <img> element in the document.
image = soup.find('img')
# For this page the value is '/images/parts/BMW/fullsize/158518.jpg': a path with no scheme or host.
image_url = image['src']
# image_url is passed to requests unchanged, which is what raises
# "Invalid URL ... No schema supplied".
img = Image.open(requests.get(image_url, stream = True).raw)
img.save('image.jpg')
I got this error:
Invalid URL '/images/parts/BMW/fullsize/158518.jpg': No schema supplied. Perhaps you meant http:///images/parts/BMW/fullsize/158518.jpg?
In your code, image_url holds only the path under which the image is stored on the hosting service, not a full URL. You need to prepend the domain name to image_url and then use the requests library to download it.
Use the following code and it will work.
# Fetch a parts page and save its first <img> as image.jpg.
from urllib.parse import urljoin

import bs4
import requests

url = "https://parts.bmwmonterey.com/a/BMW_2004_330i-Sedan/_52014_5798240/Cooling-System-Water-Hoses/17_0215.html"
resp = requests.get(url)
soup = bs4.BeautifulSoup(resp.text, "html.parser")
img = soup.find('img')
image = img["src"]
# Resolve src against the page URL. For a root-relative src such as
# '/images/...' this gives the site's scheme and host followed by the path;
# it also copes with src values that are already absolute.
img_url = urljoin(url, image)
r = requests.get(img_url)
with open("image.jpg", "wb") as f:
    f.write(r.content)

Downloading/webscraping images from python

I am trying to download all the images from the website but been unable to do so. How I can download all the images from a specific section of a website and save it to my directory?
The below code exports all the image and saves the image link to a csv file, but I also want the image to save it in my directory also.
# Collect the product image links from one section of the page and write them to abc.csv.
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup

my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20card'
# The request is sent with a browser-like User-Agent header.
req = Request(my_url, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
page_soup = soup(webpage, "html.parser")

# Narrow down to the first matching section, then to its product image containers.
snackcrisps = page_soup.findAll("div", {"class": "divCategories divShops-newegg"})
crispitem = snackcrisps[0]
img = crispitem.findAll("div", {"class": "product_image_div productSmall_image_div_lit"})

filename = "abc.csv"
headers = "imagelink\n"
# The with-block closes the file even if a container has no <img> and the loop raises.
with open(filename, "w") as f:
    f.write(headers)
    for img1 in img:
        img2 = img1.findAll('img')
        imageLink = img2[0].get('src')
        print("imageLink: " + imageLink)
        f.write(imageLink + "\n")
How can I save the images in my local directory? Help needed!!
Many Thanks
I used the response to this post to formulate my answer.
First you need to build the full URL for the image you want. This could be as simple as prepending "https:" to the image link, or not changing the value at all. You'll have to investigate (review this post) how to adjust the URLs you find based on whether they are relative or absolute.
You'll want to use the requests module to make the request for the image.
# Download each product image found by the question's code.
# `img` is the list of image container tags built there.
import os
import shutil
from urllib.parse import urlparse

import requests

for img1 in img:
    img2 = img1.findAll('img')
    imageLink = img2[0].get('src')
    # Scheme-relative links ("//host/path") need a scheme before requests can
    # fetch them. Links that already carry a scheme are left untouched.
    if imageLink.startswith("//"):
        imageLink = "https:" + imageLink
    r = requests.get(imageLink, stream=True)
    # Only save successful responses; status_code holds the HTTP status.
    if r.status_code == 200:
        # Name each file after the last path segment of its URL so images do
        # not overwrite each other; fall back to a fixed name if it is empty.
        file_name = os.path.basename(urlparse(imageLink).path) or "my_file.jpg"
        with open(file_name, 'wb') as f:
            # Have urllib3 undo any Content-Encoding while the raw stream is read.
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)

Python3 Download Code

Hi I want to download the linked pdf files from the following url:
https://arxiv.org/find/all/1/all:+5g/0/1/0/all/0/1?skip=0&query_id=32bdbf71e4007c69
Is there any Python3 code available for this? Any help will be appreciated.
The following code worked for me:
# Download every PDF linked from an arXiv listing page into OUTPUT_DIR.
import os
from bs4 import BeautifulSoup
# Python 3.x
from urllib.request import urlopen, urlretrieve

URL = 'https://arxiv.org/find/all/1/all:+5g/0/1/0/all/0/1?skip=0&query_id=32bdbf71e4007c69'
OUTPUT_DIR = ''  # path to output folder, '.' or '' uses current folder

# The with-block closes the connection, replacing the manual try/finally.
with urlopen(URL) as u:
    html = u.read().decode('utf-8')

soup = BeautifulSoup(html, "html.parser")
# Select only anchors whose href starts with "/pdf".
for link in soup.select('a[href^="/pdf"]'):
    href = link.get('href')
    href1 = 'https://arxiv.org' + href
    # Add the extension only when it is missing. The original always appended
    # it, which made its later ".pdf" suffix check unreachable.
    if not href1.endswith('.pdf'):
        href1 += '.pdf'
    print(href)
    print(href1)
    # The local file name is the last path segment of the PDF URL.
    filename = os.path.join(OUTPUT_DIR, href1.rsplit('/', 1)[-1])
    print(filename)
    print("Downloading %s to %s..." % (href1, filename))
    urlretrieve(href1, filename)
print("Done.")

Resources