Downloading/webscraping images in Python

I am trying to download all the images from a website but have been unable to do so. How can I download all the images from a specific section of a website and save them to my directory?
The code below extracts all the images and saves each image link to a csv file, but I also want to save the images themselves to my directory.
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup
my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20card'
req = Request(my_url, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
page_soup = soup(webpage, "html.parser")
filename = "abc.csv"
f = open(filename, "w")
headers = "imagelink\n"
f.write(headers)
snackcrisps = page_soup.findAll("div",{"class":"divCategories divShops-newegg"})
crispitem = snackcrisps[0]
img = crispitem.findAll("div",{"class":"product_image_div productSmall_image_div_lit"})
img1 = img[0]
for img1 in img:
    img2 = img1.findAll('img')
    imageLink = img2[0].get('src')
    print("imageLink: " + imageLink)
    f.write(imageLink + "\n")
f.close()
How can I save the images in my local directory? Help needed!!
Many Thanks

I used the response to this post to formulate my answer.
First you need to build the full URL for the image you want. This could be as simple as appending "https:" to the beginning of the image link, or it may require no change at all. You'll have to investigate (review this post) how to adjust the URLs you find, depending on whether they are relative or absolute.
You'll want to use the requests module to make the request for the image.
import requests
import shutil
for img1 in img:
    img2 = img1.findAll('img')
    imageLink = img2[0].get('src')
    if "https:" not in imageLink:
        imageLink = "https:" + imageLink
    r = requests.get(imageLink, stream=True)
    if r.status_code == 200:
        with open("my_file.jpg", 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)
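Note that the snippet above writes every image to the same my_file.jpg, so each download overwrites the previous one. A minimal sketch of one way to save each image under its own name in a directory of your choosing (the "images" folder name is just an assumption), reusing my_url and the img list from the question:

import os
import shutil
import requests
from urllib.parse import urljoin

out_dir = "images"  # hypothetical output directory
os.makedirs(out_dir, exist_ok=True)

for img1 in img:
    img2 = img1.findAll('img')
    imageLink = img2[0].get('src')
    # urljoin handles absolute, protocol-relative ("//...") and relative ("/...") links
    imageLink = urljoin(my_url, imageLink)
    filename = imageLink.split('/')[-1].split('?')[0] or "image.jpg"
    r = requests.get(imageLink, stream=True)
    if r.status_code == 200:
        with open(os.path.join(out_dir, filename), 'wb') as out_file:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, out_file)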

Related

How to download image from URL using beautiful soup in high quality?

I am trying to download images using Beautiful Soup while importing a list of URLs from a .csv file. Currently I am getting results like the one below:
<img class="pick" src="backup/remote_2109image/008f3ef7-1da9-11ec-abad-88ae1db4aa6901.jpg" width="350height=616\"/>
In the code below, I am trying to get the images that have the class 'pick' from each URL.
Now, how do I download these into a folder?
import csv
import requests
import os
import urllib
from bs4 import BeautifulSoup as bs
with open('cat.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        imagesname = ' '.join(row)
        r = requests.get(imagesname)
        soup = bs(r.content, 'html.parser')
        tables = soup.find_all('img', class_='pick')
        for image in tables:
            print(image)
You might try this:
with open('cat.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        imagesname = ' '.join(row)
        r = requests.get(imagesname)
        soup = bs(r.content, 'html.parser')
        tables = soup.find_all('img', class_='pick')
        inParsed = urllib.parse.urlparse(imagesname)  # break down url
        rootUrl = f'{inParsed.scheme}://{inParsed.netloc}'  # to get root
        for image in tables:
            imageUrl = urllib.parse.urljoin(rootUrl, image.get('src'))  # add root to src
            saveImgAs = [u for u in imageUrl.split('/') if u][-1]  # get name from link
            with open(saveImgAs, "wb") as f:
                f.write(requests.get(imageUrl).content)  # download
            print(saveImgAs, image)
I'm not entirely sure about the formation of imageUrl, nor how consistent your image src values might be - if I had a few of your row values, I would have been able to run a few tests first, but hopefully this works.
I made some changes to download the images from the URLs in the CSV file:
import csv
import requests
import os
import urllib.request
from bs4 import BeautifulSoup as bs
with open('cat.csv', 'r') as file:
    reader = csv.reader(file)
    for row in reader:
        imagesname = ' '.join(row)
        r = requests.get(imagesname)
        soup = bs(r.content, 'html.parser')
        tables = soup.find_all('img', class_='pick')
        for image in tables:
            img_url = image.get('src').replace('\\', '/')
            real_url = "domain-name" + img_url  # replace "domain-name" with the site's base URL
            img_name = str(img_url.split('/')[-1])
            # 'path' is the destination directory; define it (and make sure it exists) beforehand
            urllib.request.urlretrieve(real_url, os.path.join(path, img_name))

Loop url from dataframe and download pdf files in Python

Based on the code from here, I'm able to crawl the url for each transaction and save them into an excel file, which can be downloaded here.
Now I would like to go further and follow each url link:
For each url, I will need to open it and save the pdf file:
How could I do that in Python? Any help would be greatly appreciated.
Code for reference:
import shutil
from bs4 import BeautifulSoup
import requests
import os
from urllib.parse import urlparse
url = 'xxx'
for page in range(6):
    r = requests.get(url.format(page))
    soup = BeautifulSoup(r.content, "html.parser")
    for link in soup.select("h3[class='sv-card-title']>a"):
        r = requests.get(link.get("href"), stream=True)
        r.raw.decode_content = True
        with open('./files/' + link.text + '.pdf', 'wb') as f:
            shutil.copyfileobj(r.raw, f)
An example of downloading one pdf file from the pages listed in your uploaded excel file:
from bs4 import BeautifulSoup
import requests
# Let's assume there is only one page. If you need to download many files, save them in a list.
url = 'http://xinsanban.eastmoney.com/Article/NoticeContent?id=AN201909041348533085'
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
link = soup.select_one(".lookmore")
title = soup.select_one(".newsContent").select_one("h1").text
print(title.strip() + '.pdf')
data = requests.get(link.get("href")).content
with open(title.strip().replace(":", "-") + '.pdf', "wb+") as f:  # file name shouldn't contain ':', so replace it with '-'
    f.write(data)
And it downloads successfully.
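If you need more than one file, one option (a sketch that assumes the same .lookmore and .newsContent selectors work on every article page) is to wrap the same logic in a loop over a list of page urls:

from bs4 import BeautifulSoup
import requests

# hypothetical list of article page urls
page_urls = [
    'http://xinsanban.eastmoney.com/Article/NoticeContent?id=AN201909041348533085',
]

for url in page_urls:
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")
    link = soup.select_one(".lookmore")
    title = soup.select_one(".newsContent").select_one("h1").text
    data = requests.get(link.get("href")).content
    # ':' is not allowed in file names, so replace it with '-'
    with open(title.strip().replace(":", "-") + '.pdf', "wb") as f:
        f.write(data)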
Here's a slightly different approach. You don't have to open those urls from the excel file, as you can build the .pdf file source urls yourself.
For example:
import requests
urls = [
    "http://data.eastmoney.com/notices/detail/871792/AN201909041348533085,JWU2JWEwJTk2JWU5JTljJTllJWU3JTg5JWE5JWU0JWI4JTlh.html",
    "http://data.eastmoney.com/notices/detail/872955/AN201912101371726768,JWU0JWI4JWFkJWU5JTgzJWJkJWU3JTg5JWE5JWU0JWI4JTlh.html",
    "http://data.eastmoney.com/notices/detail/832816/AN202008171399155565,JWU3JWI0JWEyJWU1JTg1JThiJWU3JTg5JWE5JWU0JWI4JTlh.html",
    "http://data.eastmoney.com/notices/detail/831971/AN201505220009713696,JWU1JWJjJTgwJWU1JTg1JTgzJWU3JTg5JWE5JWU0JWI4JTlh.html",
]

for url in urls:
    file_id, _ = url.split('/')[-1].split(',')
    pdf_file_url = f"http://pdf.dfcfw.com/pdf/H2_{file_id}_1.pdf"
    print(f"Fetching {pdf_file_url}...")
    with open(f"{file_id}.pdf", "wb") as f:
        f.write(requests.get(pdf_file_url).content)
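Since the question mentions the urls live in a dataframe/excel file, a minimal sketch of looping over them straight from the spreadsheet (the file name "transactions.xlsx" and the column name "url" are assumptions, adjust them to match the actual sheet):

import pandas as pd
import requests

df = pd.read_excel("transactions.xlsx")  # hypothetical file name

for url in df["url"]:  # hypothetical column name
    file_id, _ = url.split('/')[-1].split(',')
    pdf_file_url = f"http://pdf.dfcfw.com/pdf/H2_{file_id}_1.pdf"
    with open(f"{file_id}.pdf", "wb") as f:
        f.write(requests.get(pdf_file_url).content)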

How to scrape simple image from webpage

I am very very new to python.
When I run the code below:
from PIL import Image
import requests
import bs4
url = 'https://parts.bmwmonterey.com/a/BMW_2004_330i-Sedan/_52014_5798240/Cooling-System-Water-Hoses/17_0215.html'
response = requests.get(url)
soup = bs4.BeautifulSoup(response.text, 'html.parser')
image = soup.find('img')
image_url = image['src']
img = Image.open(requests.get(image_url, stream = True).raw)
img.save('image.jpg')
I got this error:
Invalid URL '/images/parts/BMW/fullsize/158518.jpg': No schema supplied. Perhaps you meant http:///images/parts/BMW/fullsize/158518.jpg?
In your code, image_url holds only the path of the image on the hosting service, not a full URL. You need to prepend the domain name to the image_url variable and then use the requests library to download it.
Use the following code and it will work.
import bs4
import requests
url = "https://parts.bmwmonterey.com/a/BMW_2004_330i-Sedan/_52014_5798240/Cooling-System-Water-Hoses/17_0215.html"
resp = requests.get(url)
soup = bs4.BeautifulSoup(resp.text, "html.parser")
img = soup.find('img')
image = img["src"]
img_url = "https://parts.bmwmonterey.com" + str(image)
r = requests.get(img_url)
with open("image.jpg","wb") as f:
f.write(r.content)

Python web scraper won't save image files

I started working on a small image scraping terminal program that is supposed to save images to a specified folder within the program hierarchy. This comes from a basic tutorial I found online. However, whenever I enter a search term into the terminal to start scraping bing.com (yeah, I know), the program crashes. The errors I get seem to focus either on the image file type not being recognized, or on the file path where the images will be saved not being recognized:
from bs4 import BeautifulSoup
import requests
from PIL import Image
from io import BytesIO
search = input("Search for:")
params = {"q": search}
r = requests.get("http://www.bing.com/images/search", params=params)
soup = BeautifulSoup(r.text, "html.parser")
links = soup.findAll("a", {"class": "thumb"})
for item in links:
    img_obj = requests.get(item.attrs["href"])
    print("Getting", item.attrs["href"])
    title = item.attrs["href"].split("/")[-1]
    img = Image.open(BytesIO(img_obj.content))
    img.save("./scraped_images/" + title, img.format)
Error thrown: Exception has occurred: FileNotFoundError
[Errno 2] No such file or directory: './scraped_images/3849747391_4a7dc3f19e_b.jpg'
I've tried adding a file path variable (using pathlib) and concatenating it with the other necessary variables:
from bs4 import BeautifulSoup
import requests
from PIL import Image
from io import BytesIO
from pathlib import Path
image_folder = Path("./scraped_images/")
search = input("Search for:")
params = {"q": search}
r = requests.get("http://www.bing.com/images/search", params=params)
soup = BeautifulSoup(r.text, "html.parser")
links = soup.findAll("a", {"class": "thumb"})
for item in links:
    img_obj = requests.get(item.attrs["href"])
    print("Getting", item.attrs["href"])
    title = item.attrs["href"].split("/")[-1]
    img = Image.open(BytesIO(img_obj.content))
    img.save(image_folder + title, img.format)
Error thrown: Exception has occurred: TypeError
unsupported operand type(s) for +: 'WindowsPath' and 'str'
I've checked the documentation for PIL, BeautifulSoup, etc. to see if any updates may have been screwing me up, I've checked the elements on bing to see if the classes are correct, and even tried searching by a different class, and nothing worked. I'm at a loss. Any thoughts or guidance is appreciated. Thanks!
I have changed your code a bit:
from bs4 import BeautifulSoup
import requests
from pathlib import Path
import os
image_folder = Path("./scraped_images/")
if not os.path.isdir(image_folder):
    print('Making %s' % (image_folder))
    os.mkdir(image_folder)
search = input("Search for:")
params = {"q": search}
r = requests.get("http://www.bing.com/images/search", params=params)
soup = BeautifulSoup(r.text, "html.parser")
links = soup.findAll("a", {"class": "thumb"})
for item in links:
    img_obj = requests.get(item.attrs["href"])
    print("Getting", item.attrs["href"])
    title = item.attrs["href"].split("/")[-1]
    if img_obj.ok:
        with open('%s/%s' % (image_folder, title), 'wb') as file:
            file.write(img_obj.content)
You can use PIL but in this case you do not need it.
I also improved the code with PIL to work better:
from bs4 import BeautifulSoup
import requests
from PIL import Image
from io import BytesIO
from pathlib import Path
s = requests.Session()
image_folder = Path("./scraped_images/")
search = input("Search for:")
params = {"q": search}
r = s.get("http://www.bing.com/images/search", params=params)
soup = BeautifulSoup(r.text, "html.parser")
links = soup.findAll("a", {"class": "thumb"})
for item in links:
    try:
        img_obj = s.get(item.attrs["href"], headers={'User-Agent': 'User-Agent: Mozilla/5.0'})
        if img_obj.ok:
            print("Getting", item.attrs["href"])
            title = item.attrs["href"].split("/")[-1]
            if '?' in title:
                title = title.split('?')[0]
            img = Image.open(BytesIO(img_obj.content))
            img.save(str(image_folder) + '/' + title, img.format)
        else:
            continue
    except OSError:
        print('\nError downloading %s try to visit'
              '\n%s\n'
              'manually and try to get the image manually.\n' % (title, item.attrs["href"]))
I use a requests session and added try/except in case PIL can't make the image. I also only try to make an image if the request gets a 200 (OK) response from the site.
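As a side note on the TypeError from the question: a pathlib.Path cannot be concatenated to a str with +, but it does support the / operator, so the second attempt could also have been fixed with something like this (a minimal sketch, with a hypothetical file name):

from pathlib import Path

image_folder = Path("./scraped_images/")
title = "example.jpg"  # hypothetical file name

# Path objects join components with '/', which avoids the
# "unsupported operand type(s) for +: 'WindowsPath' and 'str'" error
save_path = image_folder / title

Recent versions of Pillow also accept Path objects directly, so img.save(save_path, img.format) works as well.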

Specify outpath when downloading files from URL

I have some files I am downloading from a url.
I can currently access my files like this:
import requests
from bs4 import BeautifulSoup
import os
import shutil
prefix = 'https://n5eil01u.ecs.nsidc.org/MOST/MOD10A1.006/'
download_url = "https:/path_to_website"
s = requests.session()
soup = BeautifulSoup(s.get(download_url).text, "lxml")
for a in soup.find_all('a', href=True):
    final_link = os.path.join(prefix, a['href'])
    result = s.get(final_link, stream=True)
    with open(a['href'], 'wb') as out_file:
        shutil.copyfileobj(result.raw, out_file)
This downloads the files fine and puts them into the default directory of C:/User.
I would like to choose where to download my files, though. You can choose the outpath with wget, but my attempt with it downloads empty files, as if they aren't being accessed.
I tried this with wget like this:
out_path = "C:/my_path"
prefix = 'https://n5eil01u.ecs.nsidc.org/MOST/MOD10A1.006/'
s = requests.session()
soup = BeautifulSoup(s.get(download_url).text, "lxml")
for a in soup.find_all('a', href=True):
    final_link = os.path.join(prefix, a['href'])
    download = wget.download(final_link, out=out_path)
I think wget isn't working because I am accessing the website with authentication (not shown), and when I join the final link I am no longer accessing it with authentication. Is there a way to specify the outpath with shutil?
What about using the first method, replacing the path of the file opened with os.path.join(out_path, a['href']) ?
import requests
from bs4 import BeautifulSoup
import os
import shutil
out_path = "C:\\my_path"
prefix = 'https://n5eil01u.ecs.nsidc.org/MOST/MOD10A1.006/'
download_url = "https:/path_to_website"
s = requests.session()
soup = BeautifulSoup(s.get(download_url).text, "lxml")
for a in soup.find_all('a', href=True):
    final_link = os.path.join(prefix, a['href'])
    result = s.get(final_link, stream=True)
    new_file_path = os.path.join(out_path, a['href'])
    with open(new_file_path, 'wb') as out_file:  # this will create the new file at new_file_path
        shutil.copyfileobj(result.raw, out_file)
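One caveat (a common pitfall, not something from the original question): open() will fail if out_path does not exist yet, so it may be worth creating the directory first:

import os

# create the output directory (and any missing parents) if it doesn't exist yet
os.makedirs(out_path, exist_ok=True)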
You can create the target path like below:
target_path = r'c:\windows\temp'
with open(os.path.join(target_path, a['href']), 'wb') as out_file:
    shutil.copyfileobj(result.raw, out_file)
