python web scraping _images scraping - web

I faced problem as shown in attached photos when i used this code the images showed like this “ not appear “
from email.mime import image
import requests
from bs4 import BeautifulSoup
import os
url = 'https://www.imdb.com/search/title/?groups=top_100&sort=user_rating,desc&count=100&start=%22+%22&ref_=adv_nxt'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
images = soup.find_all('img')
for image in images:
name = image['alt']
link = image ['src']
with open (name + '.jpg','wb') as f:
im = requests.get(link)
f.write(im.content)

Related

How to scrape simple image from webpage

I am very very new to python.
when I run the below:
from PIL import Image
import requests
import bs4
url = 'https://parts.bmwmonterey.com/a/BMW_2004_330i-Sedan/_52014_5798240/Cooling-System-Water-Hoses/17_0215.html'
response = requests.get(url)
soup = bs4.BeautifulSoup(response.text, 'html.parser')
image = soup.find('img')
image_url = image['src']
img = Image.open(requests.get(image_url, stream = True).raw)
img.save('image.jpg')
I got this error:
Invalid URL '/images/parts/BMW/fullsize/158518.jpg': No schema supplied. Perhaps you meant http:///images/parts/BMW/fullsize/158518.jpg?
In your code, the image_url gives the directory of the image where it stored on the hosting service. You need to append the domain name to the image_url variable and use the requests library to download it.
Use the following code and it will work.
import bs4
import requests
url = "https://parts.bmwmonterey.com/a/BMW_2004_330i-Sedan/_52014_5798240/Cooling-System-Water-Hoses/17_0215.html"
resp = requests.get(url)
soup = bs4.BeautifulSoup(resp.text, "html.parser")
img = soup.find('img')
image = img["src"]
img_url = "https://parts.bmwmonterey.com" + str(image)
r = requests.get(img_url)
with open("image.jpg","wb") as f:
f.write(r.content)

Python web scraper won't save image files

I started working on a small image scraping terminal program that is supposed to save images to a specified file within the program hierarchy. This comes from a basic tutorial I found online. However, whenever I enter in a search term into the terminal to start scraping bing.com (yeah, i know), the program crashes. The errors i get seem to focus on either the image file type not being recognized or the file path where the images will be saved is not being recognized:
from bs4 import BeautifulSoup
import requests
from PIL import Image
from io import BytesIO
search = input("Search for:")
params = {"q": search}
r = requests.get("http://www.bing.com/images/search", params=params)
soup = BeautifulSoup(r.text, "html.parser")
links = soup.findAll("a", {"class": "thumb"})
for item in links:
img_obj = requests.get(item.attrs["href"])
print("Getting", item.attrs["href"])
title = item.attrs["href"].split("/")[-1]
img = Image.open(BytesIO(img_obj.content))
img.save("./scraped_images/" + title, img.format)
Error thrown: Exception has occurred: FileNotFoundError
[Errno 2] No such file or directory: './scraped_images/3849747391_4a7dc3f19e_b.jpg'
I've tried adding a file path variable (using pathlib) and concatenating that with with the other necessary variables:
from bs4 import BeautifulSoup
import requests
from PIL import Image
from io import BytesIO
from pathlib import Path
image_folder = Path("./scraped_images/")
search = input("Search for:")
params = {"q": search}
r = requests.get("http://www.bing.com/images/search", params=params)
soup = BeautifulSoup(r.text, "html.parser")
links = soup.findAll("a", {"class": "thumb"})
for item in links:
img_obj = requests.get(item.attrs["href"])
print("Getting", item.attrs["href"])
title = item.attrs["href"].split("/")[-1]
img = Image.open(BytesIO(img_obj.content))
img.save(image_folder + title, img.format)
Error thrown: Exception has occurred: TypeError
unsupported operand type(s) for +: 'WindowsPath' and 'str'
I've checked the documentation for PIL, BeautifulSoup, etc. to see if any updates may have been screwing me up, i've checked the elements on bing to see if the classes are correct, and even tried searching by different class and nothing worked. I'm at a loss. Any thoughts or guidance is appreciated. Thanks!
I have changed your code a bit:
from bs4 import BeautifulSoup
import requests
from pathlib import Path
import os
image_folder = Path("./scraped_images/")
if not os.path.isdir(image_folder):
print('Making %s'%(image_folder))
os.mkdir(image_folder)
search = input("Search for:")
params = {"q": search}
r = requests.get("http://www.bing.com/images/search", params=params)
soup = BeautifulSoup(r.text, "html.parser")
links = soup.findAll("a", {"class": "thumb"})
for item in links:
img_obj = requests.get(item.attrs["href"])
print("Getting", item.attrs["href"])
title = item.attrs["href"].split("/")[-1]
if img_obj.ok:
with open('%s/%s'%(image_folder, title), 'wb') as file:
file.write(img_obj.content)
You can use PIL but in this case you do not need it.
I also improved the code with PIL to work better:
from bs4 import BeautifulSoup
import requests
from PIL import Image
from io import BytesIO
from pathlib import Path
s = requests.Session()
image_folder = Path("./scraped_images/")
search = input("Search for:")
params = {"q": search}
r = s.get("http://www.bing.com/images/search", params=params)
soup = BeautifulSoup(r.text, "html.parser")
links = soup.findAll("a", {"class": "thumb"})
for item in links:
try:
img_obj = s.get(item.attrs["href"], headers={'User-Agent': 'User-Agent: Mozilla/5.0'})
if img_obj.ok:
print("Getting", item.attrs["href"])
title = item.attrs["href"].split("/")[-1]
if '?' in title:
title = title.split('?')[0]
img = Image.open(BytesIO(img_obj.content))
img.save(str(image_folder) + '/' + title, img.format)
else:
continue
except OSError:
print('\nError downloading %s try to visit'
'\n%s\n'
'manually and try to get the image manually.\n' %(title, item.attrs["href"]))
I use a requests session and added try and except if PIL can't make the image. I also only make try to make a image if the request get a 200 response from the site.

How to extract img src from web page via lxml in beautifulsoup using python?

I am new in python and I am working on web scraping project from amazon and I have a problem on how to extract the product img src from product page via lxml using BeautifulSoup
I tried the following code to extract it but it doesn't show the url of the img.
here is my code:
import requests
from bs4 import BeautifulSoup
import re
url = 'https://www.amazon.com/crocs-Unisex-Classic-Black-Women/dp/B0014C0LSY/ref=sr_1_2?_encoding=UTF8&qid=1560091629&s=fashion-womens-intl-ship&sr=1-2&th=1&psc=1'
r = requests.get(URL, headers={'User-Agent':'Mozilla/5.0'})
s = BeautifulSoup(r.text, "lxml")
img = s.find(class_="imgTagWrapper").img['src']
# I tried this code.
print(img)
I tried this code...but it shows like this:
data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAoHBwgHBgoICAgLCgoLDhgQDg0NDh0VFhEYIx8lJCIfIiEmKzcvJik0KSEiMEExNDk7Pj4+JS5ESUM8SDc9Pjv/2wBDAQoLCw4NDhwQEBw7KCIoOzs7Ozs7Ozs7Ozs7Ozs7Ozs7Ozs7Ozs7Ozs7Ozs7Ozs7Ozs7Ozs7Ozs7Ozs7Ozs7Ozv/wAARCAG9AM4DASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0t....//
any help ?
What you are seeing there is the base64 encoding of the image. What you do with it depends on what you're doing with image URLs.
The image you like to grab from that page is available in the value of this key data-a-dynamic-image. There are multiple images with different sizes in there. All you need to do now is create a conditional statement to isolate that image containing 395.
import json
import requests
from bs4 import BeautifulSoup
url = 'https://www.amazon.com/crocs-Unisex-Classic-Black-Women/dp/B0014C0LSY/ref=sr_1_2?_encoding=UTF8&qid=1560091629&s=fashion-womens-intl-ship&sr=1-2&th=1&psc=1'
r = requests.get(url, headers={'User-Agent':'Mozilla/5.0'})
s = BeautifulSoup(r.text, "lxml")
img = s.find(id="landingImage")['data-a-dynamic-image']
img = json.loads(img)
for k,v in img.items():
if '395' in k:
print(k)
Output:
https://images-na.ssl-images-amazon.com/images/I/71oNMAAC7sL._UX395_.jpg
In that case try like this and pick the one suits your need:
for k,v in img.items():
print(k)

BeautifulSoup python: Get the text with no tags and get the adjacent links

I am trying to extract the movie titles and links for it from this site
from bs4 import BeautifulSoup
from requests import get
link = "https://tamilrockerrs.ch"
r = get(link).content
#r = open('json.html','rb').read()
b = BeautifulSoup(r,'html5lib')
a = b.findAll('p')[1]
But the problem is there is no tag for the titles. I can't extract the titles and if I could do that how can I bind the links and title together.
Thanks in Advance
You can find title and link by this way.
from bs4 import BeautifulSoup
import requests
url= "http://tamilrockerrs.ch"
response= requests.get(url)
data = response.text
soup = BeautifulSoup(data, 'html.parser')
data = soup.find_all('div', {"class":"title"})
for film in data:
print("Title:", film.find('a').text) # get the title here
print("Link:", film.find('a').get("href")) #get the link here

Not showing full HTML links where are letters ćčžšđ

I'm scraping data from the phonebook, and the problem is that it's not showing full html link. It stops before the čćžšđ letters and my question is how to include čćžšđ letters in the html link.
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = 'http://www.imenik.hr/imenik/trazi/1/Osobe/sve' \
'/sve/vaznost/mjesto:vi%C5%A1njan.html'
r = urlopen(url).read()
page_soup = BeautifulSoup(r, 'html.parser')
for container in page_soup.find_all('div', class_ = 'list_naslov'):
base_url = 'http://www.imenik.hr'
for link in container.find_all('a', href=True):
print('href:',link['href'])
request_href = base_url + link['href'].replace(' ', '%20')
print(request_href)

Resources