Beautifulsoup image downloading error - python-3.x

I am trying to download the images from the imageurls that come back from a Beautifulsoup scrape. I was trying to get this code to work after reading on some other examples tho the
Getting error:
f.write(requests.get(img))
TypeError: a bytes-like object is required, not 'Response
line: f.write(requests.get(img)[What goes here?]) is causing me trouble now. I use soup = BeautifulSoup(source, 'html.parser')
Where as the reference uses soup = BeautifulSoup(r.content)
import os
dir_path = os.path.dirname(os.path.realpath(__file__))
import urllib.request
from bs4 import BeautifulSoup
import codecs
import sys
import requests
from os.path import basename
lines = open('list.txt').read().splitlines()
for designers in lines:
for i in range(1):
url = 'https://www.example.com/Listings?st=' + author + '{}'.format(i)
source = urllib.request.urlopen(url)
soup = BeautifulSoup(source, 'html.parser')
for products in soup.find_all('li', class_='widget'):
image = products.find('img', class_='lazy-load')
print(image.get('data-src'))
img = (image.get('data-src'))
with open(basename(img), "wb") as f:
f.write(requests.get(img)**[What goes here?]**)
Thanks!

Related

Extracting links from a URL until there is data and then moving to the next URL

I have a URL from where I am trying to extract data. I have found a way to extract data from that url. but how do i then move to the next URL if the existing URL doesn't have any data?
The base URL have used in main function is,
https://posoco.in/reports/daily-reports/
instead I only want to extract data from 2020-21 and then go on from here,
https://posoco.in/reports/daily-reports/daily-reports-2020-21/
this way if the pdf files from 2020-21 is all extracted, then the program should start extracting from the next URL which is 2021-22 so on as long as the website exists in an automated way so program can check them automated every-year.
The code I have written.
#import libraries
import re
import tabula
import datetime
from datetime import datetime,timedelta
from datetime import timedelta, date
import requests
import pandas as pd
import glob
import logging
import os
import urllib.request
import urllib.error
import urllib.parse
from bs4 import BeautifulSoup
import wget
def source_urls(url):
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
tags = soup('a')
filelink = []
for link in tags:
if ('daily-reports-' in link.get('href','')):
filelink.append(link.get('href'))
return filelink
def get_urls(url):
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
tags = soup('a')
filelink = []
for link in tags:
if ('_nldc_psp/?wpdmdl' in link.get('href','')):
filelink.append(link.get('href'))
return filelink
if __name__ == "__main__":
url = 'https://posoco.in/reports/daily-reports/'
file_links = source_urls(url)
sorted_file_links = sorted(file_links)
for files in sorted_file_links:
sub_files = get_urls(files)
for x in sub_files:
print(x)
The program output
https://posoco.in/download/31.03.14_nldc_psp/?wpdmdl=3256
https://posoco.in/download/30.03.14_nldc_psp/?wpdmdl=3255
https://posoco.in/download/29.03.14_nldc_psp/?wpdmdl=3254
https://posoco.in/download/28.03.14_nldc_psp/?wpdmdl=3253
....
...
...
...
...
https://posoco.in/download/11-03-21_nldc_psp/?wpdmdl=35681
https://posoco.in/download/10-03-21_nldc_psp/?wpdmdl=35649
https://posoco.in/download/09-03-21_nldc_psp/?wpdmdl=35627
https://posoco.in/download/08-03-21_nldc_psp/?wpdmdl=35612
https://posoco.in/download/07-03-21_nldc_psp/?wpdmdl=35589
I have pasted all the libraries, but only a few are used here. rest are used to download and for processing and logging.
Well, you've to set an condition where it's will pickup the main urls which is equal/bigger than 2021-20
And then you can parse the inner urls.
Also, There's no need to use .get here!
Since you are using a condition to pickup urls include _nldc_psp/?wpdmdlwhich means if there's a pattern then return it. so you DONNOT need to replace the value with empty values link.get('href','') and then return it!
import httpx
import trio
from bs4 import BeautifulSoup
from datetime import datetime
from pprint import pprint as pp
cond = datetime.strptime('2020-21', '%Y-%d')
async def get_urls(client):
r = await client.get('https://posoco.in/reports/daily-reports/')
soup = BeautifulSoup(r.text, 'lxml')
return [x['href'] for x in soup.select('a[href*=reports-]') if datetime.strptime(x['href'].split('-', 3)[-1][:-1], '%Y-%d') >= cond]
async def main():
async with httpx.AsyncClient(timeout=None) as client, trio.open_nursery() as nurse:
links = await get_urls(client)
async def get_pdf(url):
r = await client.get(url)
soup = BeautifulSoup(r.text, 'lxml')
pp([x['href']
for x in soup.select('a[href*="nldc_psp"]')])
if links:
for link in links:
nurse.start_soon(get_pdf, link)
if __name__ == "__main__":
trio.run(main)

Scraping without specific strings in Python3

I'm trying to scrape only emoji in Python3. I used starttwith method with if statement but the result got some Unicodes that emoji's HTML tag seems to be same as others. I have no idea why some emoji is converted into Unicode. Could you give me any advice ?? or there is any ways to remove this unicode from list.
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
import re
import os
list0 = []
site_url = "https://www.emojiall.com/zh-hant/categories/A"
get_url = requests.get(site_url)
soup = BeautifulSoup(get_url.text, "lxml")
for script in soup(["span"]):
script.extract()
emojis = soup.select('.emoji_font')
words = soup.select('.emoji_name_truncate')
for emoji0 in emojis:
emoji1 = emoji0.getText()
if not repr(emoji1).startswith(r'\U'):
list0.append(emoji1)
else:
continue
print(list0)
I updated editor and it works well .
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
import re
import os
list0 = []
site_url = "https://www.emojiall.com/zh-hant/categories/A"
get_url = requests.get(site_url)
soup = BeautifulSoup(get_url.text, "lxml")
for script in soup(["span"]):
script.extract()
emojis = soup.select('.emoji_font')
words = soup.select('.emoji_name_truncate')
for emoji0 in emojis:
emoji1 = emoji0.getText()
if not repr(emoji1).startswith(r"'\U"):
list0.append(emoji1)
else:
continue
print(list0)

Python web scraper won't save image files

I started working on a small image scraping terminal program that is supposed to save images to a specified file within the program hierarchy. This comes from a basic tutorial I found online. However, whenever I enter in a search term into the terminal to start scraping bing.com (yeah, i know), the program crashes. The errors i get seem to focus on either the image file type not being recognized or the file path where the images will be saved is not being recognized:
from bs4 import BeautifulSoup
import requests
from PIL import Image
from io import BytesIO
search = input("Search for:")
params = {"q": search}
r = requests.get("http://www.bing.com/images/search", params=params)
soup = BeautifulSoup(r.text, "html.parser")
links = soup.findAll("a", {"class": "thumb"})
for item in links:
img_obj = requests.get(item.attrs["href"])
print("Getting", item.attrs["href"])
title = item.attrs["href"].split("/")[-1]
img = Image.open(BytesIO(img_obj.content))
img.save("./scraped_images/" + title, img.format)
Error thrown: Exception has occurred: FileNotFoundError
[Errno 2] No such file or directory: './scraped_images/3849747391_4a7dc3f19e_b.jpg'
I've tried adding a file path variable (using pathlib) and concatenating that with with the other necessary variables:
from bs4 import BeautifulSoup
import requests
from PIL import Image
from io import BytesIO
from pathlib import Path
image_folder = Path("./scraped_images/")
search = input("Search for:")
params = {"q": search}
r = requests.get("http://www.bing.com/images/search", params=params)
soup = BeautifulSoup(r.text, "html.parser")
links = soup.findAll("a", {"class": "thumb"})
for item in links:
img_obj = requests.get(item.attrs["href"])
print("Getting", item.attrs["href"])
title = item.attrs["href"].split("/")[-1]
img = Image.open(BytesIO(img_obj.content))
img.save(image_folder + title, img.format)
Error thrown: Exception has occurred: TypeError
unsupported operand type(s) for +: 'WindowsPath' and 'str'
I've checked the documentation for PIL, BeautifulSoup, etc. to see if any updates may have been screwing me up, i've checked the elements on bing to see if the classes are correct, and even tried searching by different class and nothing worked. I'm at a loss. Any thoughts or guidance is appreciated. Thanks!
I have changed your code a bit:
from bs4 import BeautifulSoup
import requests
from pathlib import Path
import os
image_folder = Path("./scraped_images/")
if not os.path.isdir(image_folder):
print('Making %s'%(image_folder))
os.mkdir(image_folder)
search = input("Search for:")
params = {"q": search}
r = requests.get("http://www.bing.com/images/search", params=params)
soup = BeautifulSoup(r.text, "html.parser")
links = soup.findAll("a", {"class": "thumb"})
for item in links:
img_obj = requests.get(item.attrs["href"])
print("Getting", item.attrs["href"])
title = item.attrs["href"].split("/")[-1]
if img_obj.ok:
with open('%s/%s'%(image_folder, title), 'wb') as file:
file.write(img_obj.content)
You can use PIL but in this case you do not need it.
I also improved the code with PIL to work better:
from bs4 import BeautifulSoup
import requests
from PIL import Image
from io import BytesIO
from pathlib import Path
s = requests.Session()
image_folder = Path("./scraped_images/")
search = input("Search for:")
params = {"q": search}
r = s.get("http://www.bing.com/images/search", params=params)
soup = BeautifulSoup(r.text, "html.parser")
links = soup.findAll("a", {"class": "thumb"})
for item in links:
try:
img_obj = s.get(item.attrs["href"], headers={'User-Agent': 'User-Agent: Mozilla/5.0'})
if img_obj.ok:
print("Getting", item.attrs["href"])
title = item.attrs["href"].split("/")[-1]
if '?' in title:
title = title.split('?')[0]
img = Image.open(BytesIO(img_obj.content))
img.save(str(image_folder) + '/' + title, img.format)
else:
continue
except OSError:
print('\nError downloading %s try to visit'
'\n%s\n'
'manually and try to get the image manually.\n' %(title, item.attrs["href"]))
I use a requests session and added try and except if PIL can't make the image. I also only make try to make a image if the request get a 200 response from the site.

Get the name of Instagram profile and the date of post with Python

I'm in the process of learning python3 and I try to solve a simple task. I want to get the name of account and the date of post from instagram link.
import requests
from bs4 import BeautifulSoup
html = requests.get('https://www.instagram.com/p/BuPSnoTlvTR')
soup = BeautifulSoup(html.text, 'lxml')
item = soup.select_one("meta[property='og:description']")
name = item.find_previous_sibling().get("content").split("•")[0]
print(name)
This code works sometimes with links like this https://www.instagram.com/kingtop
But I need it to work also with post of image like this https://www.instagram.com/p/BuxB00KFI-x/
That's all what I could make, but this is not working. And I can't get the date also.
Do you have any ideas? I appreciate any help.
I found a way to get the name of account. Now I'm trying to find a way to get an upload date
import requests
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import time
from multiprocessing import Pool
from requests.exceptions import HTTPError
start = time.time()
file = open('users.txt', 'r', encoding="ISO-8859-1")
urls = file.readlines()
for url in urls:
url = url.strip ('\n')
try:
req = requests.get(url)
req.raise_for_status()
except HTTPError as http_err:
output = open('output2.txt', 'a')
output.write(f'не найдена\n')
except Exception as err:
output = open('output2.txt', 'a')
output.write(f'не найдены\n')
else:
output = open('output2.txt', 'a')
soup = BeautifulSoup(req.text, "lxml")
the_url = soup.select("[rel='canonical']")[0]['href']
the_url2=the_url.replace('https://www.instagram.com/','')
head, sep, tail = the_url2.partition('/')
output.write (head+'\n')

error while using selenium and writing to file

import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
import pandas as pd
from pytube import YouTube
browser = webdriver.Chrome("C:/Users/Downloads/chromedriver_win32/chromedriver.exe")
browser.get("https://www.youtube.com/channel/UCaKt8dvEIPnEHWSbLYhzrxg/videos")
time.sleep(1)
elem = browser.find_element_by_tag_name("body")
no_of_pagedowns = 100
while no_of_pagedowns:
elem.send_keys(Keys.PAGE_DOWN)
time.sleep(0.2)
no_of_pagedowns-=1
html = browser.page_source
soup = BeautifulSoup(html, "lxml")
tags = soup.find_all('a')
fname = "C:/Stock_in_CFD/Output.txt"
text_file = open(fname, "w+", encoding="utf-8")
for tag in tags:
t = tag.get('href')
text_file.write(t)
When I am running the above code. I am getting error
TypeError: write() argument must be str, not None
When I am not using selenium I am able to do it.
I am using selenium since I want scroll down entire page before parsing before using BeautifulSoup

Resources